# Clustering Analysis

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans

## Demographic Clustering

In this section, I will perform a cluster analysis using the demographic fields listed below:

* age
* gender
* engagement
* account.age


# Reading in the data

In [2]:
# Load the tab-separated survey file into a DataFrame.
cluster_df = pd.read_csv(
    "washington_survey_data.txt",
    sep="\t",
)

In [3]:
# Swap the space for an underscore in the two multi-word labels so they can be
# written as keyword names in the dict(...) mapping used further down.
# Note: replace() is applied frame-wide, to any cell equal to these strings.
cluster_df = (
    cluster_df
    .replace('Highly Engaged', 'Highly_Engaged')
    .replace('Not Engaged', 'Not_Engaged')
)


In [4]:
#cluster_df

# Convert the data to numeric

In [5]:
# Encode gender as an integer code. Any value not listed here becomes NaN.
cluster_df['gender'] = cluster_df['gender'].map({'other': 0, 'male': 1, 'female': 2})

In [6]:
# Encode engagement as an ordered integer code (0 = least engaged, 2 = most).
# Any label not listed here becomes NaN.
cluster_df['engagement'] = cluster_df['engagement'].map(
    {'Not_Engaged': 0, 'Engaged': 1, 'Highly_Engaged': 2}
)

In [7]:
#cluster_df

# Normalize the Data

In [8]:
# Columns fed into the demographic clustering.
required_cols = [
    'age',
    'gender',
    'engagement',
    'account.age',
]

In [9]:
# Keep only the clustering features, in the order listed in required_cols.
numeric_df = cluster_df[required_cols]

In [10]:
#numeric_df

In [11]:
# Min-max scale every column independently: subtract the column minimum, then
# divide by the column range (max - min), so each column's minimum maps to 0
# and its maximum to 1. A column with a single distinct value has range 0 and
# would come out as NaN.
numeric_df_normalized = (
    numeric_df
    .sub(numeric_df.min())
    .div(numeric_df.max() - numeric_df.min())
)

In [12]:
#numeric_df_normalized

# Building our K Means Model

In [13]:
# K-means configuration for the demographic features.
km_1 = KMeans(
    n_clusters=3,     # number of clusters to form
    init='random',    # starting centroids are picked at random
    n_init=10,        # number of restarts requested
    max_iter=300,     # iteration cap per run
    tol=1e-04,        # convergence tolerance
    random_state=0,   # fixed seed so reruns give the same clustering
)

In [14]:
# Fit the model on the scaled features and keep the per-row cluster labels.
clustering_1 = km_1.fit(numeric_df_normalized).labels_

In [15]:
# Store the cluster labels on cluster_df in a 'cluster' column so the rows can
# be filtered by cluster below.
cluster_df['cluster'] = clustering_1

# Results

In [29]:
# Print the rows that belong to each demographic cluster, one cluster at a time.
# The labels are sorted so the sections always appear in ascending label order;
# iterating a bare set gives no ordering guarantee.
for cluster in sorted(set(clustering_1)):
    print(f'Printing Results for Cluster {cluster}.\n\n')

    # Select the rows whose 'cluster' column equals the current label.
    print(cluster_df.query(f"cluster == {cluster}"))

    print("\n\n")

Printing Results for Cluster 0.


         id  age  gender  engagement  mem.edu      zip channel  progressivism  \
0       346   56       0           1        2  98503.0  Branch      -0.083333   
2       349   71       1           0        7  98506.0  Branch       1.000000   
3       352   66       1           1        7      NaN  Branch       0.583333   
6       367   49       1           1        4      NaN  Branch       1.791667   
7       368   43       1           0        6  99163.0  Branch       2.500000   
...     ...  ...     ...         ...      ...      ...     ...            ...   
2391  19904   55       1           0        5  99163.0  Branch       0.666667   
2401  20181   71       1           1        8  98103.0  Branch       0.791667   
2413  21907   48       1           1        3  99006.0  Branch       0.166667   
2414  21962   49       1           0        3  98596.0  Branch      -0.083333   
2419  23167   33       1           1        6      NaN  Branch       0.2083

## Values Clustering

Similarly to the previous section, perform a cluster analysis, this time on the values questions:

* fair
* harm
* in.group
* authority
* purity
* public.sector
* sustainability
* localism

After you’ve built your clusters, report the following information on each cluster:

* Predominant region
* Average age and account age
* Most common focal value
* Mean results on the questions of `pub.greater.priv`, `experience.more.important`, and `teachers.underpaid`. 

# Read in the data

In [34]:
# Load a fresh copy of the survey file for the values clustering, and rename
# 'public.sector' to 'public_sector' so the column can be reached with
# attribute access.
cluster_df_2 = (
    pd.read_csv("washington_survey_data.txt", sep="\t")
    .rename(columns={'public.sector': 'public_sector'})
)


# Convert Data to Numeric

In [36]:
# Encode the yes/no answer as 1/0. Any other value becomes NaN.
cluster_df_2['public_sector'] = cluster_df_2['public_sector'].map({'no': 0, 'yes': 1})

# Normalize our data

In [37]:
# Columns fed into the values clustering.
required_cols_2 = [
    'fair',
    'harm',
    'in.group',
    'authority',
    'purity',
    'public_sector',
    'sustainability',
    'localism',
]

In [38]:
# Keep only the values-question features, in the order listed in required_cols_2.
numeric_df_2 = cluster_df_2[required_cols_2]

In [39]:
# Min-max scale every column independently: subtract the column minimum, then
# divide by the column range (max - min), so each column's minimum maps to 0
# and its maximum to 1. A column with a single distinct value has range 0 and
# would come out as NaN.
numeric_df_2_norm = (
    numeric_df_2
    .sub(numeric_df_2.min())
    .div(numeric_df_2.max() - numeric_df_2.min())
)

# Building K Means Model

In [42]:
# K-means configuration for the values features.
km_2 = KMeans(
    n_clusters=3,     # number of clusters to form
    init='random',    # starting centroids are picked at random
    n_init=10,        # number of restarts requested
    max_iter=300,     # iteration cap per run
    tol=1e-04,        # convergence tolerance
    random_state=0,   # fixed seed so reruns give the same clustering
)

In [43]:
# Fit the model on the scaled features and keep the per-row cluster labels.
clustering_2 = km_2.fit(numeric_df_2_norm).labels_

In [44]:
# Store the cluster labels on cluster_df_2 in a 'cluster' column so the rows
# can be filtered by cluster below.
cluster_df_2['cluster'] = clustering_2

# Results

In [45]:
# Print the rows that belong to each values cluster, one cluster at a time.
# The labels are sorted so the sections always appear in ascending label order;
# iterating a bare set gives no ordering guarantee.
for cluster in sorted(set(clustering_2)):
    print(f'Printing Results for Cluster {cluster}.\n\n')

    # Select the rows whose 'cluster' column equals the current label.
    print(cluster_df_2.query(f"cluster == {cluster}"))

    print("\n\n")

Printing Results for Cluster 0.


         id  age  gender      engagement  mem.edu      zip channel  \
4       358   50    male  Highly Engaged        4  98233.0  Branch   
5       361   40  female         Engaged        3  98520.0  Branch   
8       369   61    male  Highly Engaged        3  98133.0  Branch   
19      429   46    male  Highly Engaged        5      NaN  Branch   
25      437   43  female     Not Engaged        4      NaN  Branch   
...     ...  ...     ...             ...      ...      ...     ...   
2410  21767   25  female  Highly Engaged        5      NaN  Branch   
2413  21907   48    male         Engaged        3  99006.0  Branch   
2415  22002   71    male  Highly Engaged        5  99224.0  Branch   
2416  22070   83  female         Engaged        6  99163.0  Branch   
2419  23167   33    male         Engaged        6      NaN  Branch   

      progressivism  harm  fair  ...          region  public_sector  \
4         -0.500000  2.00  4.50  ...  W WA Non Metro  

# Predominant Region Per Cluster

In [49]:
# Rows assigned to cluster 0, then the most frequent region among them.
# mode() returns every tied value, so more than one region may be printed.
cluster_0 = cluster_df_2.loc[cluster_df_2['cluster'].eq(0)]
result_0 = cluster_0.loc[:, 'region'].mode()
print(result_0)

0      Thurston
1    W WA Metro
Name: region, dtype: object


In [50]:
# Rows assigned to cluster 1, then the most frequent region among them.
# mode() returns every tied value, so more than one region may be printed.
cluster_1 = cluster_df_2.loc[cluster_df_2['cluster'].eq(1)]
result_1 = cluster_1.loc[:, 'region'].mode()
print(result_1)

0    W WA Metro
Name: region, dtype: object


In [51]:
# Rows assigned to cluster 2, then the most frequent region among them.
# mode() returns every tied value, so more than one region may be printed.
cluster_2 = cluster_df_2.loc[cluster_df_2['cluster'].eq(2)]
result_2 = cluster_2.loc[:, 'region'].mode()
print(result_2)

0    W WA Metro
Name: region, dtype: object


# Average Age Per Cluster

In [53]:
# Mean of the 'age' column over the rows in cluster 0.
result_0 = cluster_0.loc[:, 'age'].mean()
result_0

55.9779792746114

In [54]:
# Mean of the 'age' column over the rows in cluster 1.
result_1 = cluster_1.loc[:, 'age'].mean()
result_1

52.61685214626391

In [55]:
# Mean of the 'age' column over the rows in cluster 2.
result_2 = cluster_2.loc[:, 'age'].mean()
result_2

45.41470588235294

# Most Common Focal Value Per Cluster

In [57]:
# Most frequent 'main.focal.value' among the rows in cluster 0
# (mode() returns every tied value).
result_0 = cluster_0.loc[:, 'main.focal.value'].mode()
result_0

0    Health (i.e. cancer research)
Name: main.focal.value, dtype: object

In [58]:
# Most frequent 'main.focal.value' among the rows in cluster 1
# (mode() returns every tied value).
result_1 = cluster_1.loc[:, 'main.focal.value'].mode()
result_1

0    Environment
Name: main.focal.value, dtype: object

In [59]:
# Most frequent 'main.focal.value' among the rows in cluster 2
# (mode() returns every tied value).
result_2 = cluster_2.loc[:, 'main.focal.value'].mode()
result_2

0    Education
Name: main.focal.value, dtype: object