In [1]:
import numpy as np
import pandas as pd
import os

### Load the file into a dataframe

In [3]:
FILE_NAME = 'BartRider.csv'
dat_rider = pd.read_csv(FILE_NAME)

### Inspect the top rows

In [9]:
dat_rider.tail(5)

Unnamed: 0,Age,DistToWork,DualInc,Education,Gender,Income,Language,NbrInHouseHold,NbrInHouseholdUnder18,OwnRent,Rider,YrsInArea
5488,3,11,Y,4,M,5,English,2,0,Own,Yes,5
5489,4,9,Y,3,F,6,English,3,1,Rent,Yes,2
5490,2,7,N,4,M,2,English,1,1,Rent,Yes,1
5491,3,7,Y,3,F,9,English,2,0,Own,No,5
5492,2,10,N,4,F,1,English,4,2,Parent,Yes,2


### Dummy code the categoric variables

In [11]:
dat_rider = pd.concat([dat_rider, pd.get_dummies(dat_rider['Gender'], prefix='Gender', drop_first=True)], axis=1)
dat_rider.drop(['Gender'], inplace=True, axis=1)

dat_rider = pd.concat([dat_rider, pd.get_dummies(dat_rider['OwnRent'], prefix='OwnRent', drop_first=True)], axis=1)
dat_rider.drop(['OwnRent'], inplace=True, axis=1)

dat_rider = pd.concat([dat_rider, pd.get_dummies(dat_rider['Language'], prefix='Language', drop_first=True)], axis=1)
dat_rider.drop(['Language'], inplace=True, axis=1)

dat_rider = pd.concat([dat_rider, pd.get_dummies(dat_rider['DualInc'], prefix='DualInc', drop_first=True)], axis=1)
dat_rider.drop(['DualInc'], inplace=True, axis=1)

dat_rider = pd.concat([dat_rider, pd.get_dummies(dat_rider['Rider'], prefix='Rider', drop_first=True)], axis=1)
dat_rider.drop(['Rider'], inplace=True, axis=1)

In [13]:
dat_rider.head()

Unnamed: 0,Age,DistToWork,Education,Income,NbrInHouseHold,NbrInHouseholdUnder18,YrsInArea,Gender_M,OwnRent_Parent,OwnRent_Rent,Language_Other,Language_Spanish,DualInc_Y,Rider_Yes
0,7,14,3,3,1,0,5,False,False,True,False,False,False,True
1,7,10,5,8,2,0,5,True,False,False,False,False,False,False
2,3,9,3,1,1,0,5,True,False,True,False,False,False,True
3,1,13,2,1,5,3,5,True,True,False,False,False,False,True
4,3,14,5,2,3,1,5,False,True,False,False,False,False,True


### Create new dataset without target variable

In [15]:
dat_rider_feats = dat_rider.drop(['Rider_Yes'], axis=1)

In [17]:
dat_rider_feats.head()

Unnamed: 0,Age,DistToWork,Education,Income,NbrInHouseHold,NbrInHouseholdUnder18,YrsInArea,Gender_M,OwnRent_Parent,OwnRent_Rent,Language_Other,Language_Spanish,DualInc_Y
0,7,14,3,3,1,0,5,False,False,True,False,False,False
1,7,10,5,8,2,0,5,True,False,False,False,False,False
2,3,9,3,1,1,0,5,True,False,True,False,False,False
3,1,13,2,1,5,3,5,True,True,False,False,False,False
4,3,14,5,2,3,1,5,False,True,False,False,False,False


### Fit 2 cluster model

In [41]:
# Using scikit-learn to perform K-Means clustering
from sklearn.cluster import KMeans
    
# Specify the number of clusters (2) and fit the data dat_rider_feats
kmeans = KMeans(n_clusters=4, random_state=42).fit(dat_rider_feats)

In [43]:
print(dat_rider_feats.columns)
centroids = (kmeans.cluster_centers_)
print(centroids)

Index(['Age', 'DistToWork', 'Education', 'Income', 'NbrInHouseHold',
       'NbrInHouseholdUnder18', 'YrsInArea', 'Gender_M', 'OwnRent_Parent',
       'OwnRent_Rent', 'Language_Other', 'Language_Spanish', 'DualInc_Y'],
      dtype='object')
[[ 2.28402367  9.35404339  2.99802761  1.85700197  3.52564103  1.14003945
   4.15088757  0.44871795  0.55522682  0.36390533  0.0443787   0.10157791
   0.0591716 ]
 [ 4.113879    9.45966785  4.35468565  7.08659549  2.6316726   0.52253855
   4.37663108  0.47627521  0.09905101  0.25919336  0.02313167  0.02491103
   0.35587189]
 [ 4.10182927 13.57317073  4.34634146  7.26890244  2.81402439  0.63414634
   4.41585366  0.43780488  0.1097561   0.23719512  0.02134146  0.04146341
   0.36707317]
 [ 2.73807459 13.33998265  3.26192541  2.25065048  2.88985256  0.70078057
   4.11708586  0.4848222   0.42758023  0.43712056  0.03902862  0.07892454
   0.06764961]]


In [45]:
labels = (kmeans.labels_)
print(labels[:30])

[3 1 0 3 3 3 0 2 1 1 3 3 3 0 3 0 3 1 3 1 2 1 1 2 0 3 1 2 1 3]


In [47]:
labelList = labels.tolist()
Num_CL_0 = labelList.count(0)
Num_CL_1 = labelList.count(1)
Num_CL_2 = labelList.count(2)

print (Num_CL_0)
print (Num_CL_1)
print (Num_CL_2)

1015
1686
1640


In [49]:
# Calculate silhouette_score
from sklearn.metrics import silhouette_score

print(silhouette_score(dat_rider_feats, kmeans.labels_))

0.18328390081639329
