In [26]:
import numpy as np
from sklearn.cluster import MeanShift, KMeans
from sklearn import preprocessing, cross_validation
import pandas as pd
import matplotlib.pyplot as plt

In [27]:
df = pd.read_excel('titanic.xls')
#for referencing the non-numerical part
original_df = pd.DataFrame.copy(df)
df.drop(['body','name'], 1, inplace=True)
df.fillna(0,inplace=True)

In [28]:
def handle_non_numerical_data(df):#Mapping for non-numeric data
    columns = df.columns.values
    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]

        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x+=1

            df[column] = list(map(convert_to_int, df[column]))

    return df

In [29]:
#handling out the non-numeric part
df = handle_non_numerical_data(df)
df.drop(['ticket','home.dest'], 1, inplace=True)
print(df.head())

   pclass  survived  sex      age  sibsp  parch      fare  cabin  embarked  \
0       1         1    1  29.0000      0      0  211.3375    175         3   
1       1         1    0   0.9167      1      2  151.5500    143         3   
2       1         0    1   2.0000      1      2  151.5500    143         3   
3       1         0    0  30.0000      1      2  151.5500    143         3   
4       1         0    1  25.0000      1      2  151.5500    143         3   

   boat  
0     1  
1    19  
2     0  
3     0  
4     0  


In [30]:
#classification Time!!
X = np.array(df.drop(['survived'], 1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])

clf = MeanShift()
clf.fit(X)

MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=1, seeds=None)

In [31]:
labels = clf.labels_
cluster_centers = clf.cluster_centers_
original_df['cluster_group']=np.nan
for i in range(len(X)):
    original_df['cluster_group'].iloc[i] = labels[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [32]:
#check the survival rates for each of the groups we happen to find
n_clusters_ = len(np.unique(labels))
survival_rates = {}
for i in range(n_clusters_):
    temp_df = original_df[ (original_df['cluster_group']==float(i)) ]
    #print(temp_df.head())

    survival_cluster = temp_df[  (temp_df['survived'] == 1) ]

    survival_rate = len(survival_cluster) / len(temp_df)
    #print(i,survival_rate)
    survival_rates[i] = survival_rate
    
print(survival_rates)

{0: 0.3700404858299595, 1: 0.6415094339622641, 2: 1.0, 3: 0.1, 4: 1.0, 5: 0.5714285714285714}


In [33]:
#dive deeper
print(original_df[ (original_df['cluster_group']==1) ])

      pclass  survived                                               name  \
0          1         1                      Allen, Miss. Elisabeth Walton   
1          1         1                     Allison, Master. Hudson Trevor   
2          1         0                       Allison, Miss. Helen Loraine   
3          1         0               Allison, Mr. Hudson Joshua Creighton   
4          1         0    Allison, Mrs. Hudson J C (Bessie Waldo Daniels)   
10         1         0                             Astor, Col. John Jacob   
11         1         1  Astor, Mrs. John Jacob (Madeleine Talmadge Force)   
16         1         0                           Baxter, Mr. Quigg Edmond   
17         1         1    Baxter, Mrs. James (Helene DeLaudeniere Chaput)   
23         1         1                              Bidois, Miss. Rosalie   
24         1         1                                  Bird, Miss. Ellen   
28         1         1                             Bissette, Miss. Amelia   

In [34]:
#check for group zero
print(original_df[ (original_df['cluster_group']==0) ].describe())

            pclass     survived         age        sibsp        parch  \
count  1235.000000  1235.000000  984.000000  1235.000000  1235.000000   
mean      2.341700     0.370040   29.453506     0.429150     0.297166   
std       0.812086     0.483011   14.258923     0.834114     0.647862   
min       1.000000     0.000000    0.166700     0.000000     0.000000   
25%       2.000000     0.000000   21.000000     0.000000     0.000000   
50%       3.000000     0.000000   28.000000     0.000000     0.000000   
75%       3.000000     1.000000   38.000000     1.000000     0.000000   
max       3.000000     1.000000   80.000000     5.000000     4.000000   

              fare        body  cluster_group  
count  1234.000000  115.000000         1235.0  
mean     24.552984  161.452174            0.0  
std      27.131538   98.333504            0.0  
min       0.000000    1.000000            0.0  
25%       7.895800   71.000000            0.0  
50%      13.000000  165.000000            0.0  
75%   

In [36]:
#check for group one
print(original_df[ (original_df['cluster_group']==1) ].describe())

          pclass   survived        age     sibsp      parch        fare  \
count  53.000000  53.000000  43.000000  53.00000  53.000000   53.000000   
mean    1.339623   0.641509  32.753877   2.09434   1.056604  175.035457   
std     0.758120   0.484146  15.177508   2.82342   0.907556   69.963427   
min     1.000000   0.000000   0.916700   0.00000   0.000000   69.550000   
25%     1.000000   0.000000  23.000000   0.00000   0.000000  120.000000   
50%     1.000000   1.000000  31.000000   1.00000   1.000000  211.337500   
75%     1.000000   1.000000  44.000000   2.00000   2.000000  227.525000   
max     3.000000   1.000000  67.000000   8.00000   2.000000  263.000000   

             body  cluster_group  
count    4.000000           53.0  
mean   105.500000            1.0  
std     30.468563            0.0  
min     67.000000            1.0  
25%     88.750000            1.0  
50%    110.000000            1.0  
75%    126.750000            1.0  
max    135.000000            1.0  
