In [6]:
import numpy as np
from sklearn.cluster import MeanShift, KMeans
from sklearn import preprocessing, cross_validation
import pandas as pd
import matplotlib.pyplot as plt


'''
Pclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
survival Survival (0 = No; 1 = Yes)
name Name
sex Sex
age Age
sibsp Number of Siblings/Spouses Aboard
parch Number of Parents/Children Aboard
ticket Ticket Number
fare Passenger Fare (British pound)
cabin Cabin
embarked Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
boat Lifeboat
body Body Identification Number
home.dest Home/Destination
'''


# https://pythonprogramming.net/static/downloads/machine-learning-data/titanic.xls
df = pd.read_excel('titanic.xls')

original_df = pd.DataFrame.copy(df)
df.drop(['body','name'], 1, inplace=True)
df.fillna(0,inplace=True)

def handle_non_numerical_data(df):
    
    # handling non-numerical data: must convert.
    columns = df.columns.values

    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]

        #print(column,df[column].dtype)
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            
            column_contents = df[column].values.tolist()
            #finding just the uniques
            unique_elements = set(column_contents)
            # great, found them. 
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    # creating dict that contains new
                    # id per unique string
                    text_digit_vals[unique] = x
                    x+=1
            # now we map the new "id" vlaue
            # to replace the string. 
            df[column] = list(map(convert_to_int,df[column]))

    return df

df = handle_non_numerical_data(df)
df.drop(['ticket','home.dest'], 1, inplace=True)

X = np.array(df.drop(['survived'], 1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])

clf = MeanShift()
clf.fit(X)

MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=1, seeds=None)

In [20]:
#getting labels and marking them for each point
labels=clf.labels_
cluster_centers=clf.cluster_centers_

In [21]:
original_df['cluster_group']=np.nan
for i in range(len(X)):
    original_df['cluster_group'].iloc[i]=labels[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [22]:
#check for survival rates
n_clusters_ = len(np.unique(labels))
survival_rates = {}
for i in range(n_clusters_):
    temp_df = original_df[ (original_df['cluster_group']==float(i)) ]
    #print(temp_df.head())

    survival_cluster = temp_df[  (temp_df['survived'] == 1) ]

    survival_rate = len(survival_cluster) / len(temp_df)
    #print(i,survival_rate)
    survival_rates[i] = survival_rate
    
print(survival_rates)

{0: 0.3703403565640194, 1: 1.0, 2: 0.6428571428571429, 3: 0.4, 4: 0.1}


In [15]:
print(original_df[ (original_df['cluster_group']==1) ])

     pclass  survived                                               name  \
49        1         1                 Cardeza, Mr. Thomas Drake Martinez   
50        1         1  Cardeza, Mrs. James Warburton Martinez (Charlo...   
183       1         1                             Lesurer, Mr. Gustave J   
302       1         1                                   Ward, Miss. Anna   

        sex   age  sibsp  parch    ticket      fare        cabin embarked  \
49     male  36.0      0      1  PC 17755  512.3292  B51 B53 B55        C   
50   female  58.0      0      1  PC 17755  512.3292  B51 B53 B55        C   
183    male  35.0      0      0  PC 17755  512.3292         B101        C   
302  female  35.0      0      0  PC 17755  512.3292          NaN        C   

    boat  body                                       home.dest  cluster_group  
49     3   NaN  Austria-Hungary / Germantown, Philadelphia, PA            1.0  
50     3   NaN                    Germantown, Philadelphia, PA           

In [16]:
print(original_df[ (original_df['cluster_group']==0) ])

      pclass  survived                                              name  \
5          1         1                               Anderson, Mr. Harry   
6          1         1                 Andrews, Miss. Kornelia Theodosia   
7          1         0                            Andrews, Mr. Thomas Jr   
8          1         1     Appleton, Mrs. Edward Dale (Charlotte Lamson)   
9          1         0                           Artagaveytia, Mr. Ramon   
12         1         1                     Aubart, Mme. Leontine Pauline   
13         1         1                      Barber, Miss. Ellen "Nellie"   
14         1         1              Barkworth, Mr. Algernon Henry Wilson   
15         1         0                               Baumann, Mr. John D   
18         1         1                             Bazzani, Miss. Albina   
19         1         0                              Beattie, Mr. Thomson   
20         1         1                     Beckwith, Mr. Richard Leonard   
21         1

In [17]:
print(original_df[ (original_df['cluster_group']==2) ])

      pclass  survived                                               name  \
0          1         1                      Allen, Miss. Elisabeth Walton   
1          1         1                     Allison, Master. Hudson Trevor   
2          1         0                       Allison, Miss. Helen Loraine   
3          1         0               Allison, Mr. Hudson Joshua Creighton   
4          1         0    Allison, Mrs. Hudson J C (Bessie Waldo Daniels)   
10         1         0                             Astor, Col. John Jacob   
11         1         1  Astor, Mrs. John Jacob (Madeleine Talmadge Force)   
16         1         0                           Baxter, Mr. Quigg Edmond   
17         1         1    Baxter, Mrs. James (Helene DeLaudeniere Chaput)   
23         1         1                              Bidois, Miss. Rosalie   
24         1         1                                  Bird, Miss. Ellen   
32         1         1                            Bonnell, Miss. Caroline   

In [23]:
print(original_df[ (original_df['cluster_group']==0) ].describe())


            pclass     survived         age        sibsp        parch  \
count  1234.000000  1234.000000  983.000000  1234.000000  1234.000000   
mean      2.342788     0.370340   29.494659     0.427877     0.293355   
std       0.811515     0.483092   14.262606     0.834224     0.645532   
min       1.000000     0.000000    0.166700     0.000000     0.000000   
25%       2.000000     0.000000   21.000000     0.000000     0.000000   
50%       3.000000     0.000000   28.000000     0.000000     0.000000   
75%       3.000000     1.000000   38.000000     1.000000     0.000000   
max       3.000000     1.000000   80.000000     5.000000     4.000000   

              fare        body  cluster_group  
count  1233.000000  115.000000         1234.0  
mean     24.326564  161.452174            0.0  
std      26.479912   98.333504            0.0  
min       0.000000    1.000000            0.0  
25%       7.895800   71.000000            0.0  
50%      13.000000  165.000000            0.0  
75%   

In [24]:
print(original_df[ (original_df['cluster_group']==3) ].describe())


       pclass  survived        age     sibsp    parch        fare  body  \
count     5.0  5.000000   5.000000  5.000000  5.00000    5.000000   0.0   
mean      1.0  0.400000  52.000000  0.800000  3.20000  252.450000   NaN   
std       0.0  0.547723  15.247951  0.447214  0.83666   22.893879   NaN   
min       1.0  0.000000  27.000000  0.000000  2.00000  211.500000   NaN   
25%       1.0  0.000000  48.000000  1.000000  3.00000  262.375000   NaN   
50%       1.0  0.000000  60.000000  1.000000  3.00000  262.375000   NaN   
75%       1.0  1.000000  61.000000  1.000000  4.00000  263.000000   NaN   
max       1.0  1.000000  64.000000  1.000000  4.00000  263.000000   NaN   

       cluster_group  
count            5.0  
mean             3.0  
std              0.0  
min              3.0  
25%              3.0  
50%              3.0  
75%              3.0  
max              3.0  


In [25]:
#no. of 1st class passengers survived in cluster 0
cluster_0 = (original_df[ (original_df['cluster_group']==0) ])
cluster_0_fc = (cluster_0[ (cluster_0['pclass']==1) ])
print(cluster_0_fc.describe())

       pclass    survived         age       sibsp       parch        fare  \
count   267.0  267.000000  230.000000  267.000000  267.000000  267.000000   
mean      1.0    0.591760   39.967391    0.359551    0.202247   59.035799   
std       0.0    0.492431   13.988533    0.511092    0.486968   35.375335   
min       1.0    0.000000    4.000000    0.000000    0.000000    0.000000   
25%       1.0    0.000000   29.250000    0.000000    0.000000   29.700000   
50%       1.0    1.000000   39.000000    0.000000    0.000000   52.554200   
75%       1.0    1.000000   50.000000    1.000000    0.000000   79.200000   
max       1.0    1.000000   80.000000    2.000000    2.000000  227.525000   

             body  cluster_group  
count   32.000000          267.0  
mean   167.000000            0.0  
std     85.187081            0.0  
min     16.000000            0.0  
25%    109.750000            0.0  
50%    170.500000            0.0  
75%    236.750000            0.0  
max    307.000000         