<h1>Mean Shift Applied to Titanic Dataset</h1>

In [1]:
import numpy as np
from sklearn.cluster import MeanShift, KMeans
from sklearn import preprocessing, model_selection
import pandas as pd
import matplotlib.pyplot as plt

* __pclass__ Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
* __survived__ Survival (0 = No; 1 = Yes)
* __name__ Name
* __sex__ Sex
* __age__ Age
* __sibsp__ Number of Siblings/Spouses Aboard
* __parch__ Number of Parents/Children Aboard
* __ticket__ Ticket Number
* __fare__ Passenger Fare (British pound)
* __cabin__ Cabin
* __embarked__ Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
* __boat__ Lifeboat
* __body__ Body Identification Number
* __home.dest__ Home/Destination

In [2]:
# https://pythonprogramming.net/static/downloads/machine-learning-data/titanic.xls
df = pd.read_excel('titanic.xls')

# Keep an untouched copy of the raw sheet; `df` itself is reshaped below.
original_df = df.copy()

# `columns=` is used instead of a positional axis argument: passing the axis
# positionally to DataFrame.drop was deprecated in pandas 1.x and raises a
# TypeError from pandas 2.0 onwards.
# Missing values are then replaced with 0 in every remaining column.
df = df.drop(columns=['body', 'name']).fillna(0)

In [3]:
def handle_non_numerical_data(df):
    """Replace every non-numeric column of ``df`` with integer category ids.

    Within each non-numeric column, every distinct value receives an id in
    order of first appearance (0, 1, 2, ...), so the encoding is identical
    from one run to the next. Columns that already have a numeric dtype
    (any integer/float width, or bool) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    TypeError
        If a non-numeric column holds unhashable values (e.g. lists).
    """
    for column in df.columns.values:
        # Accept any numeric dtype rather than only int64/float64, so that
        # e.g. int32 or float32 columns are not mistaken for categories.
        if pd.api.types.is_numeric_dtype(df[column]):
            continue

        column_contents = df[column].tolist()

        # Map each distinct value to a new integer id. A dict keeps
        # insertion order, so ids follow first appearance in the column
        # instead of the arbitrary iteration order of a set.
        text_digit_vals = {}
        for value in column_contents:
            text_digit_vals.setdefault(value, len(text_digit_vals))

        # Replace the original values with their ids. The same list is used
        # for building and for looking up the mapping, so every lookup hits.
        df[column] = [text_digit_vals[value] for value in column_contents]

    return df

In [4]:
# Encode all text columns as integer ids, then discard the two columns that
# are not wanted as clustering features.
df = handle_non_numerical_data(df)
# `columns=` replaces the positional axis argument, which DataFrame.drop no
# longer accepts from pandas 2.0 onwards.
df = df.drop(columns=['ticket', 'home.dest'])

In [5]:
# Feature matrix: every column except the target, as floats.
# `columns=` replaces the positional axis argument, which DataFrame.drop no
# longer accepts from pandas 2.0 onwards.
X = np.array(df.drop(columns=['survived']).astype(float))
# Standardise the features with sklearn's preprocessing.scale.
X = preprocessing.scale(X)
# Target vector; mean shift is fitted on X only.
y = np.array(df['survived'])

In [6]:
# Fit a mean shift model, built with its default arguments, on the
# feature matrix. `fit` returns the estimator, so it can be bound directly.
clf = MeanShift().fit(X)
# Echo the fitted estimator as the cell's output.
clf

MeanShift()

In [7]:
# Pull the per-row cluster labels and the cluster centres off the fitted model.
labels, cluster_centers = clf.labels_, clf.cluster_centers_

In [8]:
# Add an all-NaN placeholder column that will hold each passenger's cluster id.
original_df['cluster_group'] = float('nan')

In [10]:
# Write all cluster labels in a single column assignment.
# The previous row-by-row `original_df['cluster_group'].iloc[i] = ...` was a
# chained assignment: it triggers SettingWithCopyWarning and, with pandas
# Copy-on-Write enabled, never reaches `original_df` at all.
# Labels are stored as floats, matching the float column they replace.
original_df['cluster_group'] = np.asarray(labels, dtype=float)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [11]:
# Fraction of survivors inside each cluster, keyed by integer cluster id.
n_clusters_ = np.unique(labels).size
survival_rates = {}

for cluster_id in range(n_clusters_):
    # `survived` flags of the passengers assigned to this cluster.
    in_cluster = original_df['cluster_group'] == float(cluster_id)
    survived_flags = original_df.loc[in_cluster, 'survived']

    # Survivors divided by cluster size.
    n_survivors = int((survived_flags == 1).sum())
    survival_rates[cluster_id] = n_survivors / len(survived_flags)

print(survival_rates)

{0: 0.358974358974359, 1: 1.0, 2: 0.1, 3: 0.5925925925925926, 4: 0.7586206896551724}


In [12]:
# List every passenger assigned to cluster 1.
print(original_df.loc[original_df['cluster_group'] == 1])

     pclass  survived                                               name  \
35        1         1                           Bowen, Miss. Grace Scott   
49        1         1                 Cardeza, Mr. Thomas Drake Martinez   
50        1         1  Cardeza, Mrs. James Warburton Martinez (Charlo...   
183       1         1                             Lesurer, Mr. Gustave J   
302       1         1                                   Ward, Miss. Anna   

        sex   age  sibsp  parch    ticket      fare        cabin embarked  \
35   female  45.0      0      0  PC 17608  262.3750          NaN        C   
49     male  36.0      0      1  PC 17755  512.3292  B51 B53 B55        C   
50   female  58.0      0      1  PC 17755  512.3292  B51 B53 B55        C   
183    male  35.0      0      0  PC 17755  512.3292         B101        C   
302  female  35.0      0      0  PC 17755  512.3292          NaN        C   

    boat  body                                       home.dest  cluster_group  


In [13]:
# Summary statistics for the passengers assigned to cluster 0.
print(original_df.loc[original_df['cluster_group'].eq(0)].describe())

            pclass     survived         age        sibsp        parch  \
count  1209.000000  1209.000000  949.000000  1209.000000  1209.000000   
mean      2.383788     0.358974   28.903319     0.486352     0.315964   
std       0.791052     0.479898   14.012125     1.061840     0.669401   
min       1.000000     0.000000    0.166700     0.000000     0.000000   
25%       2.000000     0.000000   20.000000     0.000000     0.000000   
50%       3.000000     0.000000   27.000000     0.000000     0.000000   
75%       3.000000     1.000000   36.000000     1.000000     0.000000   
max       3.000000     1.000000   80.000000     8.000000     4.000000   

              fare        body  cluster_group  
count  1208.000000  108.000000         1209.0  
mean     23.886651  159.000000            0.0  
std      27.144400   98.958974            0.0  
min       0.000000    1.000000            0.0  
25%       7.895800   69.750000            0.0  
50%      13.000000  155.500000            0.0  
75%   

In [14]:
# Summary statistics for the passengers assigned to cluster 2.
print(original_df.loc[original_df['cluster_group'].eq(2)].describe())

       pclass   survived        age      sibsp      parch       fare  \
count    10.0  10.000000   8.000000  10.000000  10.000000  10.000000   
mean      3.0   0.100000  39.875000   0.800000   6.000000  42.703750   
std       0.0   0.316228   1.552648   0.421637   1.632993  15.590194   
min       3.0   0.000000  38.000000   0.000000   5.000000  29.125000   
25%       3.0   0.000000  39.000000   1.000000   5.000000  31.303125   
50%       3.0   0.000000  39.500000   1.000000   5.000000  35.537500   
75%       3.0   0.000000  40.250000   1.000000   6.000000  46.900000   
max       3.0   1.000000  43.000000   1.000000   9.000000  69.550000   

             body  cluster_group  
count    2.000000           10.0  
mean   234.500000            2.0  
std    130.814755            0.0  
min    142.000000            2.0  
25%    188.250000            2.0  
50%    234.500000            2.0  
75%    280.750000            2.0  
max    327.000000            2.0  


In [15]:
# Narrow cluster 0 down to its first-class passengers and summarise them.
cluster_0 = original_df.loc[original_df['cluster_group'].eq(0)]
cluster_0_fc = cluster_0.loc[cluster_0['pclass'].eq(1)]
print(cluster_0_fc.describe())

       pclass    survived         age       sibsp       parch        fare  \
count   235.0  235.000000  196.000000  235.000000  235.000000  235.000000   
mean      1.0    0.582979   38.964286    0.357447    0.238298   59.673067   
std       0.0    0.494119   14.138373    0.514634    0.533715   39.683415   
min       1.0    0.000000    4.000000    0.000000    0.000000    0.000000   
25%       1.0    0.000000   28.000000    0.000000    0.000000   28.500000   
50%       1.0    1.000000   39.000000    0.000000    0.000000   52.000000   
75%       1.0    1.000000   49.000000    1.000000    0.000000   79.200000   
max       1.0    1.000000   80.000000    2.000000    2.000000  227.525000   

             body  cluster_group  
count   24.000000          235.0  
mean   161.750000            0.0  
std     84.233242            0.0  
min     16.000000            0.0  
25%    109.750000            0.0  
50%    167.500000            0.0  
75%    235.250000            0.0  
max    307.000000         