In [25]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

%matplotlib inline

In [26]:
# Read the passenger records into a DataFrame. Passenger names contain commas
# (e.g. "Braund, Mr. Owen Harris"), so '"' is given as the quote character
# that keeps such fields together.
titanic_data = pd.read_csv(
    'datasets/titanic.csv',
    quotechar='"',
)

# Preview the first rows to check which columns were parsed.
titanic_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [27]:
# Remove the identifier-like / free-text columns so they are not part of the
# data that is clustered later.
# The result is re-assigned rather than mutated with inplace=True, and
# errors='ignore' skips columns that are already gone, so re-running this
# cell no longer raises KeyError.
titanic_data = titanic_data.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin'],
    errors='ignore',
)
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [28]:
from sklearn import preprocessing

# Turn the textual Sex column into integer codes so it can be used as a
# numeric feature (the outputs show male -> 1, female -> 0).
labelEncoder = preprocessing.LabelEncoder()
sex_as_text = titanic_data['Sex'].astype(str)
titanic_data['Sex'] = labelEncoder.fit_transform(sex_as_text)

# Spot-check a random handful of rows; the selection differs on every run
# because no random_state is given.
titanic_data.sample(n=10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
842,1,1,0,30.0,0,0,31.0,C
765,1,1,0,51.0,1,0,77.9583,S
800,0,2,1,34.0,0,0,13.0,S
615,1,2,0,24.0,1,2,65.0,S
316,1,2,0,24.0,1,0,26.0,S
889,1,1,1,26.0,0,0,30.0,C
147,0,3,0,9.0,2,2,34.375,S
571,1,1,0,53.0,2,0,51.4792,S
119,0,3,0,2.0,4,2,31.275,S
46,0,3,1,,1,0,15.5,Q


In [29]:
# One-hot encode the port of embarkation: the Embarked column is replaced by
# the Embarked_C / Embarked_Q / Embarked_S indicator columns.
# NOTE(review): dummy_na is left at its default, so a row with a missing
# Embarked value would presumably get 0 in all three indicators and would not
# be caught by a later dropna() — confirm that is acceptable.
titanic_data = pd.get_dummies(data=titanic_data, columns=['Embarked'])
titanic_data.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,22.0,1,0,7.25,0,0,1
1,1,1,0,38.0,1,0,71.2833,1,0,0
2,1,3,0,26.0,0,0,7.925,0,0,1
3,1,1,0,35.0,1,0,53.1,0,0,1
4,0,3,1,35.0,0,0,8.05,0,0,1


In [30]:
# List every row that still has at least one missing value (in the rows
# displayed for this cell the gap is always in Age).
titanic_data.loc[titanic_data.isna().any(axis='columns')]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
5,0,3,1,,0,0,8.4583,0,1,0
17,1,2,1,,0,0,13.0000,0,0,1
19,1,3,0,,0,0,7.2250,1,0,0
26,0,3,1,,0,0,7.2250,1,0,0
28,1,3,0,,0,0,7.8792,0,1,0
...,...,...,...,...,...,...,...,...,...,...
859,0,3,1,,0,0,7.2292,1,0,0
863,0,3,0,,8,2,69.5500,0,0,1
868,0,3,1,,0,0,9.5000,0,0,1
878,0,3,1,,0,0,7.8958,0,0,1


In [31]:
# Discard every row that contains a missing value. axis/how are spelled out
# but match the dropna() defaults; describe() further down reports 714 rows
# remaining.
titanic_data = titanic_data.dropna(axis='index', how='any')

In [32]:
from sklearn.cluster import MeanShift

# Cluster the passengers with mean shift using a fixed bandwidth of 30 (the
# estimate_bandwidth() cell that follows reports ~30.45 for this data).
#
# 'cluster_group' is excluded when present: a later cell adds that column to
# titanic_data, and without this guard re-running the fit would feed the
# previous run's labels back in as a feature. On a fresh top-to-bottom run the
# column does not exist yet and the fitted data is unchanged.
#
# NOTE(review): every remaining column (including Survived) is clustered on,
# unscaled, so wide-range columns such as Fare likely dominate the distances —
# confirm this is intended.
analyzer = MeanShift(bandwidth=30)
analyzer.fit(titanic_data.drop(columns=['cluster_group'], errors='ignore'))

MeanShift(bandwidth=30, bin_seeding=False, cluster_all=True, max_iter=300,
          min_bin_freq=1, n_jobs=None, seeds=None)

In [33]:
from sklearn.cluster import estimate_bandwidth

# Data-driven bandwidth estimate for the same feature matrix, used as a sanity
# check on the hard-coded bandwidth of 30 (recorded result: ~30.45).
# 'cluster_group' is excluded when present so that re-running this cell after
# the labels have been attached still estimates on the original features only.
estimate_bandwidth(titanic_data.drop(columns=['cluster_group'], errors='ignore'))

30.44675914497196

In [34]:
# Cluster id assigned by the fitted MeanShift model to each row it was fitted on.
labels = analyzer.labels_
# Distinct cluster ids that were found (0 through 4 in the recorded output).
np.unique(labels)

array([0, 1, 2, 3, 4])

In [35]:
# Attach each passenger's cluster id as a new 'cluster_group' column.
#
# `labels` is a positional array (one entry per fitted row), so assigning it
# directly writes it row-by-row in order regardless of the frame's index,
# which has gaps after dropna(). This replaces the earlier per-row .iloc loop
# and keeps the ids as integers instead of the floats produced by pre-filling
# the column with NaN.
data_length = len(titanic_data)
assert len(labels) == data_length, (
    "labels and titanic_data have different lengths; re-run the fit cell"
)
titanic_data['cluster_group'] = labels

In [36]:
# Preview the first rows to confirm the cluster_group column was filled in.
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,cluster_group
0,0,3,1,22.0,1,0,7.25,0,0,1,0.0
1,1,1,0,38.0,1,0,71.2833,1,0,0,1.0
2,1,3,0,26.0,0,0,7.925,0,0,1,0.0
3,1,1,0,35.0,1,0,53.1,0,0,1,1.0
4,0,3,1,35.0,0,0,8.05,0,0,1,0.0


In [37]:
# Summary statistics for every column, including the new cluster_group ids.
titanic_data.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,cluster_group
count,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0
mean,0.406162,2.236695,0.634454,29.699118,0.512605,0.431373,34.694514,0.182073,0.039216,0.77591,0.315126
std,0.49146,0.83825,0.481921,14.526497,0.929783,0.853289,52.91893,0.386175,0.194244,0.417274,0.690647
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,20.125,0.0,0.0,8.05,0.0,0.0,1.0,0.0
50%,0.0,2.0,1.0,28.0,0.0,0.0,15.7417,0.0,0.0,1.0,0.0
75%,1.0,3.0,1.0,38.0,1.0,1.0,33.375,0.0,0.0,1.0,0.0
max,1.0,3.0,1.0,80.0,5.0,6.0,512.3292,1.0,1.0,1.0,4.0


In [38]:
# Per-cluster profile: the mean of every column within each cluster_group.
titanic_cluster_data = titanic_data.groupby(by='cluster_group').mean()
titanic_cluster_data

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
cluster_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.0,0.336918,2.52509,0.679211,28.25672,0.439068,0.370968,15.434139,0.121864,0.046595,0.831541
1.0,0.611111,1.296296,0.527778,36.148148,0.814815,0.5,65.622688,0.333333,0.018519,0.62963
2.0,0.733333,1.0,0.366667,32.430667,0.6,0.866667,131.183883,0.5,0.0,0.5
3.0,0.733333,1.0,0.266667,30.333333,1.0,1.333333,239.99194,0.533333,0.0,0.466667
4.0,1.0,1.0,0.666667,35.333333,0.0,0.333333,512.3292,1.0,0.0,0.0


In [39]:
# Add the number of passengers in each cluster next to its mean profile.
# size() already returns a Series indexed by cluster_group, so it lines up
# with titanic_cluster_data's index on assignment.
titanic_cluster_data['counts'] = titanic_data.groupby(by='cluster_group').size()
titanic_cluster_data

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,counts
cluster_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0.0,0.336918,2.52509,0.679211,28.25672,0.439068,0.370968,15.434139,0.121864,0.046595,0.831541,558
1.0,0.611111,1.296296,0.527778,36.148148,0.814815,0.5,65.622688,0.333333,0.018519,0.62963,108
2.0,0.733333,1.0,0.366667,32.430667,0.6,0.866667,131.183883,0.5,0.0,0.5,30
3.0,0.733333,1.0,0.266667,30.333333,1.0,1.333333,239.99194,0.533333,0.0,0.466667,15
4.0,1.0,1.0,0.666667,35.333333,0.0,0.333333,512.3292,1.0,0.0,0.0,3


In [40]:
# Summary statistics restricted to the passengers assigned to cluster 1.
titanic_data.loc[titanic_data['cluster_group'].eq(1)].describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,cluster_group
count,108.0,108.0,108.0,108.0,108.0,108.0,108.0,108.0,108.0,108.0,108.0
mean,0.611111,1.296296,0.527778,36.148148,0.814815,0.5,65.622688,0.333333,0.018519,0.62963,1.0
std,0.489771,0.645028,0.501555,14.919607,1.086434,0.971558,15.634315,0.473602,0.135445,0.485155,0.0
min,0.0,1.0,0.0,1.0,0.0,0.0,34.6542,0.0,0.0,0.0,1.0
25%,0.0,1.0,0.0,24.0,0.0,0.0,52.5542,0.0,0.0,0.0,1.0
50%,1.0,1.0,1.0,35.0,1.0,0.0,65.0,0.0,0.0,1.0,1.0
75%,1.0,1.0,1.0,48.0,1.0,1.0,78.9375,1.0,0.0,1.0,1.0
max,1.0,3.0,1.0,71.0,5.0,6.0,93.5,1.0,1.0,1.0,1.0


In [23]:
# Show the individual passengers assigned to cluster 1.
# NOTE(review): this cell's execution count (23) predates the cells above it,
# and the fares in its recorded output (about 135-263) fall outside the
# 34.65-93.5 fare range that describe() reports for cluster 1, so the stored
# output looks stale — re-run after the fit and labelling cells.
titanic_data.loc[titanic_data['cluster_group'].eq(1)]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,cluster_group
27,0,1,1,19.0,3,2,263.0,0,0,1,1.0
88,1,1,0,23.0,3,2,263.0,0,0,1,1.0
118,0,1,1,24.0,0,1,247.5208,1,0,0,1.0
195,1,1,0,58.0,0,0,146.5208,1,0,0,1.0
268,1,1,0,58.0,0,1,153.4625,0,0,1,1.0
269,1,1,0,35.0,0,0,135.6333,0,0,1,1.0
297,0,1,0,2.0,1,2,151.55,0,0,1,1.0
299,1,1,0,50.0,0,1,247.5208,1,0,0,1.0
305,1,1,1,0.92,1,2,151.55,0,0,1,1.0
311,1,1,0,18.0,2,2,262.375,1,0,0,1.0
