In [23]:
import pandas as pd
import numpy as np
titanic_data = pd.read_csv(r'train.csv')
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [24]:
# Drop irrelevant columns
titanic_data.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin'],'columns',inplace=True)
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0,3,male,22.0,7.25,S
1,1,1,female,38.0,71.2833,C
2,1,3,female,26.0,7.925,S
3,1,1,female,35.0,53.1,S
4,0,3,male,35.0,8.05,S


In [25]:
from sklearn import preprocessing
# Convert gender to 0 or 1
label_enc =preprocessing.LabelEncoder()
titanic_data['Sex'] = label_enc.fit_transform(titanic_data['Sex'].astype(str))
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0,3,1,22.0,7.25,S
1,1,1,0,38.0,71.2833,C
2,1,3,0,26.0,7.925,S
3,1,1,0,35.0,53.1,S
4,0,3,1,35.0,8.05,S


In [26]:
# One-hot encoding of 'Embarked' with pd.get_dummies
titanic_data = pd.get_dummies(titanic_data,columns=['Embarked'])
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,22.0,7.25,0,0,1
1,1,1,0,38.0,71.2833,1,0,0
2,1,3,0,26.0,7.925,0,0,1
3,1,1,0,35.0,53.1,0,0,1
4,0,3,1,35.0,8.05,0,0,1


In [27]:
# Find missing values in the data and drop those rows:
print('rows before drop n/a',len(titanic_data))
bool_matrix = titanic_data.isnull() # dataframe with True and False values for each cell in the titanic_data
only_null_filter = bool_matrix.any(axis=1) # is there a True value in any column in each row. returns a pandas Series with index matching index of titcanic dataframe
missing = titanic_data[only_null_filter] # show all rows that has one or more null values
titanic_data = titanic_data.dropna()
print('rows after',len(titanic_data))
pd.options.display.max_rows = None # let me see all rows in the dataframe (can be used with columns too)

rows before drop n/a 891
rows after 714


In [28]:
# what is the best bandwidth to use for our dataset?
# The smaller values of bandwith result in tall skinny kernels & larger values result in short fat kernels.
from sklearn.cluster import estimate_bandwidth
estimate_bandwidth(titanic_data)

30.388891121761876

In [29]:
# Fit data to meanshift model
from sklearn.cluster import MeanShift
analyzer = MeanShift(bandwidth=30) 
analyzer.fit(titanic_data)

MeanShift(bandwidth=30, bin_seeding=False, cluster_all=True, min_bin_freq=1,
          n_jobs=None, seeds=None)

In [30]:
labels = analyzer.labels_
print(labels)
print('\n\n',np.unique(labels))

[0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 1 1 0 0 0 0 0 0 0 0 0
 0 1 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 3 0 0 0 1 0 0
 1 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 3 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0
 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 1 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 4 0 0 1 0 0 0 0 2 2 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 2 3 0 2 2 0 1 1 3 0 0 0 0 0 0 2 2 0 0
 0 0 2 0 0 0 1 0 2 0 1 2 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1
 0 0 2 0 0 3 0 0 3 0 0 1 1 1 0 0 2 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 0 0 3 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 1 2 0 0 0 0 1 2 0 0 1 0
 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 2 1 1 0 0 0 2 0 0 0 0 2 0 0 0 0 1 1
 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 2 0 1 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0
 2 0 0 1 0 0 0 0 0 1 0 0 

In [31]:
#We will add a new column in dataset which shows the cluster the data of a particular row belongs to.

titanic_data['cluster_group'] = np.nan
data_length=len(titanic_data)
for i in range(data_length): # loop 714 rows
    titanic_data.iloc[i,titanic_data.columns.get_loc('cluster_group')] = labels[i] #set the cluster label on each row

titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked_C,Embarked_Q,Embarked_S,cluster_group
0,0,3,1,22.0,7.25,0,0,1,0.0
1,1,1,0,38.0,71.2833,1,0,0,1.0
2,1,3,0,26.0,7.925,0,0,1,0.0
3,1,1,0,35.0,53.1,0,0,1,1.0
4,0,3,1,35.0,8.05,0,0,1,0.0


In [32]:
titanic_data.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked_C,Embarked_Q,Embarked_S,cluster_group
count,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0
mean,0.406162,2.236695,0.634454,29.699118,34.694514,0.182073,0.039216,0.77591,0.315126
std,0.49146,0.83825,0.481921,14.526497,52.91893,0.386175,0.194244,0.417274,0.690647
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,20.125,8.05,0.0,0.0,1.0,0.0
50%,0.0,2.0,1.0,28.0,15.7417,0.0,0.0,1.0,0.0
75%,1.0,3.0,1.0,38.0,33.375,0.0,0.0,1.0,0.0
max,1.0,3.0,1.0,80.0,512.3292,1.0,1.0,1.0,4.0


In [33]:
#Grouping passengers by Cluster
titanic_cluster_data = titanic_data.groupby(['cluster_group']).mean()
#Count of passengers in each cluster
titanic_cluster_data['Counts'] = pd.Series(titanic_data.groupby(['cluster_group']).size())
titanic_cluster_data

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Fare,Embarked_C,Embarked_Q,Embarked_S,Counts
cluster_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.0,0.336918,2.52509,0.679211,28.25672,15.434139,0.121864,0.046595,0.831541,558
1.0,0.611111,1.296296,0.527778,36.148148,65.622688,0.333333,0.018519,0.62963,108
2.0,0.733333,1.0,0.366667,32.430667,131.183883,0.5,0.0,0.5,30
3.0,0.733333,1.0,0.266667,30.333333,239.99194,0.533333,0.0,0.466667,15
4.0,1.0,1.0,0.666667,35.333333,512.3292,1.0,0.0,0.0,3
