In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [26]:
df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
all_df = pd.concat([df, test_df], sort=True).reset_index(drop=True)
IDs = test_df['PassengerId']
all_df.isnull().sum()


Age             263
Cabin          1014
Embarked          2
Fare              1
Name              0
Parch             0
PassengerId       0
Pclass            0
Sex               0
SibSp             0
Survived        418
Ticket            0
dtype: int64

In [27]:
#finding out the correlation between age and other factors
all_df_corr = all_df.corr().abs().unstack().sort_values(kind='quicksort', ascending=False).reset_index()
all_df_corr.rename(columns={'level_0': 'Feature 1', 'level_1': 'Feature 2', 0: 'Correlation Coefficient'}, inplace=True)
all_df_corr[all_df_corr['Feature 1'] == 'Age']


Unnamed: 0,Feature 1,Feature 2,Correlation Coefficient
0,Age,Age,1.0
9,Age,Pclass,0.408106
18,Age,SibSp,0.243699
21,Age,Fare,0.17874
26,Age,Parch,0.150917
30,Age,Survived,0.077221
41,Age,PassengerId,0.028814


In [28]:
#age is most linked to pclass, hence replace median age with respect to gender and pclass
age_pclass_sex = all_df.groupby(['Sex', 'Pclass']).median()['Age']
age_pclass_sex

Sex     Pclass
female  1         36.0
        2         28.0
        3         22.0
male    1         42.0
        2         29.5
        3         25.0
Name: Age, dtype: float64

In [29]:
all_df['Age'] = all_df.groupby(['Sex', 'Pclass'])['Age'].apply(lambda x: x.fillna(x.median()))

In [30]:
all_df[all_df['Embarked'].isnull()]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
61,38.0,B28,,80.0,"Icard, Miss. Amelie",0,62,1,female,0,1.0,113572
829,62.0,B28,,80.0,"Stone, Mrs. George Nelson (Martha Evelyn)",0,830,1,female,0,1.0,113572


In [31]:
#Googled their names, both embarked from Southampton
all_df['Embarked'] = all_df['Embarked'].fillna('S')

In [32]:
all_df[all_df['Fare'].isnull()]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
1043,60.5,,S,,"Storey, Mr. Thomas",0,1044,3,male,0,,3701


In [33]:
#Man with missing fare was a man travelling alone in 3rd class
thomas_fare = all_df.groupby(['Pclass','Parch','SibSp']).Fare.median()[3][0][0]
all_df['Fare'] = all_df['Fare'].fillna(thomas_fare)

In [34]:
all_df['Title'] = all_df['Name'].str.extract(' ([A-Za-z]+)\.')
all_df['Title'].value_counts()

Mr          757
Miss        260
Mrs         197
Master       61
Rev           8
Dr            8
Col           4
Ms            2
Major         2
Mlle          2
Dona          1
Lady          1
Countess      1
Mme           1
Sir           1
Capt          1
Jonkheer      1
Don           1
Name: Title, dtype: int64

In [35]:
#assigning values to titles
# Mr:0 Miss:1 Mrs:2 Others:3
#women were given priority on lifeboats
titles = {'Mr': 0, 
          'Miss': 1, 
          'Mrs': 2, 
          'Master':3, 'Dr':3, 'Rev':3,'Col':3,'Major':3,"Mlle":3,'Countess':3,'Ms':3, 'Lady':3, 'Jonkheer':3, 'Don':3, 'Dona':3, 'Mme':3,'Capt':3,'Sir':3 }

all_df['Title'] = all_df['Title'].map(titles)
all_df = all_df.drop(['Name'],axis=1)

In [36]:
#making new FamilySize column
all_df['FamilySize'] = all_df['SibSp'] + all_df['Parch']+1

all_df.loc[ all_df['FamilySize'] == 1, 'FamilySize'] = 0                            
all_df.loc[(all_df['FamilySize'] > 1) & (all_df['FamilySize'] <= 4), 'FamilySize'] = 1  
all_df.loc[(all_df['FamilySize'] > 4) & (all_df['FamilySize'] <= 6), 'FamilySize'] = 2  
all_df.loc[all_df['FamilySize']  > 6, 'FamilySize'] = 3                             


In [37]:
#Googled cabins on titanic, cabin decks go from A to G, location of cabin can affect survival
#( A Cabins ae closer to staircase than G Cabins)
#Cabins decks are assigned by passanger class
#Decks A, B, C are only for 1st class
#Decks D and E are for all classes
#Decks F and G are for 2nd and 3rd classes
#Bow of ship sank first, cabins at bow were 3rd class
#Data should be grouped into classes ABC, DE, FG, and unknowns will be U
#Titanic had other boat decks (only 1 T included ) 
#T is put under ABC as passnger is a 1st class 

all_df['Deck'] = all_df['Cabin'].apply(lambda s: s[0] if pd.notnull(s) else 'U')
all_df['Deck'] = all_df['Deck'].replace(['A', 'B', 'C','T'], 'ABC')
all_df['Deck'] = all_df['Deck'].replace(['D', 'E'], 'DE')
all_df['Deck'] = all_df['Deck'].replace(['F', 'G'], 'FG')


all_df['Deck'].value_counts()





U      1014
ABC     182
DE       87
FG       26
Name: Deck, dtype: int64

In [38]:
#drop Cabin as Deck replaces it
all_df = all_df.drop(['Cabin','Ticket'], axis=1)

In [39]:
all_df.isnull().sum()

Age              0
Embarked         0
Fare             0
Parch            0
PassengerId      0
Pclass           0
Sex              0
SibSp            0
Survived       418
Title            0
FamilySize       0
Deck             0
dtype: int64

In [40]:
label_encoder = preprocessing.LabelEncoder()

columns = ['Sex', 'Embarked','Deck','Survived']

for column in columns:
    all_df[column] = label_encoder.fit_transform(all_df[column])

In [41]:
#splitting data
train_df, test_df = all_df.loc[:890], all_df.loc[891:].drop(['Survived'], axis=1)
X = train_df.drop(['Survived'],axis=1)
y = train_df['Survived']

X_train, X_validate,y_train,y_validate = train_test_split(X, y, test_size=0.2)

In [42]:
#spending too long on trying to tune models and eventually having to resort to google for help
model = RandomForestClassifier(criterion='gini',
                               n_estimators=1750,
                               max_depth=7,
                               min_samples_split=6,
                               min_samples_leaf=6,
                               max_features='auto',
                               oob_score=True,
                               random_state=42,
                               n_jobs=-1,
                               verbose=1) 

In [43]:
classifier = model.fit(X_train,y_train)
predictions = classifier.predict(X_validate)
accuracy_score(y_validate,predictions)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 1750 out of 1750 | elapsed:    2.3s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 1226 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 1750 out of 1750 | elapsed:    0.2s finished


0.88268156424581

In [44]:
classifier = model.fit(X,y)
submission_predictions = classifier.predict(test_df)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 1750 out of 1750 | elapsed:    2.1s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 1226 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 1750 out of 1750 | elapsed:    0.2s finished


In [45]:
submission_df = pd.DataFrame({'PassengerId':IDs.values,
                              'Survived':submission_predictions})
submission_df.to_csv('submission_pain.csv',index=False)

In [46]:
submission_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
