In [1]:
cd '/content/drive/My Drive/Colab Notebooks/titanic'

/content/drive/My Drive/Colab Notebooks/titanic


In [2]:
ls

 [0m[01;34mdata[0m/  'Titanic 2.ipynb'   Titanic.ipynb


In [0]:
import pandas as pd
import numpy as np
import random

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
df = pd.read_csv('https://raw.githubusercontent.com/dataworkshop/webinar-titanic/master/input/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


**Exploring data**

In [5]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [6]:
df.Parch.unique()

array([0, 1, 2, 5, 3, 4, 6])

In [7]:
df.SibSp.unique()

array([1, 0, 3, 4, 2, 5, 8])

In [8]:
df.Cabin.unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [9]:
df.Embarked.unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [0]:
#df.Embarked.values

**Feature engineering**

In [0]:
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if str.find(big_string, substring) != -1:
            return substring
    print(big_string)
    return np.nan

In [0]:
title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                    'Don', 'Jonkheer']

In [0]:
df['Title']=df['Name'].map(lambda x: substrings_in_string(x, title_list))

In [0]:
#replacing all titles with mr, mrs, miss, master
def replace_titles(x):
    title=x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title
    
df['Title']=df.apply(replace_titles, axis=1)

In [0]:
# setting nan in age to mean values
meanAge=np.mean(df.Age)
df.Age=df.Age.fillna(meanAge)

In [0]:
#setting silly values to nan
df['Fare'] = df['Fare'].map(lambda x: np.nan if x==0 else x)

In [0]:
# setting nan in fare to mean values
meanFare=np.mean(df.Fare)
df.Fare=df.Fare.fillna(meanFare)

In [0]:
#Turning cabin number into Deck
df.Cabin = df.Cabin.fillna('Unknown')

cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']

df['Deck']=df['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))

In [0]:
#Creating new family_size column
df['Family_Size']=df['SibSp']+df['Parch']

#Creating fare per person column
df['Fare_Per_Person']=df['Fare']/(df['Family_Size']+1)

#Creating age*class column
df['Age*Class']=df['Age']*df['Pclass']

In [0]:
#factorizing values of _cat columns
df['Sex_cat'] = df['Sex'].factorize()[0]
df['Title_cat'] = df['Title'].factorize()[0]
df['Deck_cat'] = df['Deck'].factorize()[0]

In [0]:
# Filling embarked nan with random port
port_letters = ['S', 'C', 'Q']
df['Embarked'] = df['Embarked'].fillna(random.choice(port_letters))

In [0]:
# Factorizing embarked column
df['Embarked_cat'] = df['Embarked'].factorize()[0]

**Columns and features**

In [23]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title', 'Deck',
       'Family_Size', 'Fare_Per_Person', 'Age*Class', 'Sex_cat', 'Title_cat',
       'Deck_cat', 'Embarked_cat'],
      dtype='object')

In [0]:
feats0 = ['Pclass']
feats1 = ['Pclass', 'Sex_cat',]
feats2 = ['Pclass', 'Sex_cat', 'Family_Size']
feats3 = ['Pclass', 'Sex_cat', 'Family_Size', 'Fare_Per_Person']
feats4 = ['Pclass', 'Sex_cat', 'Family_Size', 'Fare_Per_Person', 'Age*Class']
feats5 = ['Pclass', 'Sex_cat', 'Family_Size', 'Fare_Per_Person', 'Age*Class', 'Deck_cat']
feats6 = ['Pclass', 'Sex_cat', 'Family_Size', 'Fare_Per_Person', 'Age*Class', 'Deck_cat', 'Title_cat']
feats7 = ['Pclass', 'Sex_cat', 'Family_Size', 'Fare_Per_Person', 'Age*Class', 'Deck_cat', 'Title_cat', 'Embarked_cat']
feats8 = ['Pclass', 'Sex_cat', 'Family_Size', 'Fare_Per_Person', 'Age', 'Deck_cat', 'Title_cat', 'Embarked_cat',] 
feats9 = ['Pclass', 'Sex_cat', 'Family_Size', 'Fare_Per_Person', 'Age*Class', 'Deck_cat', 'Title_cat', 'Embarked_cat', 'Age']
feats10 = ['Sex_cat', 'Family_Size', 'Fare_Per_Person', 'Age*Class', 'Deck_cat', 'Title_cat', 'Embarked_cat']

feats_range = [feats0,feats1,feats2,feats3,feats4,feats5,feats6,feats7,feats8,feats9,feats10]

**Models**

In [0]:
def different_feats(model,feats_range):
  feats_names = ['feats'+str(i) for i,feat in enumerate(feats_range)]
  accuracy_scores = []
  for i,feat in enumerate(feats_range):
    #setting the data
    X = df[feat].values
    y = df['Survived'].values
    #dividing data into test and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    #training model
    model.fit(X_train, y_train)
    #predicting values
    y_pred = model.predict(X_test)
    #adding accuracy score
    accuracy_scores.append(round(accuracy_score(y_test, y_pred),6))
    #prinitng accuracy score
    print(feats_names[i], "accuracy = ", round(accuracy_score(y_test, y_pred),6))
  #printing the best prediction and feats
  print('\n\nThe best accuracy score is', max(accuracy_scores), 'which is with feats',accuracy_scores.index(max(accuracy_scores)))

DecisionTreeClassifier

In [26]:
model = DecisionTreeClassifier(max_depth=10)
different_feats(model,feats_range)

feats0 accuracy =  0.682836
feats1 accuracy =  0.772388
feats2 accuracy =  0.828358
feats3 accuracy =  0.798507
feats4 accuracy =  0.839552
feats5 accuracy =  0.768657
feats6 accuracy =  0.828358
feats7 accuracy =  0.776119
feats8 accuracy =  0.817164
feats9 accuracy =  0.798507
feats10 accuracy =  0.83209


The best accuracy score is 0.839552 which is with feats 4


RandomForestClassifier

In [27]:
model = RandomForestClassifier(max_depth=35, n_estimators=100)
different_feats(model,feats_range)

feats0 accuracy =  0.664179
feats1 accuracy =  0.764925
feats2 accuracy =  0.779851
feats3 accuracy =  0.80597
feats4 accuracy =  0.80597
feats5 accuracy =  0.817164
feats6 accuracy =  0.783582
feats7 accuracy =  0.813433
feats8 accuracy =  0.798507
feats9 accuracy =  0.820896
feats10 accuracy =  0.798507


The best accuracy score is 0.820896 which is with feats 9
