In [0]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [0]:
# Load the training and test CSVs from the titanic/data folder.
train_csv = '/content/drive/My Drive/Colab Notebooks/titanic/data/train.csv'
test_csv = '/content/drive/My Drive/Colab Notebooks/titanic/data/test.csv'
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [3]:
# Show the column labels of the freshly loaded training frame.
df_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [0]:
# df_train.Parch.unique()  # unique values of Parch (number of parents/children aboard)

In [0]:
# df_train.SibSp.unique()  # unique values of SibSp (number of siblings/spouses aboard)

In [0]:
# df_train[ ["Survived","Parch","SibSp","Name"]] 

In [0]:
def substrings_in_string(big_string, substrings):
    """Return the first entry of ``substrings`` that occurs inside ``big_string``.

    Candidates are tried in the order given, so when one candidate is contained
    in another (e.g. 'Mr' inside 'Mrs') the longer one must be listed first.

    Parameters
    ----------
    big_string : str
        Text to search. Non-string values (e.g. a NaN coming from a missing
        cell) are treated as "no match" instead of raising ``TypeError``.
    substrings : iterable of str
        Candidate substrings, in priority order.

    Returns
    -------
    str or float
        The first matching candidate, or ``np.nan`` when nothing matches.
    """
    # A missing cell arrives as a float NaN; there is nothing to search in it.
    if not isinstance(big_string, str):
        return np.nan
    for substring in substrings:
        if substring in big_string:
            return substring
    # Diagnostic output: surface strings that matched none of the candidates.
    print(big_string)
    return np.nan

In [0]:
# A fare of exactly 0 is treated as missing: blank those entries to NaN in both frames.
df_test['Fare'] = df_test['Fare'].mask(df_test['Fare'].eq(0))
df_train['Fare'] = df_train['Fare'].mask(df_train['Fare'].eq(0))

In [9]:
# Positional indices of test rows whose Fare is currently NaN.
np.nonzero(df_test['Fare'].isna().to_numpy())

(array([152, 266, 372]),)

In [10]:
# Positional indices of training rows whose Fare is currently NaN.
np.nonzero(df_train['Fare'].isna().to_numpy())

(array([179, 263, 271, 277, 302, 413, 466, 481, 597, 633, 674, 732, 806,
        815, 822]),)

In [0]:
# Impute missing fares: each frame's NaN entries receive that frame's own mean fare
# (the mean skips NaN, so it is computed over the known fares only).
meanTrainFare = df_train['Fare'].mean()
meanTestFare = df_test['Fare'].mean()
df_train['Fare'] = df_train['Fare'].fillna(meanTrainFare)
df_test['Fare'] = df_test['Fare'].fillna(meanTestFare)

In [12]:
# Re-check after imputation: positional indices of test rows whose Fare is still NaN.
np.nonzero(df_test['Fare'].isna().to_numpy())

(array([], dtype=int64),)

In [13]:
# Re-check after imputation: positional indices of training rows whose Fare is still NaN.
np.nonzero(df_train['Fare'].isna().to_numpy())

(array([], dtype=int64),)

In [0]:
# Honorifics searched for inside passenger names. The lookup returns the first
# candidate found, so order is significant: 'Mrs' has to come before 'Mr'.
title_list = [
    'Mrs', 'Mr', 'Master', 'Miss', 'Major',
    'Rev', 'Dr', 'Ms', 'Mlle', 'Col',
    'Capt', 'Mme', 'Countess', 'Don', 'Jonkheer',
]

In [0]:
# Derive a Title column by finding the first known honorific inside each Name.
df_train['Title'] = df_train['Name'].apply(substrings_in_string, args=(title_list,))
df_test['Title'] = df_test['Name'].apply(substrings_in_string, args=(title_list,))

In [0]:
#replacing all titles with mr, mrs, miss, master
def replace_titles(x):
    """Collapse a passenger's raw title into one of Mr, Mrs, Miss or Master.

    Parameters
    ----------
    x : mapping
        A row-like object exposing ``x['Title']`` and ``x['Sex']``.

    Returns
    -------
    The normalised title. Titles not covered by any rule (including a missing
    title) are returned unchanged.
    """
    title = x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title == 'Dr':
        # 'Dr' carries no gender, so fall back on the Sex field. The check is
        # case-insensitive: an exact comparison against 'Male' never matches a
        # lowercase 'male' value and would label every doctor 'Mrs'.
        if str(x['Sex']).strip().lower() == 'male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title
    
# Normalise the Title column row by row in both frames.
for passengers in (df_train, df_test):
    passengers['Title'] = passengers.apply(replace_titles, axis='columns')

In [0]:
# Impute missing ages: each frame's NaN entries receive that frame's own mean age
# (the mean skips NaN, so it is computed over the known ages only).
meanTrainAge = df_train['Age'].mean()
meanTestAge = df_test['Age'].mean()
df_train['Age'] = df_train['Age'].fillna(meanTrainAge)
df_test['Age'] = df_test['Age'].fillna(meanTestAge)

In [0]:
# Derive a Deck column from Cabin: the first entry of cabin_list found in the cabin
# string. Missing cabins are first replaced by the literal 'Unknown', which is
# itself the last candidate in the list.
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']

df_train['Cabin'] = df_train['Cabin'].fillna('Unknown')
df_train['Deck'] = df_train['Cabin'].apply(substrings_in_string, args=(cabin_list,))

df_test['Cabin'] = df_test['Cabin'].fillna('Unknown')
df_test['Deck'] = df_test['Cabin'].apply(substrings_in_string, args=(cabin_list,))

In [0]:
# Engineered numeric features, built identically for both frames:
#   Family_Size     = SibSp + Parch
#   Fare_Per_Person = Fare / (Family_Size + 1)   (the +1 avoids dividing by zero)
#   Age*Class       = Age * Pclass
for passengers in (df_train, df_test):
    passengers['Family_Size'] = passengers['SibSp'].add(passengers['Parch'])

for passengers in (df_train, df_test):
    head_count = passengers['Family_Size'].add(1)
    passengers['Fare_Per_Person'] = passengers['Fare'].div(head_count)

for passengers in (df_train, df_test):
    passengers['Age*Class'] = passengers['Age'].mul(passengers['Pclass'])

In [0]:
#df_train['Family_Size'].unique()

In [0]:
#df_train['Fare_Per_Person'].unique()

In [0]:
#df_train['Age*Class'].unique()

In [23]:
# Show the training frame's columns after the feature-engineering cells above.
df_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title', 'Deck',
       'Family_Size', 'Fare_Per_Person', 'Age*Class'],
      dtype='object')

In [24]:
# Show the test frame's columns after the feature-engineering cells above.
df_test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title', 'Deck', 'Family_Size',
       'Fare_Per_Person', 'Age*Class'],
      dtype='object')

In [0]:
#factorizing values of _cat columns
# Integer codes are learned from the training frame and then reused for the test
# frame. Factorizing each frame on its own numbers labels by order of first
# appearance, so the same label (e.g. a given deck) could receive different codes
# in train and test, and the model would be scored on mismatched features.
for _label_col in ('Sex', 'Title', 'Deck'):
    _train_codes, _train_levels = pd.factorize(df_train[_label_col])
    df_train[_label_col + '_cat'] = _train_codes
    # get_indexer yields -1 for labels never seen in training (and for NaN),
    # which is the same sentinel factorize uses for missing values.
    df_test[_label_col + '_cat'] = _train_levels.get_indexer(df_test[_label_col])

In [26]:
# Show the training frame's columns, now including the integer-coded *_cat columns.
df_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title', 'Deck',
       'Family_Size', 'Fare_Per_Person', 'Age*Class', 'Sex_cat', 'Title_cat',
       'Deck_cat'],
      dtype='object')

In [0]:
# Columns fed to the classifiers, in the order they appear in the feature matrix.
feats = [
    'Pclass',
    'Sex_cat',
    'Family_Size',
    'Fare_Per_Person',
    'Age*Class',
    'Deck_cat',
    'Title_cat',
]

In [0]:
# Plain NumPy inputs for scikit-learn: feature matrices for both frames and the
# Survived target, which exists only in the training frame.
y_train = df_train['Survived'].to_numpy()
X_train = df_train.loc[:, feats].to_numpy()
X_test = df_test.loc[:, feats].to_numpy()

In [0]:
# Fit a depth-limited decision tree and predict survival for the test rows.
# random_state is pinned so that re-running the notebook reproduces the same tree
# and therefore the same submission file.
model = DecisionTreeClassifier(max_depth=10, random_state=42)
model.fit(X_train, y_train)
DecissionTreePrediction = model.predict(X_test)

In [0]:
# Write the decision-tree submission: one row per test passenger with the
# PassengerId and the predicted Survived label, without the index column.
df_prediciton = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': list(DecissionTreePrediction),
})
df_prediciton.to_csv('/content/drive/My Drive/Colab Notebooks/titanic/data/DecissionTreePrediction.csv', index=False)

In [0]:
# Fit a 100-tree random forest and predict survival for the test rows.
# random_state is pinned because bootstrap sampling and feature sub-sampling are
# random; without it every run yields a different forest and different predictions.
model = RandomForestClassifier(max_depth=35, n_estimators=100, random_state=42)
model.fit(X_train, y_train)
RandomForestPrediction = model.predict(X_test)

In [0]:
# Write the random-forest submission: one row per test passenger with the
# PassengerId and the predicted Survived label, without the index column.
df_prediciton2 = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': list(RandomForestPrediction),
})
df_prediciton2.to_csv('/content/drive/My Drive/Colab Notebooks/titanic/data/RandomForestPrediction.csv', index=False)