In [101]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [102]:
try:
    df = pd.read_csv(r'C:\Users\CodeCave\Documents\DataScience\Titanic_Prediction\train.csv')
except:
    df = pd.read_csv(r'S:\Code\Data Science\Titanic_Prediction\train.csv')

In [104]:
def preprocess_df(df, testing=True):
    
    mean_age_by_gender = df.groupby('Sex').mean()['Age'].to_dict()
    null_age = df.loc[df['Age'].isna(), :].Sex.map(mean_age_by_gender)
    df.loc[null_age.index, 'Age'] = null_age
    df.loc[:, 'InCabin'] = ~df['Cabin'].isna()
    
    scaler = StandardScaler()
    
    feats=['Age', 'Fare']
    
    for feat in feats:
        
        df[feat] = scaler.fit_transform(np.array(df['Age']).reshape(-1,1))
     
    class_dummies = pd.get_dummies(df['Pclass'])
    class_dummies_names = ['class_' + str(x) for x in class_dummies.columns.to_list()]
    df[class_dummies_names] = class_dummies
    df[['female', 'male']] = pd.get_dummies(df['Sex'])
    
    if testing:
        
        df_filtered = df.drop(df[df['Embarked'].isna()].index)
        
    drop_columns = ['PassengerId', 'Sex', 'Ticket', 'Cabin', 'Name', 'Embarked', 'Pclass']
    df_filtered = df_filtered.drop(drop_columns, axis=1)
    
    #if testing:
        
        #df = df.fillna(0)
    
    return df_filtered

In [105]:
df_filtered = preprocess_df(df)
X = df_filtered.drop('Survived', axis=1)
y = df_filtered['Survived']

In [106]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15144)

In [107]:
rf = RandomForestClassifier(max_depth=None, max_features=2, min_samples_leaf=2,
                                                         min_weight_fraction_leaf=0, n_estimators=200)
rf.fit(X_train, y_train)
rf.score(X_train, y_train)

0.8860759493670886

In [108]:
rf.score(X_test, y_test)

0.7921348314606742

# Submission

In [112]:
df_filtered

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,InCabin,class_1,class_2,class_3,female,male
0,0,-0.594732,1,0,-0.594732,False,0,0,1,0,1
1,1,0.635319,1,0,0.635319,True,1,0,0,1,0
2,1,-0.287220,0,0,-0.287220,False,0,0,1,1,0
3,1,0.404684,1,0,0.404684,True,1,0,0,1,0
4,0,0.404684,0,0,0.404684,False,0,0,1,0,1
5,0,0.076156,0,0,0.076156,False,0,0,1,0,1
6,0,1.865370,0,0,1.865370,True,1,0,0,0,1
7,0,-2.132296,3,1,-2.132296,False,0,0,1,0,1
8,1,-0.210341,0,2,-0.210341,False,0,0,1,1,0
9,1,-1.209758,1,0,-1.209758,False,0,1,0,1,0


In [134]:
try:
    df = pd.read_csv(r'C:\Users\CodeCave\Documents\DataScience\Titanic_Prediction\test.csv')
except:
    df = pd.read_csv(r'S:\Code\Data Science\Titanic_Prediction\test.csv')

In [135]:
df_filtered = preprocess_df(df, testing=False)
predictions = rf.predict(df_filtered)

In [137]:
df['Survived'] = predictions

In [142]:
try:
    df[['PassengerId', 'Survived']].to_csv(r'C:\Users\CodeCave\Documents\DataScience\Titanic_Prediction\submission.csv', 
                                           index=False)
except:
    df[['PassengerId', 'Survived']].to_csv(r'S:\Code\Data Science\Titanic_Prediction\submission.csv',
                                          index=False)
