In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import Pipeline

In [3]:
df = pd.read_csv('train.csv')

In [92]:
def preprocess_df(df, testing=True, reindexer=None):
    
    df['Ticket'] = df['Ticket'].str.replace('\.|/', '')
    temp_ticket = df['Ticket'].str.extract('(\w*)?\s?(.*)')
    for index, row in temp_ticket.iterrows():
    
        if not row[1]:

            temp_ticket.iloc[index][1] = row[0]
            temp_ticket.iloc[index][0] = ''

    temp_ticket = pd.concat([temp_ticket[1], 
                             pd.get_dummies(temp_ticket[0])[temp_ticket[0].value_counts().keys()[1:6].tolist()]], 
                            axis=1)
    
    temp_ticket[1] = temp_ticket[1].str.extract('(\d{2,})')
    temp_ticket[1] = temp_ticket[1].fillna(0).astype(int)
    temp_ticket.rename(columns={1:"TicketID"}, inplace=True)
    
    df = pd.concat([df, temp_ticket], axis=1)
    
    mean_age_by_gender = df.groupby('Sex').mean()['Age'].to_dict()
    null_age = df.loc[df['Age'].isna(), :].Sex.map(mean_age_by_gender)
    df.loc[null_age.index, 'Age'] = null_age
    df.loc[:, 'InCabin'] = ~df['Cabin'].isna()
    
    df['FamilyAboard'] = df['Parch'] + df['SibSp']
    
    scaler = StandardScaler()
    
    #feats=['Age', 'Fare']
    
    #for feat in feats:
        
        #df[feat] = scaler.fit_transform(np.array(df['Age']).reshape(-1,1))
     
    class_dummies = pd.get_dummies(df['Pclass'])
    class_dummies_names = ['class_' + str(x) for x in class_dummies.columns.to_list()]
    df[class_dummies_names] = class_dummies
    df[['female', 'male']] = pd.get_dummies(df['Sex'])
    
    if testing:
        
        df_filtered = df.drop(df[df['Embarked'].isna()].index)
        
    drop_columns = ['PassengerId', 'Sex', 'Ticket', 'Cabin', 'Name', 'Embarked', 'Pclass', 'SibSp', 'Parch']
    
    df = df.fillna(0)
    
    df_filtered = df.drop(drop_columns, axis=1)
    
    if reindexer is not None:
        
        df_filtered = df_filtered.T.reindex(reindexer.columns, fill_value=0).T
        
    
    return df_filtered

In [97]:
df_filtered = preprocess_df(df)
X = df_filtered.drop('Survived', axis=1)
y = df_filtered['Survived']

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15144)

In [107]:
rf = RandomForestClassifier(max_depth=5, max_features='auto', n_estimators=000)
rf.fit(X_train, y_train)
rf.score(X_train, y_train)

0.8609550561797753

In [108]:
rf.score(X_test, y_test)

0.8659217877094972

In [109]:
np.mean(cross_val_score(rf, X, y, cv=10))

0.8249395641811372

# Submission

In [110]:
df_test = pd.read_csv('test.csv')

In [113]:
df_filtered = preprocess_df(df_test, testing=False, reindexer=df_filtered)
predictions = rf.predict(df_filtered)

ValueError: Number of features of the model must match the input. Model n_features is 15 and input n_features is 12 

In [67]:
df_test['Survived'] = predictions

In [68]:
df_test[['PassengerId', 'Survived']].to_csv('submission.csv', index=False)