In [91]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, KBinsDiscretizer, FunctionTransformer
from sklearn.impute import KNNImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

### 1. Load train and test data

In [92]:
# Read the labelled training set; the first CSV column becomes the index
full = pd.read_csv('../data/train.csv', index_col=0, sep=',')
# Target vector: the 'Survived' column only
y = full['Survived']
# Feature matrix: every column except the target
X = full.drop(columns=['Survived'])

In [93]:
# Read the unlabelled test set; the first CSV column becomes the index
full_test = pd.read_csv('../data/test.csv', index_col=0, sep=',')
# Xtest is another name for the same frame object (no copy is made)
Xtest = full_test

### 2. Feature engineering / data preprocessor

In [94]:
# Numerical feature handled by this transformer
age_feature = ["Age"]
# Sequential steps: impute missing ages, then one-hot encode 15 quantile bins.
# NOTE(review): only one column is fed to KNNImputer here, so there are no
# other features to measure neighbour distance on - this presumably reduces
# to mean imputation; confirm against the scikit-learn docs.
age_transformer = make_pipeline(
    KNNImputer(),
    KBinsDiscretizer(n_bins=15, encode='onehot', strategy='quantile'),
)

In [95]:
# Categorical features that get one-hot encoded
categorical_features = ['Sex', 'Pclass']
# Dense one-hot encoder: drops the first level of each feature and ignores
# categories not seen during fit.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and later removed the
# old keyword, so try the new name first and fall back for older releases.
try:
    categorical_transformer = OneHotEncoder(
        drop='first', sparse_output=False, handle_unknown='ignore'
    )
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    categorical_transformer = OneHotEncoder(
        drop='first', sparse=False, handle_unknown='ignore'
    )
    

In [96]:
# Min-max scaler applied to the family-size count columns listed below
scale_transformer = MinMaxScaler()
scale_features = ['Parch', 'SibSp']


In [97]:
# Extracts a title class from the passenger name; applied via FunctionTransformer.
def title_extraction(df):
    '''Map the title embedded in the 'Name' column to a numeric title class.

    The title is the text between the first comma and the next comma or
    period, with surrounding whitespace removed,
    e.g. "Braund, Mr. Owen Harris" -> "Mr".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column 'Name'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A single int64 column 'Title_no' with the same index as `df`:
        1 = Mr, 2 = Mrs, 3 = Miss, 0 = any other title (e.g. Colonel,
        Baron) or a name from which no title can be parsed (missing
        value or no comma in the name).
    '''
    title_codes = {'Mr': 1, 'Mrs': 2, 'Miss': 3}
    # One regex pass instead of two expand-splits: names without a comma
    # simply yield NaN instead of raising IndexError on a missing column.
    titles = (
        df['Name']
        .str.extract(r'^[^,]*,([^,.]*)', expand=False)
        .str.strip()
    )
    # Unmapped or unparsable titles become NaN in map() and fall back to 0.
    return (
        titles.map(title_codes)
        .fillna(0)
        .astype('int64')
        .to_frame('Title_no')
    )

# Transformer for the title: derive the title class from 'Name', then
# one-hot encode it (first level dropped, unseen classes ignored).
title_feature = ['Name']
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` to `sparse_output` and
# later removed the old keyword, so try the new name first and fall back
# for older releases.
try:
    title_transformer = make_pipeline(
        FunctionTransformer(title_extraction),
        OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
    )
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    title_transformer = make_pipeline(
        FunctionTransformer(title_extraction),
        OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore'),
    )

In [98]:
# - NOT USED - feature engineering for relatives - NOT USED -
# add the column for 'relatives'
# X['Relatives'] = X['SibSp'] + X['Parch']
# Xtest['Relatives'] = Xtest['SibSp'] + Xtest['Parch']
# relative_feature = ['Relatives']
# relative_transformer = KBinsDiscretizer(n_bins=5, strategy='kmeans', encode='onehot')

In [99]:
# -- NOT USED -- Fare Transformer -- NOT USED -- (re-enabling it requires importing SimpleImputer from sklearn.impute)
# fare_feature = ['Fare']
# fare_transformer = make_pipeline(
#     SimpleImputer(strategy="mean"), 
#     KBinsDiscretizer(n_bins=5, encode='onehot', strategy='quantile')
# )

In [100]:
# Combine the per-feature transformers; each entry is (name, transformer, columns).
# Columns not listed in any entry are dropped from the output.
preprocessor = ColumnTransformer(
    [
        ('age', age_transformer, age_feature),
        ('cat', categorical_transformer, categorical_features),
        ('scale', scale_transformer, scale_features),
        ('title', title_transformer, title_feature),
    ],
    remainder='drop',
)

### 3. Pipeline with RFC Model

In [101]:
# Model pipeline (steps run sequentially): preprocessing, then the classifier.
# random_state is fixed so the forest's randomness is seeded and a
# Restart & Run All reproduces the same scores and predictions.
pipeline = make_pipeline(
    preprocessor,
    RandomForestClassifier(
        n_estimators=130,
        criterion="gini",
        max_depth=15,
        random_state=42,
    ),
)


In [102]:
# Hold out 35% of the labelled data for validation (seeded split)
Xtrain, Xval, ytrain, yval = train_test_split(
    X, y, random_state=42, test_size=0.35
)
# Fit the preprocessing steps and the model on the training part only
pipeline.fit(Xtrain, ytrain)
# Displayed result: (training accuracy, validation accuracy)
(pipeline.score(Xtrain, ytrain), pipeline.score(Xval, yval))



(0.8929188255613126, 0.8365384615384616)

### 4. Create test predictions and Kaggle upload file

In [103]:
# Predict survival for the test set with the fitted pipeline
predict_test = pipeline.predict(Xtest)
# Wrap the predictions in a one-column integer frame for the submission
df_predict = pd.DataFrame({'Survived': predict_test}, dtype='int')
df_predict.head()

Unnamed: 0,Survived
0,0
1,0
2,0
3,0
4,0


In [104]:
# Build the one-column PassengerId frame used for the submission file.
# reset_index() returns a new frame instead of mutating in place, so the
# loaded test data (also referenced as Xtest) keeps its original index and
# columns and the prediction cell can safely be re-run.
full_test = full_test.reset_index()[['PassengerId']]

In [105]:
# Pair each PassengerId with its prediction by matching row index
final_result = full_test.merge(df_predict, left_index=True, right_index=True)
# Write the Kaggle upload file (index column omitted)
final_result.to_csv('final_result.csv', index=False)