In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from statistics import mean

In [15]:
df = pd.read_csv('../data/raw/train.csv')
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [16]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [17]:
X = df.drop('Survived', axis=1)
y = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [18]:
X_train.shape, X_test.shape

((668, 11), (223, 11))

In [19]:
X_train.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            132
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          519
Embarked         2
dtype: int64

In [20]:
class ColumnSelector(BaseEstimator, TransformerMixin):

    def __init__(self, dtype):
        self.dtype = dtype
    
    def fit(self, X, y=None):
        X = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        if self.dtype == 'numerical':
            self.cols = X.select_dtypes(exclude='O').columns.tolist()
        elif self.dtype == 'categorical':
            self.cols = X.select_dtypes(include='O').columns.tolist()
        return self

    def transform(self, X):
        return X.loc[:, self.cols]

In [21]:
class ColumnDropperTransformer():
    def __init__(self,columns):
        self.columns=columns
        
    def transform(self,X,y=None):
        return X.drop(self.columns,axis=1)
    
    def fit(self, X, y=None):
        return self

In [22]:
num_pipe = Pipeline([
    ('drop_nums', ColumnDropperTransformer(['PassengerId'])),
    ('num_selector', ColumnSelector('numerical')),
    ('num_imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler()),
])
     
cat_pipe = Pipeline([
    ('drop_cats', ColumnDropperTransformer(['Name','Ticket'])),
    ('cat_selector', ColumnSelector('categorical')),
    ('cat_imputeter', SimpleImputer(strategy='constant', fill_value='None')),
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])
     
pre_processing = FeatureUnion([
    ('num_pipe', num_pipe),
    ('cat_pipe', cat_pipe)
])

In [45]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

modelrf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

modelmlp = MLPClassifier(random_state=42,
                         solver='adam',
                         activation='relu',
                         alpha=1e-5,
                         learning_rate_init=1e-3,
                         hidden_layer_sizes=(10,2),
                         max_iter=5000)

modelxgb = XGBClassifier(learning_rate=0.02, n_estimators=750,
                         max_depth= 3, min_child_weight= 1, 
                         colsample_bytree= 0.6, gamma= 0.0,
                         objective = 'count:poisson',
                         use_label_encoder=False,
                         reg_alpha= 0.001, subsample= 0.8,
                         random_state=42)

In [46]:
estimator_mlp = Pipeline([
    ('pre_proc', pre_processing),
    ('model', modelmlp)
])

estimator_rf = Pipeline([
    ('pre_proc', pre_processing),
    ('model', modelrf)
])

estimator_xgb = Pipeline([
    ('pre_proc', pre_processing),
    ('model', modelxgb)
])

In [47]:
accuracies_cv_mlp = cross_val_score(estimator_mlp, X_train, y_train, scoring='accuracy')
mlp_accuracy = mean(accuracies_cv_mlp)

accuracies_cv_rf = cross_val_score(estimator_rf, X_train, y_train, scoring='accuracy')
rf_accuracy = mean(accuracies_cv_rf)

accuracies_cv_xgb = cross_val_score(estimator_xgb, X_train, y_train, scoring='accuracy')
xgb_accuracy = mean(accuracies_cv_xgb)

mlp_accuracy, rf_accuracy, xgb_accuracy


(0.8024015262035686, 0.8144764897317922, 0.832353271237796)