In [90]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [6]:
# Load the raw Titanic passenger table and preview the first rows.
# NOTE(review): the relative path depends on the notebook's working
# directory — confirm it before a fresh Restart & Run All.
path = '../../../../titanic.csv'

df = pd.read_csv(path)

df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [7]:
def prep_titanic(df):
    """Prepare the raw Titanic frame for modelling.

    Steps performed:

    * drop the columns ``Unnamed: 0``, ``passenger_id``, ``embarked``,
      ``pclass`` and ``deck``;
    * fill missing ``embark_town`` values with ``'Southampton'``;
    * replace ``sex``, ``embark_town`` and ``class`` with indicator columns
      (``sex`` loses its first level because of ``drop_first=True``; the
      other two keep every level).

    The input frame is never modified; a new frame is returned, so calling
    this function twice on the same raw frame gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least the columns named above.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by the indicator columns
        for ``sex``, ``embark_town`` and ``class`` (in that order).

    Raises
    ------
    KeyError
        If one of the expected columns is missing from ``df``.
    """
    dropped_cols = ['Unnamed: 0', 'passenger_id', 'embarked', 'pclass', 'deck']
    encoded_cols = ['sex', 'embark_town', 'class']

    # drop()/assign() both return new frames, so the caller's data is left
    # intact. The fill is written back by explicit assignment rather than a
    # chained ``df.col.fillna(..., inplace=True)``, which only updates a
    # temporary object when pandas Copy-on-Write is active.
    prepared = df.drop(columns=dropped_cols).assign(
        embark_town=lambda frame: frame['embark_town'].fillna('Southampton')
    )

    sex_dummies = pd.get_dummies(prepared['sex'], dummy_na=False, drop_first=True)
    town_dummies = pd.get_dummies(prepared['embark_town'], dummy_na=False, drop_first=False)
    class_dummies = pd.get_dummies(prepared['class'], dummy_na=False, drop_first=False)

    # The categorical source columns are replaced by their indicator columns.
    return pd.concat(
        [prepared.drop(columns=encoded_cols), sex_dummies, town_dummies, class_dummies],
        axis=1,
    )

In [8]:
# Rebind `df` to the prepared (dropped + one-hot encoded) frame.
# NOTE(review): this cell is not re-runnable on its own — a second run
# passes the already-prepared frame, which no longer has the columns that
# prep_titanic drops. Re-run the CSV load cell first.
df = prep_titanic(df)

df.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,male,Cherbourg,Queenstown,Southampton,First,Second,Third
0,0,22.0,1,0,7.25,0,1,0,0,1,0,0,1
1,1,38.0,1,0,71.2833,0,0,1,0,0,1,0,0
2,1,26.0,0,0,7.925,1,0,0,0,1,0,0,1
3,1,35.0,1,0,53.1,0,0,0,0,1,1,0,0
4,0,35.0,0,0,8.05,1,1,0,0,1,0,0,1


In [63]:
# Seed shared by the split below and by the XGBoost params further down.
seed = 42

# Hold out 20% of the rows for validation, stratified on the `survived`
# label so both splits keep the same class balance.
train, valid = train_test_split(df, test_size=0.2, random_state=seed,
                                stratify=df['survived'])

In [64]:
# Preview the training split before any scaling is applied.
train.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,male,Cherbourg,Queenstown,Southampton,First,Second,Third
692,1,28.0,0,0,56.4958,1,1,0,0,1,0,0,1
481,0,28.0,0,0,0.0,1,1,0,0,1,0,1,0
527,0,28.0,0,0,221.7792,1,1,0,0,1,1,0,0
855,1,18.0,0,1,9.35,0,0,0,0,1,0,0,1
801,1,31.0,1,1,26.25,0,0,0,0,1,0,1,0


In [65]:
# Dedicated scaler for `age`; kept in its own variable so the statistics
# fitted on train can be reused on the validation split.
mms_age = MinMaxScaler()

In [66]:
# Fit the age scaler on the training split only and overwrite the column
# with the scaled values.
train['age'] = mms_age.fit_transform(train[['age']])

train.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,male,Cherbourg,Queenstown,Southampton,First,Second,Third
692,1,0.346569,0,0,56.4958,1,1,0,0,1,0,0,1
481,0,0.346569,0,0,0.0,1,1,0,0,1,0,1,0
527,0,0.346569,0,0,221.7792,1,1,0,0,1,1,0,0
855,1,0.22091,0,1,9.35,0,0,0,0,1,0,0,1
801,1,0.384267,1,1,26.25,0,0,0,0,1,0,1,0


In [67]:
# Dedicated scaler for `fare`, separate from the age scaler so each column
# keeps its own fitted range.
mms_fare = MinMaxScaler()

In [68]:
# Fit the fare scaler on the training split only and overwrite the column
# with the scaled values.
train['fare'] = mms_fare.fit_transform(train[['fare']])

train.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,male,Cherbourg,Queenstown,Southampton,First,Second,Third
692,1,0.346569,0,0,0.110272,1,1,0,0,1,0,0,1
481,0,0.346569,0,0,0.0,1,1,0,0,1,0,1,0
527,0,0.346569,0,0,0.432884,1,1,0,0,1,1,0,0
855,1,0.22091,0,1,0.01825,0,0,0,0,1,0,0,1
801,1,0.384267,1,1,0.051237,0,0,0,0,1,0,1,0


In [69]:
# Apply the scalers fitted on train to the validation split (transform only,
# no refit, so validation statistics never influence the scaling).
# NOTE(review): the columns are overwritten in place, so re-running this
# cell scales the already-scaled values a second time.
valid['age'] = mms_age.transform(valid[['age']])
valid['fare'] = mms_fare.transform(valid[['fare']])

In [70]:
# Preview the validation split after scaling.
valid.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,male,Cherbourg,Queenstown,Southampton,First,Second,Third
565,0,0.296306,2,0,0.047138,0,1,0,0,1,0,0,1
160,0,0.547625,0,1,0.031425,0,1,0,0,1,0,0,1
553,1,0.271174,0,0,0.014102,1,1,1,0,0,0,0,1
860,0,0.509927,2,0,0.027538,0,1,0,0,1,0,0,1
241,1,0.346569,1,0,0.030254,0,0,0,1,0,0,0,1


In [71]:
# Separate the `survived` label from the feature columns for each split.
X_train, y_train = train.drop('survived', axis=1), train['survived']
X_valid, y_valid = valid.drop('survived', axis=1), valid['survived']

In [73]:
# Wrap each split in an XGBoost DMatrix with its labels attached.
D_train = xgb.DMatrix(data=X_train, label=y_train)
D_valid = xgb.DMatrix(data=X_valid, label=y_valid)

In [149]:
# Booster hyper-parameters handed to xgb.train.
params = dict(
    verbosity=1,
    max_depth=8,
    objective='binary:logistic',
    eta=0.15,
    random_state=seed,
)

# Maximum number of boosting rounds; early stopping in the training call
# may end the run sooner.
steps = 100

In [150]:
# Train the booster while logging the metric on both splits each round.
# 'Valid' is deliberately listed last: per the XGBoost docs the last entry
# in `evals` is the one early stopping monitors. Training halts once that
# metric has not improved for 2 consecutive rounds.
model = xgb.train(
    params=params,
    dtrain=D_train,
    num_boost_round=steps,
    evals=[(D_train, 'Train'), (D_valid, 'Valid')],
    early_stopping_rounds=2,
)

[0]	Train-logloss:0.61137	Valid-logloss:0.63404
[1]	Train-logloss:0.55015	Valid-logloss:0.59051
[2]	Train-logloss:0.50242	Valid-logloss:0.55799
[3]	Train-logloss:0.46346	Valid-logloss:0.53399
[4]	Train-logloss:0.43129	Valid-logloss:0.51704
[5]	Train-logloss:0.40393	Valid-logloss:0.50667
[6]	Train-logloss:0.38105	Valid-logloss:0.49758
[7]	Train-logloss:0.36383	Valid-logloss:0.49138
[8]	Train-logloss:0.34588	Valid-logloss:0.48462
[9]	Train-logloss:0.33326	Valid-logloss:0.48145
[10]	Train-logloss:0.32220	Valid-logloss:0.47504
[11]	Train-logloss:0.30994	Valid-logloss:0.47519
[12]	Train-logloss:0.30105	Valid-logloss:0.47461
[13]	Train-logloss:0.29100	Valid-logloss:0.47648
[14]	Train-logloss:0.28355	Valid-logloss:0.47635


In [146]:
# xgb.train returns the booster as of the final round, which with early
# stopping includes the rounds trained after the best validation score.
# Restrict prediction to the trees up to and including the best iteration so
# the result does not depend on the library's default for the native API.
y_hat = model.predict(D_valid, iteration_range=(0, model.best_iteration + 1))

In [152]:
# Feature importance by 'weight' (per the XGBoost docs: how many times each
# feature is used to split across all trees). Continuous columns such as
# age and fare naturally get more split points than 0/1 indicators.
model.get_score(importance_type='weight')

{'age': 200.0,
 'sibsp': 34.0,
 'parch': 17.0,
 'fare': 229.0,
 'alone': 2.0,
 'male': 15.0,
 'Cherbourg': 17.0,
 'Queenstown': 5.0,
 'Southampton': 13.0,
 'First': 9.0,
 'Second': 6.0,
 'Third': 19.0}

In [147]:
# Turn the predicted scores into hard 0/1 labels using a 0.5 cut-off
# (scores at or above the threshold become 1).
y_hat = (y_hat >= 0.5).astype(int)

y_hat

array([0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0])

In [148]:
# Per-class precision/recall/F1 on the validation split. print() is used
# because classification_report returns a pre-formatted multi-line string.
# NOTE(review): the same validation split also drove early stopping, so
# these numbers may be optimistic — confirm on an untouched test set.
print(classification_report(y_valid, y_hat))

              precision    recall  f1-score   support

           0       0.81      0.87      0.84       110
           1       0.77      0.67      0.71        69

    accuracy                           0.79       179
   macro avg       0.79      0.77      0.78       179
weighted avg       0.79      0.79      0.79       179

