In [21]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
path = '../../../../titanic.csv'

df = pd.read_csv(path)

df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
def prep_titanic(df):
    df.drop(['Unnamed: 0', 'passenger_id', 'embarked', 'pclass', 'deck'], axis=1, inplace=True)
    
    df.embark_town.fillna('Southampton', inplace=True)
    
    dummy1 = pd.get_dummies(df['sex'], dummy_na=False, drop_first=True)
    
    dummy2 = pd.get_dummies(df['embark_town'], dummy_na=False, drop_first=False)
    
    dummy3 = pd.get_dummies(df['class'], dummy_na=False, drop_first=False)
    
    df.drop(['sex', 'embark_town', 'class'], axis=1, inplace=True)
    
    df = pd.concat([df, dummy1, dummy2, dummy3], axis=1)
    
    return df

In [4]:
df = prep_titanic(df)

df.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,male,Cherbourg,Queenstown,Southampton,First,Second,Third
0,0,22.0,1,0,7.25,0,1,0,0,1,0,0,1
1,1,38.0,1,0,71.2833,0,0,1,0,0,1,0,0
2,1,26.0,0,0,7.925,1,0,0,0,1,0,0,1
3,1,35.0,1,0,53.1,0,0,0,0,1,1,0,0
4,0,35.0,0,0,8.05,1,1,0,0,1,0,0,1


In [5]:
seed = 42

train, valid = train_test_split(df, test_size=0.2, random_state=seed,
                                stratify=df['survived'])

In [6]:
train.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,male,Cherbourg,Queenstown,Southampton,First,Second,Third
692,1,,0,0,56.4958,1,1,0,0,1,0,0,1
481,0,,0,0,0.0,1,1,0,0,1,0,1,0
527,0,,0,0,221.7792,1,1,0,0,1,1,0,0
855,1,18.0,0,1,9.35,0,0,0,0,1,0,0,1
801,1,31.0,1,1,26.25,0,0,0,0,1,0,1,0


In [7]:
mms_age = MinMaxScaler()

In [8]:
train['age'] = mms_age.fit_transform(train[['age']])

train.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,male,Cherbourg,Queenstown,Southampton,First,Second,Third
692,1,,0,0,56.4958,1,1,0,0,1,0,0,1
481,0,,0,0,0.0,1,1,0,0,1,0,1,0
527,0,,0,0,221.7792,1,1,0,0,1,1,0,0
855,1,0.22091,0,1,9.35,0,0,0,0,1,0,0,1
801,1,0.384267,1,1,26.25,0,0,0,0,1,0,1,0


In [9]:
mms_fare = MinMaxScaler()

In [10]:
train['fare'] = mms_fare.fit_transform(train[['fare']])

train.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,male,Cherbourg,Queenstown,Southampton,First,Second,Third
692,1,,0,0,0.110272,1,1,0,0,1,0,0,1
481,0,,0,0,0.0,1,1,0,0,1,0,1,0
527,0,,0,0,0.432884,1,1,0,0,1,1,0,0
855,1,0.22091,0,1,0.01825,0,0,0,0,1,0,0,1
801,1,0.384267,1,1,0.051237,0,0,0,0,1,0,1,0


In [11]:
valid['age'] = mms_age.transform(valid[['age']])
valid['fare'] = mms_fare.transform(valid[['fare']])

In [12]:
valid.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,male,Cherbourg,Queenstown,Southampton,First,Second,Third
565,0,0.296306,2,0,0.047138,0,1,0,0,1,0,0,1
160,0,0.547625,0,1,0.031425,0,1,0,0,1,0,0,1
553,1,0.271174,0,0,0.014102,1,1,1,0,0,0,0,1
860,0,0.509927,2,0,0.027538,0,1,0,0,1,0,0,1
241,1,,1,0,0.030254,0,0,0,1,0,0,0,1


In [13]:
X_train = train.drop(columns=['survived'])
y_train = train['survived']

X_valid = valid.drop(columns=['survived'])
y_valid = valid['survived']

In [14]:
D_train = xgb.DMatrix(X_train, label = y_train)
D_valid = xgb.DMatrix(X_valid, label = y_valid)

In [27]:
params = {
    'verbosity': 1,
    'max_depth': 6,
    'objective': 'binary:logistic',
    'eta': 0.15,
    'random_state': seed
    }

steps = 100

In [28]:
model = xgb.train(params, D_train, steps,
                  evals=[(D_train, 'Train'), (D_valid, 'Valid')],
                  early_stopping_rounds=2)

[0]	Train-logloss:0.61424	Valid-logloss:0.63287
[1]	Train-logloss:0.55506	Valid-logloss:0.58662
[2]	Train-logloss:0.50858	Valid-logloss:0.55286
[3]	Train-logloss:0.47107	Valid-logloss:0.52662
[4]	Train-logloss:0.44140	Valid-logloss:0.50949
[5]	Train-logloss:0.41584	Valid-logloss:0.49613
[6]	Train-logloss:0.39517	Valid-logloss:0.48643
[7]	Train-logloss:0.37859	Valid-logloss:0.48107
[8]	Train-logloss:0.36348	Valid-logloss:0.47452
[9]	Train-logloss:0.35038	Valid-logloss:0.46783
[10]	Train-logloss:0.34084	Valid-logloss:0.46277
[11]	Train-logloss:0.33023	Valid-logloss:0.45874
[12]	Train-logloss:0.32226	Valid-logloss:0.45481
[13]	Train-logloss:0.31432	Valid-logloss:0.45477
[14]	Train-logloss:0.30751	Valid-logloss:0.45115
[15]	Train-logloss:0.30174	Valid-logloss:0.45036
[16]	Train-logloss:0.29553	Valid-logloss:0.45248


In [32]:
model.attributes()

{'best_iteration': '15',
 'best_ntree_limit': '16',
 'best_score': '0.45035559833882244'}

In [35]:
model.get_fscore()

{'age': 164.0,
 'sibsp': 26.0,
 'parch': 14.0,
 'fare': 179.0,
 'alone': 1.0,
 'male': 18.0,
 'Cherbourg': 13.0,
 'Queenstown': 1.0,
 'Southampton': 12.0,
 'First': 9.0,
 'Second': 4.0,
 'Third': 22.0}

In [43]:
model.num_boosted_rounds()

18

In [17]:
y_hat = model.predict(D_valid)

In [19]:
y_hat = np.where(y_hat >= 0.5, 1, 0)

y_hat

array([0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0])

In [20]:
print(classification_report(y_valid, y_hat))

              precision    recall  f1-score   support

           0       0.80      0.88      0.84       110
           1       0.78      0.65      0.71        69

    accuracy                           0.79       179
   macro avg       0.79      0.77      0.77       179
weighted avg       0.79      0.79      0.79       179



In [22]:
classifier = XGBClassifier(n_estimators=100)

In [26]:
classifier.fit(X_train, y_train, early_stopping_rounds=2,
               eval_set=[(X_valid, y_valid)])

[0]	validation_0-logloss:0.58379
[1]	validation_0-logloss:0.52510
[2]	validation_0-logloss:0.49327
[3]	validation_0-logloss:0.47691
[4]	validation_0-logloss:0.46717
[5]	validation_0-logloss:0.46227
[6]	validation_0-logloss:0.46107
[7]	validation_0-logloss:0.45007
[8]	validation_0-logloss:0.45137
[9]	validation_0-logloss:0.45551


XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)