In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
path = '../../../../titanic.csv'

df = pd.read_csv(path)

df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
def prep_titanic(df):
    df.drop(['Unnamed: 0', 'passenger_id', 'embarked', 'pclass', 'deck'], axis=1, inplace=True)
    
    df.embark_town.fillna('Southampton', inplace=True)
    
    dummy1 = pd.get_dummies(df['sex'], dummy_na=False, drop_first=True)
    
    dummy2 = pd.get_dummies(df['embark_town'], dummy_na=False, drop_first=False)
    
    dummy3 = pd.get_dummies(df['class'], dummy_na=False, drop_first=False)
    
    df.drop(['sex', 'embark_town', 'class'], axis=1, inplace=True)
    
    df = pd.concat([df, dummy1, dummy2, dummy3], axis=1)
    
    return df

In [4]:
df = prep_titanic(df)

df.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,male,Cherbourg,Queenstown,Southampton,First,Second,Third
0,0,22.0,1,0,7.25,0,1,0,0,1,0,0,1
1,1,38.0,1,0,71.2833,0,0,1,0,0,1,0,0
2,1,26.0,0,0,7.925,1,0,0,0,1,0,0,1
3,1,35.0,1,0,53.1,0,0,0,0,1,1,0,0
4,0,35.0,0,0,8.05,1,1,0,0,1,0,0,1


In [5]:
seed = 42

train, valid = train_test_split(df, test_size=0.2, random_state=seed,
                                stratify=df['survived'])

In [6]:
train.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,male,Cherbourg,Queenstown,Southampton,First,Second,Third
692,1,,0,0,56.4958,1,1,0,0,1,0,0,1
481,0,,0,0,0.0,1,1,0,0,1,0,1,0
527,0,,0,0,221.7792,1,1,0,0,1,1,0,0
855,1,18.0,0,1,9.35,0,0,0,0,1,0,0,1
801,1,31.0,1,1,26.25,0,0,0,0,1,0,1,0


In [7]:
mms_age = MinMaxScaler()

In [8]:
train['age'] = mms_age.fit_transform(train[['age']])

train.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,male,Cherbourg,Queenstown,Southampton,First,Second,Third
692,1,,0,0,56.4958,1,1,0,0,1,0,0,1
481,0,,0,0,0.0,1,1,0,0,1,0,1,0
527,0,,0,0,221.7792,1,1,0,0,1,1,0,0
855,1,0.22091,0,1,9.35,0,0,0,0,1,0,0,1
801,1,0.384267,1,1,26.25,0,0,0,0,1,0,1,0


In [9]:
mms_fare = MinMaxScaler()

In [10]:
train['fare'] = mms_fare.fit_transform(train[['fare']])

train.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,male,Cherbourg,Queenstown,Southampton,First,Second,Third
692,1,,0,0,0.110272,1,1,0,0,1,0,0,1
481,0,,0,0,0.0,1,1,0,0,1,0,1,0
527,0,,0,0,0.432884,1,1,0,0,1,1,0,0
855,1,0.22091,0,1,0.01825,0,0,0,0,1,0,0,1
801,1,0.384267,1,1,0.051237,0,0,0,0,1,0,1,0


In [11]:
valid['age'] = mms_age.transform(valid[['age']])
valid['fare'] = mms_fare.transform(valid[['fare']])

In [12]:
valid.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,male,Cherbourg,Queenstown,Southampton,First,Second,Third
565,0,0.296306,2,0,0.047138,0,1,0,0,1,0,0,1
160,0,0.547625,0,1,0.031425,0,1,0,0,1,0,0,1
553,1,0.271174,0,0,0.014102,1,1,1,0,0,0,0,1
860,0,0.509927,2,0,0.027538,0,1,0,0,1,0,0,1
241,1,,1,0,0.030254,0,0,0,1,0,0,0,1


In [13]:
X_train = train.drop(columns=['survived'])
y_train = train['survived']

X_valid = valid.drop(columns=['survived'])
y_valid = valid['survived']

In [14]:
D_train = xgb.DMatrix(X_train, label = y_train)
D_valid = xgb.DMatrix(X_valid, label = y_valid)

In [15]:
params = {
    'verbosity': 1,
    'max_depth': 6,
    'objective': 'binary:logistic',
    'eta': 0.15,
    'random_state': seed
    }

steps = 100

In [16]:
model = xgb.train(params, D_train, steps,
                  evals=[(D_train, 'Train'), (D_valid, 'Valid')],
                  early_stopping_rounds=2)

[0]	Train-logloss:0.61424	Valid-logloss:0.63287
[1]	Train-logloss:0.55506	Valid-logloss:0.58662
[2]	Train-logloss:0.50858	Valid-logloss:0.55286
[3]	Train-logloss:0.47107	Valid-logloss:0.52662
[4]	Train-logloss:0.44140	Valid-logloss:0.50949
[5]	Train-logloss:0.41584	Valid-logloss:0.49613
[6]	Train-logloss:0.39517	Valid-logloss:0.48643
[7]	Train-logloss:0.37859	Valid-logloss:0.48107
[8]	Train-logloss:0.36348	Valid-logloss:0.47452
[9]	Train-logloss:0.35038	Valid-logloss:0.46783
[10]	Train-logloss:0.34084	Valid-logloss:0.46277
[11]	Train-logloss:0.33023	Valid-logloss:0.45874
[12]	Train-logloss:0.32226	Valid-logloss:0.45481
[13]	Train-logloss:0.31432	Valid-logloss:0.45477
[14]	Train-logloss:0.30751	Valid-logloss:0.45115
[15]	Train-logloss:0.30174	Valid-logloss:0.45036
[16]	Train-logloss:0.29553	Valid-logloss:0.45248
[17]	Train-logloss:0.28995	Valid-logloss:0.45647


In [17]:
model.attributes()

{'best_iteration': '15',
 'best_ntree_limit': '16',
 'best_score': '0.45035559833882244'}

In [18]:
model.get_fscore()

{'age': 164.0,
 'sibsp': 26.0,
 'parch': 14.0,
 'fare': 179.0,
 'alone': 1.0,
 'male': 18.0,
 'Cherbourg': 13.0,
 'Queenstown': 1.0,
 'Southampton': 12.0,
 'First': 9.0,
 'Second': 4.0,
 'Third': 22.0}

In [19]:
model.num_boosted_rounds()

18

In [20]:
y_hat = model.predict(D_valid)

In [21]:
y_hat = np.where(y_hat >= 0.5, 1, 0)

y_hat

array([0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0])

In [22]:
print(classification_report(y_valid, y_hat))

              precision    recall  f1-score   support

           0       0.80      0.88      0.84       110
           1       0.78      0.65      0.71        69

    accuracy                           0.79       179
   macro avg       0.79      0.77      0.77       179
weighted avg       0.79      0.79      0.79       179



In [23]:
classifier = XGBClassifier(n_estimators=100)

In [24]:
classifier.fit(X_train, y_train, early_stopping_rounds=2,
               eval_set=[(X_valid, y_valid)])

[0]	validation_0-logloss:0.58379
[1]	validation_0-logloss:0.52510
[2]	validation_0-logloss:0.49327
[3]	validation_0-logloss:0.47691
[4]	validation_0-logloss:0.46717
[5]	validation_0-logloss:0.46227
[6]	validation_0-logloss:0.46107
[7]	validation_0-logloss:0.45007
[8]	validation_0-logloss:0.45137




XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

# Light GBM on train dataset

In [25]:
path2 = '../../data/prepared/'

df = pd.read_csv(path2 + 'train_data.csv')

df.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0.001733,0.008724,1.006838,0.009228,0.124035,0.008771,0.004709,...,,,,0.002427,0.003706,0.003818,,0.000569,0.00061,0.002674
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0.005775,0.004923,1.000653,0.006151,0.12675,0.000798,0.002714,...,,,,0.003954,0.003167,0.005032,,0.009576,0.005492,0.009217
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.95418,0.091505,0.021655,1.009672,0.006815,0.123977,0.007598,0.009423,...,,,,0.003269,0.007329,0.000427,,0.003429,0.006986,0.002603
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960384,0.002455,0.013683,1.0027,0.001373,0.117169,0.000685,0.005531,...,,,,0.006117,0.004516,0.0032,,0.008419,0.006527,0.0096
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947248,0.002483,0.015193,1.000727,0.007605,0.117325,0.004653,0.009312,...,,,,0.003671,0.004946,0.008889,,0.00167,0.008126,0.009827


In [26]:
df_labels = pd.read_csv(path2 + 'train_labels.csv')

df_labels.head()

Unnamed: 0,customer_ID,target
0,e039c846adcb1bfd543d89b8beba6a0f1a2ecdf835cd22...,0
1,8a96ff0aacb31ab4aab4cd15f13e024c04f986b8926789...,0
2,cfe6ef5945ef9c04937bb1826ec350ae7f2d173867f7fe...,0
3,9b677e823a94355cf65d2b97930dd59fce82cbf69e82b8...,1
4,d48853a03d0d463a4b058d129453daaf530160ccc4f0c2...,0


In [27]:
column_list = list(df.columns)

In [28]:
categorical_columns = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
                       'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

In [29]:
numerical_columns = [col for col in column_list if col not in categorical_columns]

In [30]:
numerical_columns.remove('customer_ID')

In [31]:
numerical_columns.remove('S_2')

In [32]:
numerical_columns[:5]

['P_2', 'D_39', 'B_1', 'B_2', 'R_1']

In [33]:
df.fillna(-2, inplace=True)

In [34]:
df.drop(columns=['S_2'], inplace=True)

In [35]:
df['customer_ID'].unique()

array(['0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a',
       '0005ef1480ffe4f0e5908eae07db8e05f2e705bca6a57db3b97d1da2eb6c7cab',
       '0005f2ab0cc352cd64eac9e60b981ed8c3289d20fa5fcef78973c468916fdfc9',
       ...,
       '8a1936a47736358988fdae1ce7c13d188d2adffd7a0d61d63d858ab7d1bef3d5',
       '8a196ee007f388857f00de28afe60ec8cbe8b06b45702ff7d23f6dd5fa9b785b',
       '8a19b2dd05e40a719a975d1c9e29ebab9d7f6603cfe17a418c22236d4891e270'],
      dtype=object)

In [36]:
df.head()

Unnamed: 0,customer_ID,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.938469,0.001733,0.008724,1.006838,0.009228,0.124035,0.008771,0.004709,-2.0,...,-2.0,-2.0,-2.0,0.002427,0.003706,0.003818,-2.0,0.000569,0.00061,0.002674
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.936665,0.005775,0.004923,1.000653,0.006151,0.12675,0.000798,0.002714,-2.0,...,-2.0,-2.0,-2.0,0.003954,0.003167,0.005032,-2.0,0.009576,0.005492,0.009217
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.95418,0.091505,0.021655,1.009672,0.006815,0.123977,0.007598,0.009423,-2.0,...,-2.0,-2.0,-2.0,0.003269,0.007329,0.000427,-2.0,0.003429,0.006986,0.002603
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.960384,0.002455,0.013683,1.0027,0.001373,0.117169,0.000685,0.005531,-2.0,...,-2.0,-2.0,-2.0,0.006117,0.004516,0.0032,-2.0,0.008419,0.006527,0.0096
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.947248,0.002483,0.015193,1.000727,0.007605,0.117325,0.004653,0.009312,-2.0,...,-2.0,-2.0,-2.0,0.003671,0.004946,0.008889,-2.0,0.00167,0.008126,0.009827


In [37]:
X = pd.DataFrame({'customer_ID': df['customer_ID'].unique()})

In [38]:
X.head()

Unnamed: 0,customer_ID
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...
1,0005ef1480ffe4f0e5908eae07db8e05f2e705bca6a57d...
2,0005f2ab0cc352cd64eac9e60b981ed8c3289d20fa5fce...
3,0007ee1ac8edb6be588c6c8a158d2268e3228c49cf5277...
4,0008ef32824d3067b4d8ab70e7c79fa04c808f4d99194f...


In [39]:
for col in categorical_columns:
    print(f'{col}')

B_30
B_38
D_114
D_116
D_117
D_120
D_126
D_63
D_64
D_66
D_68


In [40]:
for col in numerical_columns:
    mini = df.groupby('customer_ID')[col].min()
    maxi = df.groupby('customer_ID')[col].max()
    med = df.groupby('customer_ID')[col].median()
    std = df.groupby('customer_ID')[col].std()
    
    stats = pd.DataFrame({f'{col}_min': mini, f'{col}_max': maxi,
                       f'{col}_median': med, f'{col}_std': std})
    
    stats.reset_index(drop=True, inplace=True)
    X.reset_index(drop=True, inplace=True)
    
    X = pd.concat([X, stats], axis=1)

In [41]:
X.head()

Unnamed: 0,customer_ID,P_2_min,P_2_max,P_2_median,P_2_std,D_39_min,D_39_max,D_39_median,D_39_std,B_1_min,...,D_143_median,D_143_std,D_144_min,D_144_max,D_144_median,D_144_std,D_145_min,D_145_max,D_145_median,D_145_std
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.86858,0.960384,0.938469,0.024194,0.001082,0.091505,0.002483,0.024444,0.00193,...,0.00581,0.003564,0.00061,0.009616,0.005492,0.002598,0.000995,0.009827,0.006362,0.003294
1,0005ef1480ffe4f0e5908eae07db8e05f2e705bca6a57d...,0.068484,0.18678,0.127632,0.083648,0.031711,0.914384,0.473048,0.624144,1.318354,...,0.002345,0.000995,0.001506,0.009779,0.005643,0.00585,0.003469,0.008167,0.005818,0.003322
2,0005f2ab0cc352cd64eac9e60b981ed8c3289d20fa5fce...,0.837273,0.946487,0.894418,0.030962,0.004118,0.740627,0.413056,0.236028,0.015077,...,0.004801,0.003206,0.002932,0.00983,0.005286,0.00218,0.000746,0.008901,0.006505,0.00267
3,0007ee1ac8edb6be588c6c8a158d2268e3228c49cf5277...,0.365222,0.646557,0.417593,0.081273,0.000303,0.213325,0.006578,0.064191,0.085111,...,0.00301,0.00307,0.000296,0.007334,0.005844,0.002495,0.000517,0.008526,0.004997,0.002519
4,0008ef32824d3067b4d8ab70e7c79fa04c808f4d99194f...,0.752942,0.867515,0.843999,0.03011,0.000355,0.009881,0.007834,0.003731,0.000511,...,0.002951,0.555799,0.000128,0.008566,0.004376,0.00266,-2.0,0.008376,0.004873,0.556017


In [42]:
X.shape

(21069, 709)

In [43]:
X = X.merge(df_labels, on='customer_ID')

In [44]:
train_X = X.drop(columns=['customer_ID', 'target'])
train_y = X['target']

In [45]:
import lightgbm as lgb

In [46]:
train_data = lgb.Dataset(train_X, label=train_y)

In [47]:
num_round = 10

param = {'objective': 'regression', 'metric': 'binary_logloss'}

In [None]:
bst = lgb.train(param, train_data, num_round)