In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/santander-customer-transaction-prediction/sample_submission.csv
/kaggle/input/santander-customer-transaction-prediction/train.csv
/kaggle/input/santander-customer-transaction-prediction/test.csv


In [2]:
df_train = pd.read_csv("/kaggle/input/santander-customer-transaction-prediction/train.csv")
df_test = pd.read_csv("/kaggle/input/santander-customer-transaction-prediction/test.csv")

1) Use 20k out of 170k negative examples and all 20k positive\
2) Split them on train & val sets\
3) Train model with cv and no cv

In [3]:
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
import xgboost as xgb



# Models with no data preprocessing and feature engineering

In [4]:
X = df_train.sample(200000)
X_train, X_val, y_train, y_val = train_test_split(X.iloc[:,2:], X['target'], stratify=X['target'], test_size=0.25)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5)

In [5]:
%%time
xgb_model = xgb.XGBClassifier(tree_method='hist', early_stopping_rounds=2, device='gpu')
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)

Parameters: { "device" } are not used.

CPU times: user 1min 1s, sys: 275 ms, total: 1min 2s
Wall time: 16.2 s


In [6]:
def validate_model(model, X_val, y_val):
    predictions = model.predict(X_val)    
    metrics = [accuracy_score, f1_score, roc_auc_score,
               precision_score, recall_score]
    for metric in metrics:
        print(f"{metric.__name__}: {metric(y_val, predictions.round())}")

In [7]:
validate_model(xgb_model, X_val, y_val)

accuracy_score: 0.91332
f1_score: 0.366929593923459
roc_auc_score: 0.6182139207129048
precision_score: 0.7032474804031354
recall_score: 0.2482213438735178


## Grid Search hyperparameters

In [8]:
# %%time
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# param_grid = {
#     "max_depth": [4,5,6,7],
#     "learning_rate": [0.2, 0.1, 0.05, 0.01],
#     "subsample": [0.5, 0.7, 1]
# }

# estim = xgb.XGBClassifier(tree_method='gpu_hist', device='gpu')

# grid_search = GridSearchCV(estim, param_grid, scoring='roc_auc', n_jobs=-1, cv=5)
# grid_search.fit(X_train, y_train)

## Random Search hyperparameters

In [9]:
%%time
import scipy.stats as stats

param_dist = {
    "min_child_weight": stats.uniform(0, 5),
    "colsample_bytree": stats.uniform(0.5, 0.5),
    "reg_alpha": stats.uniform(0, 1),
    "learning_rate": stats.uniform(0.01, 0.5)
}

estim = xgb.XGBClassifier(tree_method='gpu_hist')
random_search = RandomizedSearchCV(estim, param_dist, n_iter=25, cv=5, scoring="roc_auc")
random_search.fit(X_train, y_train)

CPU times: user 18min 19s, sys: 9.92 s, total: 18min 29s
Wall time: 5min 49s


## Train Model using best parameters of Grid Search and Random Search

In [10]:
best_p = random_search.best_params_

In [11]:
model = xgb.XGBClassifier(**best_p, tree_method='gpu_hist', early_stopping_rounds=2, n_estimators=1000)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)
validate_model(model, X_val, y_val)

accuracy_score: 0.91664
f1_score: 0.3681018799272286
roc_auc_score: 0.6163779197911664
precision_score: 0.7903645833333334
recall_score: 0.2399209486166008


## Try Baesian Optimization (Hyperopt)

In [12]:
from hyperopt import fmin, tpe, hp

space = {
    "min_child_weight": hp.uniform("min_child_weight",0, 5),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1),
    "reg_alpha": hp.uniform("reg_alpha", 0, 1),
    "learning_rate": hp.uniform("learning_rate", 0.01, 0.5),
    "subsample": hp.uniform("subsample",0.5, 1),
    "max_depth": hp.choice("max_depth", range(3,8))
}

def objective(params):
    xg_model = xgb.XGBClassifier(**params, tree_method='gpu_hist')
    xg_model.fit(X_train, y_train)
    y_pred = xg_model.predict(X_val)
    loss = roc_auc_score(y_val, y_pred)
    return -loss

best_params = fmin(objective, space, algo=tpe.suggest, max_evals=100)

100%|██████████| 100/100 [05:13<00:00,  3.13s/trial, best loss: -0.6590125437342016]


In [13]:
# best_params
model = xgb.XGBClassifier(**best_params, tree_method='gpu_hist', early_stopping_rounds=2, n_estimators=1000)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)
validate_model(model, X_test, y_test)

accuracy_score: 0.91848
f1_score: 0.38017031630170317
roc_auc_score: 0.6215461673910926
precision_score: 0.7871536523929471
recall_score: 0.25060144346431434


# Imbalance Data Handling techniques

In [14]:
# we need to try a couple of techniques to handle imbalanced data.
# so we create a function that raises and evaluates the model
def model_testing(parameters, train_set, val_set, n_estimators=200):
    model = xgb.XGBClassifier(**parameters, tree_method='gpu_hist', early_stopping_rounds=2, n_estimators=n_estimators)
    model.fit(train_set[0], train_set[1], eval_set=[val_set], verbose=0)
    
    validate_model(model, val_set[0], val_set[1])
    return model
    
    
def make_submission(model, data):
    ids = data['ID_code']
    preds = model.predict_proba(data.iloc[:, 1:])
    pd.DataFrame(data=zip(ids, preds[:,1]), columns=['ID_code', 'target']).to_csv("submission.csv")

    

## 1) Undersampling

In [15]:
df_train_pos = df_train[df_train['target'] == 1]
df_train_neg = df_train[df_train['target'] == 0].sample(20098)


df_train_reduced = pd.concat((df_train_pos, df_train_neg), axis=0).sample(40196)
X_train, X_val, y_train, y_val = train_test_split(df_train_reduced.iloc[:, 2:], df_train_reduced['target'], test_size=0.5)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5)

In [16]:
space = {
    "min_child_weight": hp.uniform("min_child_weight",0, 5),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1),
    "reg_alpha": hp.uniform("reg_alpha", 0, 1),
    "learning_rate": hp.uniform("learning_rate", 0.01, 0.5),
    "subsample": hp.uniform("subsample",0.5, 1),
    "max_depth": hp.choice("max_depth", range(3,8))
}

def objective(params):
    xg_model = xgb.XGBClassifier(**params, tree_method='gpu_hist')
    xg_model.fit(X_train, y_train)
    y_pred = xg_model.predict(X_val)
    loss = roc_auc_score(y_val, y_pred)
    return -loss

best_params = fmin(objective, space, algo=tpe.suggest, max_evals=100)

100%|██████████| 100/100 [01:10<00:00,  1.42trial/s, best loss: -0.776473069741317]


In [17]:
model = model_testing(best_params, (X_train, y_train), (X_val, y_val), 1000)
make_submission(model, df_test)

accuracy_score: 0.793909841775301
f1_score: 0.787916026625704
roc_auc_score: 0.7938851672253123
precision_score: 0.8107481559536354
recall_score: 0.7663346613545817


## Oversampling