In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/santander-customer-transaction-prediction/sample_submission.csv
/kaggle/input/santander-customer-transaction-prediction/train.csv
/kaggle/input/santander-customer-transaction-prediction/test.csv


In [2]:
df_train = pd.read_csv("/kaggle/input/santander-customer-transaction-prediction/train.csv")
df_test = pd.read_csv("/kaggle/input/santander-customer-transaction-prediction/test.csv")

1) Use 20k out of 170k negative examples and all 20k positive\
2) Split them on train & val sets\
3) Train model with cv and no cv

In [3]:
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
import xgboost as xgb



# Models with no data preprocessing and feature engineering

In [4]:
X = df_train.sample(200000)
X_train, X_val, y_train, y_val = train_test_split(X.iloc[:,2:], X['target'], stratify=X['target'], test_size=0.25)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5)

In [5]:
%%time
xgb_model = xgb.XGBClassifier(tree_method='hist', early_stopping_rounds=2, device='gpu')
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)

Parameters: { "device" } are not used.

CPU times: user 49.6 s, sys: 248 ms, total: 49.8 s
Wall time: 12.7 s


In [6]:
def validate_model(model, X_val, y_val):
    predictions = model.predict(X_val)    
    metrics = [accuracy_score, f1_score, roc_auc_score,
               precision_score, recall_score]
    for metric in metrics:
        print(f"{metric.__name__}: {metric(y_val, predictions.round())}")
        
def make_submission(model, data):
    ids = data['ID_code']
    preds = model.predict_proba(data.iloc[:, 1:])
    pd.DataFrame(data=zip(ids, preds[:,1]), columns=['ID_code', 'target']).to_csv("submission.csv", index=False)

In [7]:
validate_model(xgb_model, X_val, y_val)

accuracy_score: 0.91212
f1_score: 0.335249621785174
roc_auc_score: 0.6051776056264065
precision_score: 0.6899128268991283
recall_score: 0.2214228617106315


## Grid Search hyperparameters

In [8]:
# %%time
# from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# param_grid = {
#     "max_depth": [4,5,6,7],
#     "learning_rate": [0.2, 0.1, 0.05, 0.01],
#     "subsample": [0.5, 0.7, 1]
# }

# estim = xgb.XGBClassifier(tree_method='gpu_hist', device='gpu')

# grid_search = GridSearchCV(estim, param_grid, scoring='roc_auc', n_jobs=-1, cv=5)
# grid_search.fit(X_train, y_train)

## Random Search hyperparameters

In [9]:
# %%time
# import scipy.stats as stats

# param_dist = {
#     "min_child_weight": stats.uniform(0, 5),
#     "colsample_bytree": stats.uniform(0.5, 0.5),
#     "reg_alpha": stats.uniform(0, 1),
#     "learning_rate": stats.uniform(0.01, 0.5)
# }

# estim = xgb.XGBClassifier(tree_method='gpu_hist')
# random_search = RandomizedSearchCV(estim, param_dist, n_iter=25, cv=5, scoring="roc_auc")
# random_search.fit(X_train, y_train)

## Train Model using best parameters of Grid Search and Random Search

In [10]:
# best_p = random_search.best_params_

In [11]:
# model = xgb.XGBClassifier(**best_p, tree_method='gpu_hist', early_stopping_rounds=2, n_estimators=1000)
# model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)
# validate_model(model, X_val, y_val)

## Try Baesian Optimization (Hyperopt)

In [12]:
from hyperopt import fmin, tpe, hp

space = {
    "min_child_weight": hp.uniform("min_child_weight",0, 5),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1),
    "reg_alpha": hp.uniform("reg_alpha", 0, 1),
    "learning_rate": hp.uniform("learning_rate", 0.01, 0.5),
    "subsample": hp.uniform("subsample",0.5, 1),
    "max_depth": hp.choice("max_depth", range(3,8))
}

def objective(params):
    xg_model = xgb.XGBClassifier(**params, tree_method='gpu_hist')
    xg_model.fit(X_train, y_train)
    y_pred = xg_model.predict(X_val)
    loss = roc_auc_score(y_val, y_pred)
    return -loss

best_params = fmin(objective, space, algo=tpe.suggest, max_evals=100)

100%|██████████| 100/100 [05:17<00:00,  3.18s/trial, best loss: -0.65715080171617]


In [13]:
# best_params
model = xgb.XGBClassifier(**best_params, tree_method='gpu_hist', early_stopping_rounds=2, n_estimators=1000)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)
validate_model(model, X_test, y_test)
make_submission(model, df_test)

accuracy_score: 0.91724
f1_score: 0.42383737120579223
roc_auc_score: 0.6439833215866183
precision_score: 0.7125468164794008
recall_score: 0.30162504954419345


# Imbalance Data Handling techniques

In [14]:
# we need to try a couple of techniques to handle imbalanced data.
# so we create a function that raises and evaluates the model
def model_testing(parameters, train_set, val_set, n_estimators=200):
    model = xgb.XGBClassifier(**parameters, tree_method='gpu_hist', early_stopping_rounds=2, n_estimators=n_estimators)
    model.fit(train_set[0], train_set[1], eval_set=[val_set], verbose=0)
    
    validate_model(model, val_set[0], val_set[1])
    return model  

## 1) Undersampling

In [15]:
# df_train_pos = df_train[df_train['target'] == 1]
# df_train_neg = df_train[df_train['target'] == 0].sample(20098)


# df_train_reduced = pd.concat((df_train_pos, df_train_neg), axis=0).sample(40196)
# X_train, X_val, y_train, y_val = train_test_split(df_train_reduced.iloc[:, 2:], df_train_reduced['target'], test_size=0.5)
# X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5)

In [16]:
# space = {
#     "min_child_weight": hp.uniform("min_child_weight",0, 5),
#     "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1),
#     "reg_alpha": hp.uniform("reg_alpha", 0, 1),
#     "learning_rate": hp.uniform("learning_rate", 0.01, 0.5),
#     "subsample": hp.uniform("subsample",0.5, 1),
#     "max_depth": hp.choice("max_depth", range(3,8))
# }

# def objective(params):
#     xg_model = xgb.XGBClassifier(**params, tree_method='gpu_hist')
#     xg_model.fit(X_train, y_train)
#     y_pred = xg_model.predict(X_val)
#     loss = roc_auc_score(y_val, y_pred)
#     return -loss

# best_params = fmin(objective, space, algo=tpe.suggest, max_evals=100)

In [17]:
# model = model_testing(best_params, (X_train, y_train), (X_val, y_val), 1000)
# make_submission(model, df_test)

## 2) Oversampling (ADASYN)

In [18]:
# from imblearn.over_sampling import ADASYN

# adasyn = ADASYN()
# X_ada, y_ada = adasyn.fit_resample(X.iloc[:,2:], X['target'])

# X_train, X_val, y_train, y_val = train_test_split(X_ada, y_ada, test_size=0.2)
# X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5)

In [19]:
# model = model_testing(best_params, (X_train, y_train), (X_val, y_val), 1000)
# make_submission(model, df_test)

## Oversampling (SMOTE)

In [20]:
# from imblearn.over_sampling import SMOTE

# smote = SMOTE()
# X_sm, y_sm = smote.fit_resample(X.iloc[:,2:], X['target'])

# X_train, X_val, y_train, y_val = train_test_split(X_sm, y_sm, test_size=0.2)
# X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5)

In [21]:
# model = model_testing(best_params, (X_train, y_train), (X_val, y_val), 1000)
# make_submission(model, df_test)

## 4) Oversampling (SMOTE+ENN)

In [22]:
# from imblearn.combine import SMOTEENN

# smoteenn = SMOTEENN(sampling_strategy=0.6)
# X_smenn, y_smenn = smoteenn.fit_resample(X.iloc[:,2:], X['target'])

# X_train, X_val, y_train, y_val = train_test_split(X_smenn, y_smenn, test_size=0.2)
# X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5)

In [23]:
# model = model_testing(best_params, (X_train, y_train), (X_val, y_val), 10)
# make_submission(model, df_test)

In [24]:
# validate_model(model, X_test, y_test)

## 5) Class Weight Adjusting