# Augmented answer generation: LightGBM models trained on oversampled data

In [1]:
from sklearn.model_selection import StratifiedKFold
from skopt.space import Real, Integer
from skopt.utils import use_named_args
import itertools
from sklearn.metrics import roc_auc_score
from skopt import gp_minimize
from fastai import *
import numpy as np
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb

In [2]:
# Read the training table, then split it into the label column and the
# remaining feature columns (the ID column is not used as a feature).
TRAIN = 'data/train.csv'
train = pd.read_csv(TRAIN)
y = train['target']
X = train.drop(columns=['ID_code', 'target'])

In [3]:
import pickle
def save_model(models, filename):
    """Serialize ``models`` with pickle into ``filename``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Returns ``None``.
    """
    with open(filename, mode='wb') as sink:
        pickle.Pickler(sink).dump(models)

In [4]:
def _take_rows(data, idx):
    """Select rows by position from a pandas object or an array-like."""
    # Plain `data[idx]` on a DataFrame looks up *columns*, so pandas objects
    # must go through `.iloc` to get positional row selection.
    return data.iloc[idx] if hasattr(data, 'iloc') else data[idx]


def get_preds(X, y, params, n_splits=5):
    """Train one LightGBM booster per stratified CV fold and return them all.

    For each fold the booster is trained on the training part for at most
    100000 rounds, with early stopping after 4000 rounds without improvement
    on the validation sets, and the ROC AUC on the held-out part is printed.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        Feature matrix, one row per sample.
    y : pandas.Series or numpy.ndarray
        Target values aligned with the rows of ``X``.
    params : dict
        Parameters forwarded unchanged to ``lgb.train``.
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    list
        The trained boosters, one per fold, in fold order.

    Notes
    -----
    Rows are selected by position, so both arrays and pandas objects (with
    any index) are accepted.

    NOTE(review): ``verbose_eval`` and ``early_stopping_rounds`` are passed as
    keyword arguments to ``lgb.train``; newer LightGBM releases are understood
    to expect callbacks instead -- confirm against the installed version.
    """
    skf = StratifiedKFold(n_splits=n_splits)
    models = []
    for train_idx, test_idx in skf.split(X, y):
        X_trn, y_trn = _take_rows(X, train_idx), _take_rows(y, train_idx)
        X_val, y_val = _take_rows(X, test_idx), _take_rows(y, test_idx)

        trn_data = lgb.Dataset(X_trn, label=y_trn)
        val_data = lgb.Dataset(X_val, label=y_val)
        model_lgb = lgb.train(
            params,
            trn_data,
            100000,
            valid_sets=[trn_data, val_data],
            verbose_eval=5000,
            early_stopping_rounds=4000,
        )
        models.append(model_lgb)

        auc = roc_auc_score(y_val, model_lgb.predict(X_val))
        print(f"This AUC: {auc}")
    return models

In [5]:
# LightGBM settings shared by every experiment in this notebook; the dict is
# handed to get_preds, which forwards it to lgb.train unchanged.
params = dict(
    # row / feature subsampling
    bagging_freq=5,
    bagging_fraction=0.335,
    boost_from_average='false',
    boost='gbdt',
    feature_fraction=0.041,
    # tree growth
    learning_rate=0.0053,
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=13,
    # execution
    num_threads=8,
    tree_learner='serial',
    objective='binary',
    verbosity=-1,
    gpu_platform_id=-1,
)

### Random over-sampling

In [6]:
# Oversample the full training data with RandomOverSampler (seeded with
# random_state=0 for repeatability).
# NOTE: X_resampled / y_resampled are reassigned by every sampler section in
# this notebook, so run the matching training cell right after this one.
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)

In [7]:
# Fit one LightGBM model per CV fold on the randomly oversampled data.
# NOTE(review): the data was oversampled before get_preds splits it into
# folds, so each validation fold also contains resampled rows; the logged
# validation AUC is likely optimistic versus untouched data -- confirm.
random_trained_models = get_preds(X_resampled, y_resampled, params)

Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.920294	valid_1's auc: 0.915505
[10000]	training's auc: 0.934396	valid_1's auc: 0.927559
[15000]	training's auc: 0.945706	valid_1's auc: 0.937056
[20000]	training's auc: 0.95526	valid_1's auc: 0.945119
[25000]	training's auc: 0.963304	valid_1's auc: 0.952058
[30000]	training's auc: 0.970102	valid_1's auc: 0.958076
[35000]	training's auc: 0.975699	valid_1's auc: 0.963216
[40000]	training's auc: 0.980349	valid_1's auc: 0.967655
[45000]	training's auc: 0.98412	valid_1's auc: 0.971439
[50000]	training's auc: 0.987216	valid_1's auc: 0.974739
[55000]	training's auc: 0.989739	valid_1's auc: 0.977526
[60000]	training's auc: 0.991776	valid_1's auc: 0.97994
[65000]	training's auc: 0.993425	valid_1's auc: 0.982034
[70000]	training's auc: 0.994753	valid_1's auc: 0.983844
[75000]	training's auc: 0.995824	valid_1's auc: 0.98542
[80000]	training's auc: 0.996682	valid_1's auc: 0.986787
[85000]	training's auc: 0.997

In [8]:
# Pickle the per-fold models to disk.
# NOTE(review): the file name says "cv10" but get_preds trains 5 folds -- confirm.
save_model(random_trained_models, 'random_aug_lgbm_models_cv10.m')

### SMOTE

In [9]:
# Oversample the full training data with SMOTE (seeded with random_state=0).
# NOTE: this overwrites X_resampled / y_resampled from the previous section,
# so run the matching training cell right after this one.
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=0)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [10]:
# Fit one LightGBM model per CV fold on the SMOTE-resampled data.
# NOTE(review): the data was resampled before get_preds splits it into folds,
# so each validation fold also contains resampled rows; the logged validation
# AUC may not reflect performance on untouched data -- confirm.
smote_trained_models = get_preds(X_resampled, y_resampled, params)

Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.97333	valid_1's auc: 0.813928
[10000]	training's auc: 0.980199	valid_1's auc: 0.834526
[15000]	training's auc: 0.983753	valid_1's auc: 0.846639
[20000]	training's auc: 0.986295	valid_1's auc: 0.855567
[25000]	training's auc: 0.988356	valid_1's auc: 0.862391
[30000]	training's auc: 0.99013	valid_1's auc: 0.86792
[35000]	training's auc: 0.991656	valid_1's auc: 0.873065
[40000]	training's auc: 0.992983	valid_1's auc: 0.877587
[45000]	training's auc: 0.994121	valid_1's auc: 0.881634
[50000]	training's auc: 0.995098	valid_1's auc: 0.885424
[55000]	training's auc: 0.995928	valid_1's auc: 0.888881
[60000]	training's auc: 0.996637	valid_1's auc: 0.892138
[65000]	training's auc: 0.997233	valid_1's auc: 0.895116
[70000]	training's auc: 0.997739	valid_1's auc: 0.897938
[75000]	training's auc: 0.99816	valid_1's auc: 0.900547
[80000]	training's auc: 0.998507	valid_1's auc: 0.902909
[85000]	training's auc: 0.998

In [11]:
# Pickle the per-fold models to disk.
# NOTE(review): the file name says "cv10" but get_preds trains 5 folds -- confirm.
save_model(smote_trained_models, 'smote_aug_lgbm_models_cv10.m')

### ADASYN

In [None]:
# Oversample the full training data with ADASYN (seeded with random_state=0).
# NOTE: this overwrites X_resampled / y_resampled from the previous section,
# so run the matching training cell right after this one.
from imblearn.over_sampling import  ADASYN
adasyn = ADASYN(random_state=0)
X_resampled, y_resampled = adasyn.fit_resample(X, y)

In [None]:
# Fit one LightGBM model per CV fold on the ADASYN-resampled data.
# NOTE(review): the data was resampled before get_preds splits it into folds,
# so each validation fold also contains resampled rows -- confirm this is the
# intended evaluation setup.
adasyn_trained_models = get_preds(X_resampled, y_resampled, params)

In [None]:
# Pickle the per-fold models to disk.
# NOTE(review): the file name says "cv10" but get_preds trains 5 folds -- confirm.
save_model(adasyn_trained_models, 'adasyn_aug_lgbm_models_cv10.m')

### Borderline SMOTE

In [None]:
# Oversample the full training data with BorderlineSMOTE (seeded with
# random_state=0).
# NOTE: this overwrites X_resampled / y_resampled from the previous section,
# so run the matching training cell right after this one.
from imblearn.over_sampling import BorderlineSMOTE
bsmote = BorderlineSMOTE(random_state=0)
X_resampled, y_resampled = bsmote.fit_resample(X, y)

In [None]:
# Fit one LightGBM model per CV fold on the BorderlineSMOTE-resampled data.
# NOTE(review): the data was resampled before get_preds splits it into folds,
# so each validation fold also contains resampled rows -- confirm this is the
# intended evaluation setup.
bsmote_trained_models = get_preds(X_resampled, y_resampled, params)

In [None]:
# Pickle the per-fold models to disk.
# NOTE(review): the file name says "cv10" but get_preds trains 5 folds -- confirm.
save_model(bsmote_trained_models, 'bsmote_aug_lgbm_models_cv10.m')