# Helper

# In [2]:
def save_model(models, filename):
    """Pickle ``models`` into the file at ``filename``.

    The file is opened in binary write mode, so existing content at that
    path is overwritten. Nothing is returned.
    """
    with open(filename, mode='wb') as sink:
        pickle.Pickler(sink).dump(models)
        
def load_model(filename):
    """Return the object unpickled from the file at ``filename``.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    with open(filename, mode='rb') as source:
        return pickle.Unpickler(source).load()
        
def _shuffle_columns(rows):
    """Return a copy of ``rows`` in which every column is permuted independently.

    One index vector is shuffled again before each column (same sequence of
    ``np.random.shuffle`` calls as the original inline loops), so results
    under a fixed NumPy seed are unchanged.
    """
    shuffled = rows.copy()
    order = np.arange(shuffled.shape[0])
    for col in range(shuffled.shape[1]):
        np.random.shuffle(order)
        # Index the single column directly instead of materialising a
        # row-permuted copy of the whole matrix for every column.
        shuffled[:, col] = rows[order, col]
    return shuffled


def augment(x,y,t=2):
    """Append column-shuffled copies of each class to ``(x, y)``.

    Rows with ``y > 0`` are duplicated ``t`` times and rows with ``y == 0``
    are duplicated ``t // 2`` times. Inside every duplicate each column is
    shuffled independently among the rows of that class, so the per-column
    values of the class are kept but recombined across rows.

    Args:
        x: 2-D NumPy array of features, one row per sample.
        y: 1-D NumPy array of labels aligned with the rows of ``x``.
        t: number of shuffled copies of the positive rows; negatives get
            ``t // 2`` copies. Values below 2 (including 0) are accepted and
            simply produce fewer (or no) extra rows.

    Returns:
        ``(x_aug, y_aug)``: the original rows first, then the positive
        copies (label 1.0), then the negative copies (label 0.0). The
        inputs are not modified.
    """
    # The class masks do not change between rounds, so select each class once.
    positives = x[y > 0]
    negatives = x[y == 0]

    # Positives are generated before negatives to keep the random stream in
    # the same order as before.
    xs = [_shuffle_columns(positives) for _ in range(t)]
    xn = [_shuffle_columns(negatives) for _ in range(t // 2)]

    ys = np.ones(sum(block.shape[0] for block in xs))
    yn = np.zeros(sum(block.shape[0] for block in xn))

    # Stacking everything in one call avoids np.vstack([]) on an empty list,
    # which used to raise ValueError whenever t < 2.
    x = np.vstack([x, *xs, *xn])
    y = np.concatenate([y, ys, yn])
    return x,y

def _train_lgb_folds(X, y, params, n_folds, augment_train):
    """Train one LightGBM model per stratified fold and return them all.

    Shared implementation of ``lgb_trainer`` and ``lgb_trainer_no_aug``; the
    only difference between the two is whether the training part of each
    fold is passed through ``augment`` first.
    """
    skf = StratifiedKFold(n_splits=n_folds)
    # Convert to arrays once instead of on every access inside the loop.
    X_values, y_values = X.values, y.values
    models = []
    for train_idx, test_idx in tqdm(skf.split(X_values, y_values)):
        gc.collect()
        X_tr, y_tr = X_values[train_idx], y_values[train_idx]
        if augment_train:
            X_tr, y_tr = augment(X_tr, y_tr)
        trn_data = lgb.Dataset(pd.DataFrame(X_tr), label=y_tr)
        test_data = lgb.Dataset(X_values[test_idx], label=y_values[test_idx])
        # NOTE(review): `verbose_eval` and `early_stopping_rounds` are no
        # longer accepted by lgb.train from LightGBM 4.0 on (callbacks are
        # used instead) - confirm the pinned LightGBM version.
        model_lgb = lgb.train(
            params,
            trn_data,
            1000000,
            valid_sets=[trn_data, test_data],
            verbose_eval=5000,
            early_stopping_rounds=4000,
        )
        models.append(model_lgb)
    return models


def lgb_trainer(X, y, params, n_folds):
    """Train ``n_folds`` LightGBM models with augmented training folds.

    Args:
        X: feature table exposing ``.values`` (e.g. a pandas DataFrame).
        y: label vector exposing ``.values`` (e.g. a pandas Series).
        params: parameter dict forwarded to ``lgb.train``.
        n_folds: number of ``StratifiedKFold`` splits.

    Returns:
        List with the model trained on each fold. Only the training part of
        a fold is augmented; the held-out part is used untouched as the
        second validation set.
    """
    return _train_lgb_folds(X, y, params, n_folds, augment_train=True)


def lgb_trainer_no_aug(X, y, params, n_folds):
    """Train ``n_folds`` LightGBM models without augmenting the folds.

    Takes the same arguments and returns the same kind of list as
    ``lgb_trainer``; the training rows of each fold are used as they are.
    """
    return _train_lgb_folds(X, y, params, n_folds, augment_train=False)

def lgb_trainer_for_bayesian_optim(X, y, params):
    """Fit one LightGBM model on a fixed 80/20 split and return its AUC.

    The split uses ``random_state=42``, so every call sees the same
    train/validation partition. The returned value is the ROC AUC of the
    model's predictions on the 20% validation part.
    """
    X_fit, X_hold, y_fit, y_hold = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    fit_set = lgb.Dataset(X_fit, label=y_fit)
    hold_set = lgb.Dataset(X_hold, label=y_hold)
    booster = lgb.train(
        params,
        fit_set,
        1000000,
        valid_sets=[fit_set, hold_set],
        verbose_eval=5000,
        early_stopping_rounds=4000,
    )
    return roc_auc_score(y_hold, booster.predict(X_hold))

def test_f(X, y, models):
    """Score each model and the mean of their predictions on ``(X, y)``.

    Prints the ROC AUC of every model and of the row-wise mean of all
    predictions.

    Returns:
        ``(score, averaged_preds, preds)``: the AUC of the averaged
        predictions, the averaged predictions themselves, and a DataFrame
        with one prediction column per model, named ``"0"``, ``"1"``, ...
    """
    preds = pd.DataFrame({})
    for fold, booster in enumerate(models):
        column = str(fold)
        preds[column] = booster.predict(X)
        fold_auc = roc_auc_score(y, preds[column].values)
        print(f"Fold: {fold} \t Score: {fold_auc}")

    averaged_preds = preds.mean(axis=1)
    ensemble_auc = roc_auc_score(y, averaged_preds)
    print(f"Score: {ensemble_auc}")
    return ensemble_auc, averaged_preds, preds

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import pickle
import gc
import seaborn as sns
import scikitplot as skplt
from tqdm import tqdm_notebook as tqdm
import warnings
# Install a blanket "ignore" filter so warnings are not displayed.
warnings.filterwarnings("ignore")
# Relative paths of the CSV data files.
TRAIN = 'data/train.csv'
# TEST = 'data/test.csv'
# Active test file; the alternative path is kept commented out above.
TEST = 'data/da_real_test.csv'
SAMPLE = 'data/sample_submission.csv'

# In [3]:
def augment_uniques(data):
    """Add a ``count_<feature>`` column for every feature column of ``data``.

    For each column except ``ID_code`` and ``target``, the number of
    occurrences of every distinct value is computed and mapped back onto the
    rows. ``data`` is modified in place.

    Returns:
        ``(data, maps)``: the same DataFrame with the new columns, and a
        dict mapping each feature name to its ``{value: count}`` table.
    """
    skipped = ('ID_code', 'target')
    maps = {}
    for feature in tqdm(data.columns):
        if feature not in skipped:
            values, counts = np.unique(data[feature], return_counts=True)
            maps[feature] = dict(zip(values, counts))
            data[f'count_{feature}'] = data[feature].map(maps[feature])
    return data, maps