In [1]:
import gc
import warnings
warnings.filterwarnings('ignore')
import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools
from itertools import combinations

import random
import joblib
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

from pd.params import *



In [7]:
def amex_metric(y_true, y_pred, return_components=False):
    """Weighted competition score: mean of a normalized gini and a top-4% capture rate.

    Rows whose true label is 0 carry a weight of 20, every other row a weight of 1.

    Args:
        y_true: 1-D sequence of true labels (0 / 1).
        y_pred: 1-D sequence of predicted scores, same length as ``y_true``.
        return_components: when true, also return the two parts of the score.

    Returns:
        ``score`` alone, or ``(score, normalized_gini, top_four)`` when
        ``return_components`` is true, where ``score = 0.5 * (normalized_gini + top_four)``.
    """

    def _ranked(sort_col):
        # Stack (label, prediction) as rows, then order rows by the chosen column, descending.
        pairs = np.transpose(np.array([y_true, y_pred]))
        return pairs[pairs[:, sort_col].argsort()[::-1]]

    def _weighted_gini(sort_col):
        # Area between the weighted cumulative share of positives and the
        # cumulative share of weight, for the ordering induced by `sort_col`.
        ranked = _ranked(sort_col)
        target = ranked[:, 0]
        row_weight = np.where(target == 0, 20, 1)
        cum_weight_share = np.cumsum(row_weight / np.sum(row_weight))
        weighted_hits = target * row_weight
        lorentz = np.cumsum(weighted_hits) / np.sum(weighted_hits)
        return np.sum((lorentz - cum_weight_share) * row_weight)

    # Share of all positives found among the highest-scored rows whose
    # cumulative weight stays within 4% of the total weight.
    by_pred = _ranked(1)
    cut_weight = np.where(by_pred[:, 0] == 0, 20, 1)
    within_cutoff = np.cumsum(cut_weight) <= int(0.04 * np.sum(cut_weight))
    captured = by_pred[within_cutoff]
    top_four = np.sum(captured[:, 0]) / np.sum(by_pred[:, 0])

    # Gini under the predicted ordering, normalized by the gini of the
    # ordering by true label (column 0).
    normalized_gini = _weighted_gini(1) / _weighted_gini(0)

    score = 0.5 * (normalized_gini + top_four)
    if return_components:
        return score, normalized_gini, top_four
    return score

def lgb_amex_metric(y_pred, y_true):
    """Adapter that exposes ``amex_metric`` as a LightGBM custom eval function.

    Args:
        y_pred: predicted scores handed over by LightGBM.
        y_true: object providing ``get_label()`` (the evaluated dataset).

    Returns:
        ``(name, score, True)``: the name embeds the gini and recall components
        rounded to three decimals, and ``True`` flags that higher is better.
    """
    labels = y_true.get_label()
    components = amex_metric(labels, y_pred, return_components=True)
    score, gini, recall = components
    eval_name = f'amex_metric gini {gini:.3f} recall {recall:.3f}'
    return eval_name, score, True


In [10]:
# LightGBM hyper-parameters shared by every training call in this notebook.
# NOTE(review): LightGBM's Python callback reports that early stopping is not
# available in dart mode, so `early_stopping_rounds` passed alongside these
# params is presumably ignored; confirm against the installed version.
params = dict(
    # task
    objective='binary',
    metric="binary_logloss",
    boosting='dart',
    seed=42,
    # tree shape and step size
    num_leaves=100,
    learning_rate=0.01,
    # column / row subsampling
    feature_fraction=0.20,
    bagging_freq=10,
    bagging_fraction=0.50,
    n_jobs=-1,
    # regularisation
    lambda_l2=8,
    min_data_in_leaf=40,
)

In [5]:
# Load the "train_data_all" array from OUTDIR and swap its last two axes,
# then load the matching labels.
# NOTE(review): the next cell assigns the same two names from the
# "train13_raw_all" files; confirm which dataset should feed training.
train_data = np.transpose(np.load(OUTDIR + "train_data_all.npy"), (0, 2, 1))
train_labels = np.load(OUTDIR + "train_labels_all.npy")


In [4]:
# Load the "train13_raw_all" feature array and its labels from OUTDIR.
# NOTE(review): this cell assigns the same names (train_data, train_labels) as
# the cell above that loads "train_data_all.npy". The execution counters
# (In [4] here, In [5] above) suggest this one ran first and was overwritten;
# confirm which dataset is meant to feed the training cells and drop the other.
train_data = np.load(OUTDIR+"train13_raw_all_data.npy")
train_labels = np.load(OUTDIR+"train13_raw_all_labels.npy")

In [6]:

# Hold out one ninth of the rows for validation; the fixed random_state keeps
# the shuffled split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    train_labels,
    test_size=1/9,
    shuffle=True,
    random_state=0,
)
validation_data = X_test, y_test


In [9]:
# Flatten every sample to a single feature row before handing it to LightGBM.
lgb_train = lgb.Dataset(data=X_train.reshape((X_train.shape[0], -1)), label=y_train)
lgb_valid = lgb.Dataset(data=X_test.reshape((X_test.shape[0], -1)), label=y_test)

# Short run (1500 rounds), evaluated on both the training and the held-out set
# with the built-in logloss plus the custom amex metric.
# NOTE(review): `params` selects 'boosting': 'dart'; LightGBM reports that early
# stopping is not available in dart mode, so early_stopping_rounds is presumably
# a no-op here. Confirm.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=1500,
    valid_sets=[lgb_train, lgb_valid],
    feval=lgb_amex_metric,
    early_stopping_rounds=100,
    verbose_eval=100,
)
        

[LightGBM] [Info] Number of positive: 79627, number of negative: 263514
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 328368
[LightGBM] [Info] Number of data points in the train set: 343141, number of used features: 2425
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.232053 -> initscore=-1.196753
[LightGBM] [Info] Start training from score -1.196753
[100]	training's binary_logloss: 0.447653	training's amex_metric gini 0.918 recall 0.656: 0.787208	valid_1's binary_logloss: 0.449236	valid_1's amex_metric gini 0.911 recall 0.631: 0.770685
[200]	training's binary_logloss: 0.430692	training's amex_metric gini 0.919 recall 0.661: 0.790093	valid_1's binary_logloss: 0.433192	valid_1's amex_metric gini 0.911 recall 0.635: 0.77314
[300]	training's binary_logloss: 0.3763	training's amex_metric gini 0.921 recall 0.668: 0.794409	valid_1's binary_logloss: 0.379827	valid_1's amex_metric gini 0.

In [12]:
# Same setup as the 1500-round cell above, only with a 10500-round budget.
# NOTE(review): this cell duplicates the previous one except for
# num_boost_round; consider a single helper taking the round count.
lgb_train = lgb.Dataset(data=X_train.reshape((X_train.shape[0], -1)), label=y_train)
lgb_valid = lgb.Dataset(data=X_test.reshape((X_test.shape[0], -1)), label=y_test)

# NOTE(review): `params` selects 'boosting': 'dart'; LightGBM reports that early
# stopping is not available in dart mode, so all 10500 rounds presumably run.
# Confirm.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=10500,
    valid_sets=[lgb_train, lgb_valid],
    feval=lgb_amex_metric,
    early_stopping_rounds=100,
    verbose_eval=100,
)
        

[LightGBM] [Info] Number of positive: 79627, number of negative: 263514
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 328368
[LightGBM] [Info] Number of data points in the train set: 343141, number of used features: 2425
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.232053 -> initscore=-1.196753
[LightGBM] [Info] Start training from score -1.196753
[100]	training's binary_logloss: 0.448219	training's amex_metric gini 0.917 recall 0.652: 0.784676	valid_1's binary_logloss: 0.449789	valid_1's amex_metric gini 0.910 recall 0.628: 0.769098
[200]	training's binary_logloss: 0.431256	training's amex_metric gini 0.919 recall 0.658: 0.788337	valid_1's binary_logloss: 0.433742	valid_1's amex_metric gini 0.911 recall 0.635: 0.772682
[300]	training's binary_logloss: 0.376935	training's amex_metric gini 0.920 recall 0.666: 0.7932	valid_1's binary_logloss: 0.380447	valid_1's amex_metric gini 0

In [13]:

# Persist the booster trained above; the list of written paths is the cell output.
# (The file name has no placeholders, so a plain string literal is used.)
joblib.dump(model, OUTDIR + 'lgbm13.pkl')


['/Users/nimamanaf/Desktop/kaggle/pd/data/out/lgbm13.pkl']

In [11]:
def train_and_evaluate(train, test=None, n_folds=5, seed=42):
    """Train one LightGBM model per stratified fold and save OOF / test predictions.

    Steps:
      1. Label-encode the ``<name>_last`` column for every name in ``CATCOLS``.
      2. Add a ``<col>_round2`` copy (rounded to 2 decimals) of every float
         column whose name contains ``'last'``.
      3. Run a stratified K-fold loop, training with the module-level ``params``
         and ``lgb_amex_metric``, dumping each fold's model under
         ``OUTDIR + 'Models/'``.
      4. Write out-of-fold predictions (and averaged test predictions when
         ``test`` is given) as CSV files under ``OUTDIR``.

    Args:
        train: DataFrame with ``customer_ID``, ``target`` and feature columns.
            NOTE: it is modified in place (encoded categoricals, added
            ``_round2`` columns), so calling this twice on the same frame
            stacks further ``_round2_round2`` columns.
        test: optional DataFrame with the same feature columns; also modified
            in place. When ``None``, no test prediction is computed or written.
        n_folds: number of stratified folds.
        seed: ``random_state`` of the fold splitter; also used in file names.
            NOTE(review): the LightGBM seed comes from ``params['seed']``, not
            from this argument. Confirm that is intended.

    Returns:
        The model trained on the last fold.
    """
    import os  # local import: only needed to make sure the model folder exists

    has_test = test is not None

    cat_features = [f"{cf}_last" for cf in CATCOLS]
    for cat_col in cat_features:
        encoder = LabelEncoder()
        train[cat_col] = encoder.fit_transform(train[cat_col])
        if has_test:
            # transform() raises on categories that were not seen in `train`
            test[cat_col] = encoder.transform(test[cat_col])

    # Round last float features to 2 decimal place
    float_mask = (train.dtypes == 'float32') | (train.dtypes == 'float64')
    num_cols = [col for col in train.dtypes[float_mask].index if 'last' in col]
    for col in num_cols:
        train[col + '_round2'] = train[col].round(2)
        if has_test:
            test[col + '_round2'] = test[col].round(2)

    # Get feature list
    features = [col for col in train.columns if col not in ['customer_ID', "S_2", "target"]]

    # Accumulator for fold-averaged test predictions. Only allocated when a test
    # frame exists; len(None) used to raise TypeError with the default test=None.
    test_predictions = np.zeros(len(test)) if has_test else None
    # Create a numpy array to store out of folds predictions
    oof_predictions = np.zeros(len(train))

    # Create the model folder up front so a missing directory cannot abort the
    # run after a whole fold has already been trained.
    model_dir = OUTDIR + 'Models/'
    os.makedirs(model_dir, exist_ok=True)

    kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train["target"])):
        print(' ')
        print('-'*50)
        print(f'Training fold {fold} with {len(features)} features...')
        x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
        y_train, y_val = train["target"].iloc[trn_ind], train["target"].iloc[val_ind]
        lgb_train = lgb.Dataset(x_train, y_train, categorical_feature = cat_features)
        lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature = cat_features)
        # NOTE(review): `params` uses 'boosting': 'dart', for which LightGBM
        # reports early stopping as unavailable; early_stopping_rounds is
        # presumably a no-op here. Confirm.
        model = lgb.train(
            params = params,
            train_set = lgb_train,
            num_boost_round = 10500,
            valid_sets = [lgb_train, lgb_valid],
            early_stopping_rounds = 100,
            verbose_eval = 100,
            feval = lgb_amex_metric
            )
        # Persist this fold's model
        joblib.dump(model, model_dir + f'lgbm_fold{fold}_seed{seed}.pkl')
        val_pred = model.predict(x_val) # Predict validation
        oof_predictions[val_ind] = val_pred  # Add to out of folds array
        if has_test:
            test_pred = model.predict(test[features]) # Predict the test set
            test_predictions += test_pred/n_folds
        # Compute fold metric
        score = amex_metric(y_val, val_pred)
        print(f'Our fold {fold} CV score is {score}')
        # Free the per-fold frames before the next iteration
        del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
        gc.collect()

    score = amex_metric(train["target"], oof_predictions)  # Compute out of folds metric
    print(f'Our out of folds CV score is {score}')
    # Create a dataframe to store out of folds predictions
    oof_df = pd.DataFrame({'customer_ID': train['customer_ID'], 'target': train["target"], 'prediction': oof_predictions})
    oof_df.to_csv(OUTDIR+f'oof_lgbm_baseline_{n_folds}fold_seed{seed}.csv', index=False)
    # Create a dataframe to store test prediction
    if has_test:
        test_df = pd.DataFrame({'customer_ID': test['customer_ID'], 'prediction': test_predictions})
        # Written next to the OOF file under OUTDIR; the previous hard-coded
        # Colab path (/content/drive/...) does not exist outside Colab.
        test_df.to_csv(OUTDIR+f'test_lgbm_baseline_{n_folds}fold_seed{seed}.csv', index = False)

    return model