# do hyperparameter tuning for LightGBM

This notebook did not run to completion: the kernel force-restarted on its own several times.  
The reported results were obtained using `try_tuned_params.py`.

references:

- [LightGBM parameters](https://lightgbm.readthedocs.io/en/latest/Parameters.html)
- [tuning LightGBM](https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html)
- [optuna sample](https://github.com/optuna/optuna/blob/master/examples/lightgbm_simple.py)
- [optuna trial](https://optuna.readthedocs.io/en/stable/reference/trial.html)

## import libraries

In [1]:
import joblib
import multiprocessing
import optuna
import os

import lightgbm as lgb
import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## change working directory

In [2]:
# Move the working directory up one level; the relative paths used later
# ('data/...', 'models/...', 'output/...') are resolved against it.
# NOTE(review): not idempotent -- re-running this cell moves up another level.
os.chdir('..')

## define paths

In [2]:
# input CSV files (paths are relative to the current working directory)
train_transaction_data_path = 'data/train_transaction.csv'
train_identity_data_path = 'data/train_identity.csv'
test_transaction_data_path = 'data/test_transaction.csv'
test_identity_data_path = 'data/test_identity.csv'

# directories for this notebook's models and outputs
model_dir_path = 'models/lgbm/'
output_dir_path = 'output/lgbm/'

# create both directories up front; exist_ok makes re-running the cell harmless
os.makedirs(model_dir_path, exist_ok=True)
os.makedirs(output_dir_path, exist_ok=True)

## define utility function to reduce memory usage

In [3]:
def reduce_mem_usage(df, verbose=True):
    """
    Reduce dataframe size by downcasting its numeric columns

    Every int16/int32/int64 and float16/float32/float64/float128 column is
    cast to the smallest signed-integer / float type whose range contains the
    column's minimum and maximum (bounds inclusive). All other columns are
    left untouched, and so are columns without a finite range (all-NaN
    columns, or columns containing inf).

    NOTE(review): floats are downcast by range only, so a column that ends up
    as float16 keeps only about 3 significant decimal digits.

    params:
    - df: dataframe to reduce the size of (its columns are replaced in place)
    - verbose: prints the new memory usage and the reduction if True

    return:
    - dataframe of reduced size (the same object as df)
    """
    numerics = ('int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'float128')
    # candidate target types, smallest first; np.float128 is deliberately not
    # referenced because it does not exist on every platform
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32, np.float64)

    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        type_name = str(df[col].dtype)

        if type_name not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if type_name.startswith('int'):
            limits = [(target, np.iinfo(target)) for target in int_types]
        else:
            limits = [(target, np.finfo(target)) for target in float_types]

        for target, info in limits:
            # comparisons with NaN are False, so all-NaN columns never match
            if info.min <= c_min and c_max <= info.max:
                df[col] = df[col].astype(target)
                break

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # guard against a dataframe that reports zero memory
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            'Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, reduction
        ))

    return df

## list the useless features to drop (identified during feature selection)

In [4]:
# Columns removed from the merged dataframe at the start of preprocess().
# Grouped by feature family; the trailing comment on each row names the family.
useless_features = [
    'TransactionID',  # not really a feature
    'dist2',  # transaction features
    'C3',  # C features
    'D6', 'D7', 'D8', 'D9', 'D12', 'D13', 'D14',  # D features
    'M1',  # M features
    'id_07', 'id_08', 'id_18', 'id_21', 'id_22', 'id_23',  # id features
    'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_35',  # id features
    'V6', 'V8', 'V9', 'V10', 'V11', 'V14', 'V15', 'V16',  # V features
    'V18', 'V21', 'V22', 'V27', 'V28', 'V31', 'V32',  # V features
    'V41', 'V42', 'V46', 'V50', 'V51', 'V59', 'V65',  # V features
    'V68', 'V71', 'V72', 'V79', 'V80', 'V84', 'V85',  # V features
    'V88', 'V89', 'V92', 'V93', 'V95', 'V98', 'V101',  # V features
    'V104', 'V106', 'V107', 'V108', 'V109', 'V110',  # V features
    'V111', 'V112', 'V113', 'V114', 'V116', 'V117',  # V features
    'V118', 'V119', 'V120', 'V121', 'V122', 'V123',  # V features
    'V125', 'V138', 'V141', 'V142', 'V144', 'V146',  # V features
    'V147', 'V148', 'V151', 'V153', 'V154', 'V155',  # V features
    'V157', 'V158', 'V159', 'V161', 'V163', 'V164',  # V features
    'V166', 'V172', 'V173', 'V174', 'V175', 'V176',  # V features
    'V177', 'V178', 'V179', 'V180', 'V181', 'V182',  # V features
    'V183', 'V184', 'V185', 'V186', 'V190', 'V191',  # V features
    'V192', 'V193', 'V194', 'V195', 'V196', 'V197',  # V features
    'V198', 'V199', 'V214', 'V216', 'V220', 'V225',  # V features
    'V226', 'V227', 'V230', 'V233', 'V235', 'V236',  # V features
    'V237', 'V238', 'V239', 'V240', 'V241', 'V242',  # V features
    'V244', 'V246', 'V247', 'V248', 'V249', 'V250',  # V features
    'V252', 'V254', 'V255', 'V269', 'V276', 'V297',  # V features
    'V300', 'V302', 'V304', 'V305', 'V325', 'V327',  # V features
    'V328', 'V329', 'V330', 'V334', 'V335', 'V336',  # V features
    'V337', 'V338', 'V339',  # V features
]

## define function to disregard OS versions

In [5]:
def ignore_os_version(df, verbose: bool=True):
    """
    Collapse the id_30 column to the operating system family

    Each value is replaced by the first entry of os_list that occurs in it as
    a (case-sensitive) substring. Any other value, including missing or
    non-string values, becomes 'other'. The id_30 column of df is replaced
    in place.

    params:
    - df (DataFrame): has id_30 as one of its columns
    - verbose (bool): prints information if True

    return: dataframe, after os versions have been ignored
    """
    os_list = (
        'Android',
        'iOS',
        'Mac OS X',
        'Windows',
    )

    def to_known_os(operating_system):
        # non-strings (e.g. NaN for a missing value) fall through to 'other'
        if isinstance(operating_system, str):
            for known_os in os_list:
                if known_os in operating_system:
                    return known_os

        return 'other'

    # Series.map replaces a row-by-row Series.iteritems() / df.at loop:
    # iteritems() no longer exists in pandas >= 2.0, and map works by position
    # so it does not depend on the index labels being unique
    df['id_30'] = df['id_30'].map(to_known_os)

    if verbose:
        print('operating systems:', df.id_30.unique())

    return df

## define function to disregard browser versions

In [6]:
def ignore_browser_version(df, verbose: bool=True):
    """
    Collapse the id_31 column to the browser family

    Each value is replaced by the first entry of browser_list that occurs in
    it as a (case-sensitive) substring. Any other value, including missing or
    non-string values, becomes 'other'. The id_31 column of df is replaced
    in place.

    params:
    - df (DataFrame): has id_31 as one of its columns
    - verbose (bool): prints information if True

    return: dataframe, after browser versions have been ignored
    """
    # order matters: the first substring match wins
    browser_list = (
        'aol',
        'chrome',
        'chromium',
        'comodo',
        'cyberfox',
        'edge',
        'firefox',
        'icedragon',
        'ie',
        'iron',
        'maxthon',
        'opera',
        'palemoon',
        'puffin',
        'safari',
        'samsung',
        'seamonkey',
        'silk',
        'waterfox',
    )

    def to_known_browser(browser):
        # non-strings (e.g. NaN for a missing value) fall through to 'other'
        if isinstance(browser, str):
            for known_browser in browser_list:
                if known_browser in browser:
                    return known_browser

        return 'other'

    # Series.map replaces a row-by-row Series.iteritems() / df.at loop:
    # iteritems() no longer exists in pandas >= 2.0, and map works by position
    # so it does not depend on the index labels being unique
    df['id_31'] = df['id_31'].map(to_known_browser)

    if verbose:
        print('browsers:', df.id_31.unique())

    return df

## define function for preprocessing data

In [7]:
def preprocess(df, verbose: bool=True):
    """
    Does the following preprocessing steps, in this order:
    - drop the columns listed in useless_features
    - disregard os versions (id_30)
    - disregard browser versions (id_31)
    - object columns: convert to strings, then label-encode them
    - every other column: fill missing values with the column's
      quantile() result (pandas' default quantile, q=0.5)
    - reduce memory usage

    params:
    - df (DataFrame): dataframe to preprocess (has columns id_30 and id_31)
    - verbose (bool): prints information if True

    return: dataframe, preprocessing is complete
    """
    df = df.drop(useless_features, axis=1)
    df = ignore_os_version(df, verbose)
    df = ignore_browser_version(df, verbose)

    # one encoder instance, re-fitted from scratch for every object column
    encoder = LabelEncoder()

    for name in df.columns:
        if df[name].dtype != 'object':
            fill_value = df[name].quantile().mean()
            df[name] = df[name].fillna(fill_value)
            continue

        # astype(str) also turns missing values into a string, so they get
        # a label of their own
        df[name] = encoder.fit_transform(df[name].astype(str))

    return reduce_mem_usage(df, verbose)

## load and preprocess training data

In [8]:
%%time

# read both training tables, shrinking each one right after it is loaded
train_transactions = reduce_mem_usage(pd.read_csv(train_transaction_data_path))
train_identities = reduce_mem_usage(pd.read_csv(train_identity_data_path))

# outer join; no key is given, so pandas joins on the columns both tables share
dataframe = train_transactions.merge(train_identities, how='outer')

# the separate tables are no longer needed once merged
del train_transactions, train_identities

print(f'number of rows in training data: {len(dataframe)}')
dataframe = preprocess(dataframe)
dataframe.head()

Mem. usage decreased to 542.35 Mb (69.4% reduction)
Mem. usage decreased to 25.86 Mb (42.7% reduction)
number of rows in training data: 590540
operating systems:['other' 'Android' 'iOS' 'Mac OS X' 'Windows']
browsers:['other' 'samsung' 'safari' 'chrome' 'edge' 'firefox' 'ie' 'opera' 'aol'
 'silk' 'waterfox' 'puffin' 'cyberfox' 'palemoon' 'maxthon' 'iron'
 'seamonkey' 'comodo' 'chromium' 'icedragon']
Mem. usage decreased to 357.35 Mb (22.1% reduction)
CPU times: user 1min 16s, sys: 37.7 s, total: 1min 53s
Wall time: 1min 53s


Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_30,id_31,id_32,id_33,id_34,id_36,id_37,id_38,DeviceType,DeviceInfo
0,0,86400,68.5,4,13926,361.0,150.0,1,142.0,1,...,4,12,24.0,260,4,2,2,2,2,1742
1,0,86401,29.0,4,2755,404.0,150.0,2,102.0,1,...,4,12,24.0,260,4,2,2,2,2,1742
2,0,86469,59.0,4,4663,490.0,150.0,4,166.0,2,...,4,12,24.0,260,4,2,2,2,2,1742
3,0,86499,50.0,4,18132,567.0,150.0,2,117.0,2,...,4,12,24.0,260,4,2,2,2,2,1742
4,0,86506,50.0,1,4497,514.0,150.0,2,102.0,1,...,0,16,32.0,164,3,0,1,1,1,954


## split into training and validation sets

In [9]:
# separate the target column from the features
features_dataframe = dataframe.drop('isFraud', axis=1)
is_fraud_data = dataframe['isFraud']

del dataframe

# fixed random_state: the same rows end up in the validation set on every
# run, so trial scores stay comparable if the notebook is re-run
# (0 matches the 'seed' used in the LightGBM params)
train_features, val_features, train_target, val_target = train_test_split(
    features_dataframe,
    is_fraud_data,
    test_size=0.2,
    random_state=0,
)

train_data = lgb.Dataset(train_features, train_target)
# validation data is declared against the training Dataset, as the
# LightGBM documentation asks for validation sets
val_data = lgb.Dataset(val_features, val_target, reference=train_data)

# val_features / val_target stay around: objective() scores against them
del train_features
del train_target

## define objective function

In [10]:
def objective(trial):
    """
    Optuna objective: train LightGBM with sampled parameters and score it

    Reads the notebook globals train_data, val_data, val_features and
    val_target.

    params:
    - trial (optuna trial): used to sample the hyperparameters

    return: ROC AUC of the trained model's predictions on the validation set
    """
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'feature_pre_filter': False,
        'seed': 0,
        'early_stopping_round': 500,
        'num_iterations': 10000,
        'boosting': 'gbdt',  # gbdt > dart; don't know about goss
        'device_type': 'gpu',
        'gpu_use_dp': True,

        'num_leaves': trial.suggest_int('num_leaves', 300, 1500),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 60, 300),

        # suggest_float replaces the deprecated suggest_uniform /
        # suggest_loguniform (log=True samples in the log domain)
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 40, 120),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),

        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.01),

        'lambda_l1': trial.suggest_float('lambda_l1', 1e-6, 1.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-7, 0.1, log=True),
        'min_split_gain': trial.suggest_float('min_split_gain', 1e-8, 1.0, log=True),
    }

    # periodic evaluation logging goes through a callback because the
    # verbose_eval argument of lgb.train was removed in LightGBM 4.0;
    # older releases expose the same callback as print_evaluation
    make_log_callback = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

    classifier = lgb.train(
        params,
        train_data,
        valid_sets=[val_data],
        callbacks=[make_log_callback(1000)],
    )

    prediction = classifier.predict(val_features)
    return roc_auc_score(val_target, prediction)

## optimise

In [11]:
%%time

# keep the study in an SQLite file so finished trials survive a kernel
# restart; re-running this cell resumes the stored study and runs up to
# 10 further trials
study_storage = 'sqlite:///' + os.path.join(output_dir_path, 'lgbm_tuning.db')

study = optuna.create_study(
    study_name='lgbm_tuning',
    storage=study_storage,
    load_if_exists=True,
    direction='maximize',
)
study.optimize(objective, n_trials=10)

Training until validation scores don't improve for 500 rounds
[1000]	valid_0's auc: 0.964825
[2000]	valid_0's auc: 0.971581
[3000]	valid_0's auc: 0.973261
[4000]	valid_0's auc: 0.973699
[5000]	valid_0's auc: 0.973866
Early stopping, best iteration is:
[4924]	valid_0's auc: 0.973884
[I 2020-04-26 16:30:57,398] Finished trial#0 with value: 0.9738843115738256 with parameters: {'num_leaves': 1003, 'min_data_in_leaf': 128, 'bagging_fraction': 0.7980799793848067, 'bagging_freq': 74, 'feature_fraction': 0.7936523294908615, 'learning_rate': 0.004900360872407177, 'lambda_l1': 0.7470253151714737, 'lambda_l2': 0.0001989106219058993, 'min_split_gain': 2.4180574527860886e-05}. Best is trial#0 with value: 0.9738843115738256.
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's auc: 0.965643
[2000]	valid_0's auc: 0.972806
[3000]	valid_0's auc: 0.974905
[4000]	valid_0's auc: 0.975783
[5000]	valid_0's auc: 0.976404
[6000]	valid_0's auc: 0.976535
Early stopping, best iteration 

### possible best parameters

Finished trial#0 with value: 0.9738843115738256 with parameters: {'num_leaves': 1003, 'min_data_in_leaf': 128, 'bagging_fraction': 0.7980799793848067, 'bagging_freq': 74, 'feature_fraction': 0.7936523294908615, 'learning_rate': 0.004900360872407177, 'lambda_l1': 0.7470253151714737, 'lambda_l2': 0.0001989106219058993, 'min_split_gain': 2.4180574527860886e-05}.

Finished trial#1 with value: 0.9765724053029845 with parameters: {'num_leaves': 395, 'min_data_in_leaf': 63, 'bagging_fraction': 0.9904031026433603, 'bagging_freq': 52, 'feature_fraction': 0.8261886155632686, 'learning_rate': 0.005744872565276946, 'lambda_l1': 0.001973309169415503, 'lambda_l2': 0.07395166262918815, 'min_split_gain': 0.00042833111881948957}.

Finished trial#2 with value: 0.9695899574903211 with parameters: {'num_leaves': 1242, 'min_data_in_leaf': 174, 'bagging_fraction': 0.6561500738099412, 'bagging_freq': 42, 'feature_fraction': 0.8573933020623068, 'learning_rate': 0.0024942510755431497, 'lambda_l1': 1.3942948107225643e-05, 'lambda_l2': 2.87261729998794e-07, 'min_split_gain': 2.7034128429582066e-05}.

Finished trial#3 with value: 0.9752841433665376 with parameters: {'num_leaves': 357, 'min_data_in_leaf': 239, 'bagging_fraction': 0.8981245960747626, 'bagging_freq': 42, 'feature_fraction': 0.6757366405684708, 'learning_rate': 0.002643965632778826, 'lambda_l1': 0.00020908461399848322, 'lambda_l2': 2.282598591643265e-06, 'min_split_gain': 0.00539175228082824}.

Finished trial#4 with value: 0.9708231123430042 with parameters: {'num_leaves': 1344, 'min_data_in_leaf': 218, 'bagging_fraction': 0.6065496321290093, 'bagging_freq': 118, 'feature_fraction': 0.6640283101725976, 'learning_rate': 0.007935010372959427, 'lambda_l1': 0.011638086567711713, 'lambda_l2': 0.0010267096855244486, 'min_split_gain': 0.0020971974354108075}.

Finished trial#5 with value: 0.9763266077067514 with parameters: {'num_leaves': 403, 'min_data_in_leaf': 249, 'bagging_fraction': 0.9861010565253743, 'bagging_freq': 80, 'feature_fraction': 0.6195221945260142, 'learning_rate': 0.008060560032932075, 'lambda_l1': 0.00044378050004232624, 'lambda_l2': 0.000699112895547415, 'min_split_gain': 0.0015068062433125815}.

Finished trial#6 with value: 0.9739542439040608 with parameters: {'num_leaves': 839, 'min_data_in_leaf': 170, 'bagging_fraction': 0.8244253697699655, 'bagging_freq': 53, 'feature_fraction': 0.8547519124735501, 'learning_rate': 0.0019732169948411624, 'lambda_l1': 2.4326283506019405e-06, 'lambda_l2': 0.0010601004902769668, 'min_split_gain': 6.8015876865516326e-06}.

## Kaggle result

~|reference params|trial 1 (best)|trial 5 (2nd)|trial 3 (3rd)
-|-|-|-|-
public score|0.939755|0.938188|0.934179|0.936401
private score|0.911989|0.907923|0.905338|0.903841