# do average voting using multiple LightGBM classifiers

## import libraries

In [1]:
import joblib
import os
import time

import lightgbm as lgb
import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## change working directory

In [2]:
# Switch to the parent directory so that the relative 'data/', 'models/' and
# 'output/' paths defined below resolve against it.
# NOTE(review): presumably the notebook lives one level below the project root - confirm.
os.chdir('..')

## define paths

In [3]:
# Directory the trained base classifiers are written to / read from, and the
# CSV file the averaged prediction is saved to.
model_dir_path = 'models/voting/'
output_path = 'output/final_prediction.csv'

# Create the output directories up front; exist_ok makes re-running the cell safe.
os.makedirs(model_dir_path, exist_ok=True)
os.makedirs('output/', exist_ok=True)

# Input CSV files (transaction and identity tables for train and test).
train_transaction_data_path = 'data/train_transaction.csv'
train_identity_data_path = 'data/train_identity.csv'
test_transaction_data_path = 'data/test_transaction.csv'
test_identity_data_path = 'data/test_identity.csv'

## list useless features (identified during feature selection)

In [4]:
# Columns removed from the data by preprocess() before any feature engineering.
useless_features = [
    'TransactionID',  # not really a feature
    'dist2',  # transaction features
    'C3',  # C features
    'D6', 'D7', 'D8', 'D9', 'D12', 'D13', 'D14',  # D features
    'M1',  # M features
    'id_07', 'id_08', 'id_18', 'id_21', 'id_22', 'id_23',  # id features
    'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_35',  # id features
    'V6', 'V8', 'V9', 'V10', 'V11', 'V14', 'V15', 'V16',  # V features
    'V18', 'V21', 'V22', 'V27', 'V28', 'V31', 'V32',  # V features
    'V41', 'V42', 'V46', 'V50', 'V51', 'V59', 'V65',  # V features
    'V68', 'V71', 'V72', 'V79', 'V80', 'V84', 'V85',  # V features
    'V88', 'V89', 'V92', 'V93', 'V95', 'V98', 'V101',  # V features
    'V104', 'V106', 'V107', 'V108', 'V109', 'V110',  # V features
    'V111', 'V112', 'V113', 'V114', 'V116', 'V117',  # V features
    'V118', 'V119', 'V120', 'V121', 'V122', 'V123',  # V features
    'V125', 'V138', 'V141', 'V142', 'V144', 'V146',  # V features
    'V147', 'V148', 'V151', 'V153', 'V154', 'V155',  # V features
    'V157', 'V158', 'V159', 'V161', 'V163', 'V164',  # V features
    'V166', 'V172', 'V173', 'V174', 'V175', 'V176',  # V features
    'V177', 'V178', 'V179', 'V180', 'V181', 'V182',  # V features
    'V183', 'V184', 'V185', 'V186', 'V190', 'V191',  # V features
    'V192', 'V193', 'V194', 'V195', 'V196', 'V197',  # V features
    'V198', 'V199', 'V214', 'V216', 'V220', 'V225',  # V features
    'V226', 'V227', 'V230', 'V233', 'V235', 'V236',  # V features
    'V237', 'V238', 'V239', 'V240', 'V241', 'V242',  # V features
    'V244', 'V246', 'V247', 'V248', 'V249', 'V250',  # V features
    'V252', 'V254', 'V255', 'V269', 'V276', 'V297',  # V features
    'V300', 'V302', 'V304', 'V305', 'V325', 'V327',  # V features
    'V328', 'V329', 'V330', 'V334', 'V335', 'V336',  # V features
    'V337', 'V338', 'V339',  # V features
]


## define utility function to reduce memory usage

In [5]:
def reduce_mem_usage(df, verbose=True):
    """
    Reduce dataframe size by downcasting numeric columns to the smallest
    numpy dtype whose value range contains the column's min and max

    The dataframe is modified in place and is also returned. Only columns
    whose dtype matches one of the names in `numerics` are considered;
    every other column (objects, categories, unsigned ints, ...) is left
    untouched. A column whose min/max cannot be compared against any range
    (e.g. all NaN) keeps its dtype.

    NOTE: float columns may be narrowed to float16, which keeps only about
    three significant decimal digits, so stored values can get rounded.

    params:
    - df: dataframe to reduce the size of
    - verbose: prints the resulting memory usage if True

    return:
    - dataframe of reduced size
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'float128']

    # candidate target dtypes, ordered from narrowest to widest
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32, np.float64)

    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes

        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            candidates, limits_of = int_candidates, np.iinfo
        else:
            candidates, limits_of = float_candidates, np.finfo

        for candidate in candidates:
            limits = limits_of(candidate)

            # inclusive bounds: a value equal to the dtype's min/max still fits;
            # comparisons against NaN are False, so such columns are skipped
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # guard against a zero-sized frame so the percentage stays well defined
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            'Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, reduction
        ))

    return df

## define function to load training data

In [6]:
def load_training_data() -> pd.DataFrame:
    """
    Read the training transaction and identity CSV files, shrink each with
    reduce_mem_usage and outer-merge them on their shared columns

    return:
    - merged training dataframe
    """
    # transaction table first, identity table second (same order as the log output)
    transactions, identities = [
        reduce_mem_usage(pd.read_csv(csv_path))
        for csv_path in (train_transaction_data_path, train_identity_data_path)
    ]

    merged = transactions.merge(identities, how='outer')
    print(f'number of rows in training data: {len(merged)}')

    return merged

## define function to load test data

In [7]:
def load_test_data():
    """
    Read the test transaction and identity CSV files, shrink each with
    reduce_mem_usage and outer-merge them on their shared columns

    Hyphens in the identity column names are replaced by underscores
    before merging.

    return:
    - merged test dataframe
    - the TransactionID column of the merged dataframe
    """
    transactions = reduce_mem_usage(pd.read_csv(test_transaction_data_path))

    identities = reduce_mem_usage(pd.read_csv(test_identity_data_path))
    identities = identities.rename(columns=lambda name: name.replace('-', '_'))

    merged = transactions.merge(identities, how='outer')
    print(f'number of rows in test data: {len(merged)}')

    # the ids are returned separately because they are needed for the output file
    return merged, merged['TransactionID']

## define function to disregard browser versions

In [8]:
def ignore_browser_version(df: pd.DataFrame, verbose: bool=True):
    """
    Collapse the id_31 column to bare browser names

    Every string value is replaced by the first entry of the known-browser
    list that occurs in it as a substring, or by 'other' when none does.
    Non-string values (e.g. NaN) are kept unchanged. The column is
    overwritten on the given dataframe.

    params:
    - df (pd.DataFrame): has id_31 as one of its columns
    - verbose (bool): prints information if True

    return: dataframe, after browser versions have been ignored
    """
    # order matters: the first substring match wins
    browser_list = [
        'aol',
        'chrome',
        'chromium',
        'comodo',
        'cyberfox',
        'edge',
        'firefox',
        'icedragon',
        'ie',
        'iron',
        'maxthon',
        'opera',
        'palemoon',
        'puffin',
        'safari',
        'samsung',
        'seamonkey',
        'silk',
        'waterfox',
    ]

    def strip_version(browser):
        if not isinstance(browser, str):
            return browser  # nan remains as nan

        for known_browser in browser_list:
            if known_browser in browser:
                return known_browser

        return 'other'

    # a single map() replaces the row-by-row loop over Series.iteritems(),
    # which no longer exists in pandas 2.x
    df['id_31'] = df['id_31'].map(strip_version)

    if verbose:
        print('browsers:', df.id_31.unique())

    return df

## define function to generate aggregations

- [reference](https://www.kaggle.com/artgor/eda-and-models#Feature-engineering)

In [9]:
def generate_aggregations(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add group-relative ratio features and split e-mail domains into parts

    For each (value column, grouping column) pair a new column
    `<value>_to_<mean|std>_<group>` is added, holding the value divided by
    the mean / standard deviation of that value within the row's group.
    P_emaildomain and R_emaildomain are each split on '.' into exactly
    three new columns `<column>_1` .. `<column>_3`; missing parts are
    empty (None/NaN) and anything after the second '.' stays in part three.

    The new columns are added to the given dataframe.

    params:
    - df (pd.DataFrame): dataframe to generate aggregations on

    return:
    - dataframe with aggregations
    """
    # (value column, grouping columns); iteration order fixes the column order
    ratio_specs = (
        ('TransactionAmt', ('card1', 'card4')),
        ('id_02', ('card1', 'card4')),
        ('D15', ('card1', 'card4')),
        ('D15', ('addr1', 'addr2')),
    )

    for value_column, group_columns in ratio_specs:
        for statistic in ('mean', 'std'):
            for group_column in group_columns:
                group_statistic = df.groupby([group_column])[value_column].transform(statistic)
                df[f'{value_column}_to_{statistic}_{group_column}'] = df[value_column] / group_statistic

    for email_column in ('P_emaildomain', 'R_emaildomain'):
        # n=2 caps the split at three parts and reindex pads to three, so the
        # assignment works no matter how many dots the domains in df contain
        parts = (
            df[email_column]
            .str.split('.', n=2, expand=True)
            .reindex(columns=range(3))
        )

        for position in range(3):
            df[f'{email_column}_{position + 1}'] = parts[position]

    return df

## define function for preprocessing data

In [10]:
def preprocess(df: pd.DataFrame, verbose: bool=True) -> pd.DataFrame:
    """
    Run the full preprocessing pipeline on a raw (merged) dataframe

    Steps: drop the columns in `useless_features`, collapse browser
    versions, add aggregation features, encode object columns, fill
    missing values of the remaining columns with the column median and
    finally downcast dtypes.

    Object columns with at most 30 distinct values become pandas
    categoricals; the others are cast to str and label-encoded.

    NOTE(review): the label encoder is fit on whatever dataframe is passed
    in, so codes produced for the training data and for the test data are
    not guaranteed to agree - confirm this is acceptable.

    params:
    - df (pd.DataFrame): dataframe that contains data for preprocessing
    - verbose (bool): prints information if True

    return:
    - processed dataframe
    """
    df = df.drop(useless_features, axis=1)

    # forward `verbose` so that verbose=False really silences the helpers
    df = ignore_browser_version(df, verbose=verbose)
    df = generate_aggregations(df)

    num_categories_cutoff = 30
    le = LabelEncoder()

    for column in df.columns:
        if df[column].dtype == 'object':
            if df[column].nunique() <= num_categories_cutoff:
                df[column] = df[column].astype('category')
            else:
                # casting to str turns missing values into a label of their own
                df[column] = df[column].astype(str)
                df[column] = le.fit_transform(df[column])
        else:
            df[column] = df[column].fillna(df[column].median())

    df = reduce_mem_usage(df, verbose=verbose)
    return df

## define function to train LightGBM

- using [reference notebook parameters](https://www.kaggle.com/nroman/lgb-single-model-lb-0-9419)

In [11]:
def train_lgbm(df: pd.DataFrame) -> lgb.Booster:
    """
    Train one LightGBM booster on a random 90/10 train/validation split

    The split is not seeded, so every call draws a different validation
    set; the LightGBM seeds in `params` are fixed.

    NOTE(review): `verbose_eval` and `early_stopping_rounds` are keyword
    arguments of the older `lgb.train` API; newer LightGBM releases expect
    callbacks instead - confirm against the installed version.

    params:
    - df (pd.DataFrame): data to be used to train lgbm, must contain the
      target column isFraud

    return:
    - trained lightgbm classifier
    """
    # use the argument, not the notebook-global `dataframe`
    is_fraud_data = df['isFraud']
    features_dataframe = df.drop('isFraud', axis=1)

    train_features, val_features, train_target, val_target = train_test_split(
        features_dataframe,
        is_fraud_data,
        test_size=0.1,
    )

    # drop intermediate references as early as possible to keep peak memory down
    del features_dataframe
    del is_fraud_data

    train_data = lgb.Dataset(train_features, train_target)
    val_data = lgb.Dataset(val_features, val_target)

    del train_features
    del train_target
    del val_features
    del val_target

    # NOTE(review): LightGBM documents bagging_fraction as taking effect only
    # together with bagging_freq > 0 - confirm whether bagging is intended
    params = {
        'num_leaves': 491,
        'min_child_weight': 0.03454472573214212,
        'feature_fraction': 0.3797454081646243,
        'bagging_fraction': 0.4181193142567742,
        'min_data_in_leaf': 106,
        'objective': 'binary',
        'max_depth': -1,
        'learning_rate': 0.006883242363721497,
        'boosting_type': 'gbdt',
        'bagging_seed': 11,
        'metric': 'auc',
        'verbosity': -1,
        'reg_alpha': 0.3899927210061127,
        'reg_lambda': 0.6485237330340494,
        'random_state': 47,
    }

    return lgb.train(
        params,
        train_set=train_data,
        num_boost_round=10000,
        valid_sets=[train_data, val_data],
        verbose_eval=2000,
        early_stopping_rounds=500,
    )

## load and preprocess training data

In [12]:
%%time

# Read and merge the raw training CSVs, then run the preprocessing pipeline.
# The result is kept in the notebook-global `dataframe`.
dataframe = load_training_data()
dataframe = preprocess(dataframe)

# Display the first rows as a quick sanity check.
dataframe.head()

Mem. usage decreased to 542.35 Mb (69.4% reduction)
Mem. usage decreased to 25.86 Mb (42.7% reduction)
number of rows in training data: 590540
browsers: [nan 'samsung' 'safari' 'chrome' 'edge' 'firefox' 'ie' 'other' 'opera'
 'aol' 'silk' 'waterfox' 'puffin' 'cyberfox' 'palemoon' 'maxthon' 'iron'
 'seamonkey' 'comodo' 'chromium' 'icedragon']
Mem. usage decreased to 378.75 Mb (7.6% reduction)
CPU times: user 1min 10s, sys: 36 s, total: 1min 46s
Wall time: 1min 46s


Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,D15_to_mean_addr1,D15_to_mean_addr2,D15_to_std_addr1,D15_to_std_addr2,P_emaildomain_1,P_emaildomain_2,P_emaildomain_3,R_emaildomain_1,R_emaildomain_2,R_emaildomain_3
0,0,86400,68.5,W,13926,361.0,150.0,discover,142.0,credit,...,0.0,0.0,0.0,0.0,25,,,25,,
1,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,0.0,0.0,0.0,0.0,15,com,,25,,
2,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,1.611328,1.72168,1.486328,1.522461,28,com,,25,,
3,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,0.686035,0.606445,0.57666,0.536621,43,com,,25,,
4,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,0.459473,0.458984,0.408691,0.406006,15,com,,25,,


## train base classifiers

In [13]:
%%time

num_base_classifiers = 20

for index in range(num_base_classifiers):
    print(f'training base classifier {index}')

    classifier = train_lgbm(dataframe)
    joblib.dump(classifier, model_dir_path + 'base_' + str(index) + '.joblib')  # save model

    time.sleep(120)  # let my computer cool down, or it may overheat

training base classifier 0
Training until validation scores don't improve for 500 rounds
[2000]	training's auc: 0.9995	valid_1's auc: 0.974259
[4000]	training's auc: 0.999997	valid_1's auc: 0.976622
[6000]	training's auc: 1	valid_1's auc: 0.976866
Early stopping, best iteration is:
[5950]	training's auc: 1	valid_1's auc: 0.976886
training base classifier 1
Training until validation scores don't improve for 500 rounds
[2000]	training's auc: 0.999527	valid_1's auc: 0.974274
[4000]	training's auc: 0.999998	valid_1's auc: 0.97666
Early stopping, best iteration is:
[5143]	training's auc: 1	valid_1's auc: 0.976826
training base classifier 2
Training until validation scores don't improve for 500 rounds
[2000]	training's auc: 0.999505	valid_1's auc: 0.975836
[4000]	training's auc: 0.999997	valid_1's auc: 0.977785
Early stopping, best iteration is:
[3967]	training's auc: 0.999997	valid_1's auc: 0.977801
training base classifier 3
Training until validation scores don't improve for 500 rounds
[20

## load and preprocess test data

In [14]:
%%time

dataframe, transaction_id_data = load_test_data()
dataframe = preprocess(dataframe)

dataframe.head()

Mem. usage decreased to 472.59 Mb (68.9% reduction)
Mem. usage decreased to 25.44 Mb (42.7% reduction)
number of rows in test data: 506691
browsers: [nan 'chrome' 'ie' 'safari' 'edge' 'firefox' 'samsung' 'other' 'opera'
 'palemoon']
Mem. usage decreased to 335.06 Mb (7.1% reduction)
CPU times: user 59.7 s, sys: 30.8 s, total: 1min 30s
Wall time: 1min 30s


Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,D15_to_mean_addr1,D15_to_mean_addr2,D15_to_std_addr1,D15_to_std_addr2,P_emaildomain_1,P_emaildomain_2,P_emaildomain_3,R_emaildomain_1,R_emaildomain_2,R_emaildomain_3
0,18403224,31.953125,W,10409,111.0,150.0,visa,226.0,debit,170.0,...,1.557617,1.759766,0.0,0.0,15,com,,25,,
1,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,299.0,...,3.033203,2.728516,0.0,0.0,2,com,,25,,
2,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,472.0,...,0.457031,0.41748,0.0,0.0,17,com,,25,,
3,18403310,285.0,W,10989,360.0,150.0,visa,166.0,debit,205.0,...,1.020508,1.041016,0.0,0.0,15,com,,25,,
4,18403317,67.9375,W,18018,452.0,150.0,mastercard,117.0,debit,264.0,...,0.098999,0.094666,0.0,0.0,15,com,,25,,


## do inference and get output

In [17]:
%%time

print(f'doing inference using base classifier 0')
classifier = joblib.load(model_dir_path + 'base_0.joblib')  # load model

prediction = classifier.predict(dataframe)
del classifier

for index in range(1, num_base_classifiers):
    print(f'doing inference using base classifier {index}')

    classifier = joblib.load(model_dir_path + 'base_' + str(index) + '.joblib')  # load model
    prediction += classifier.predict(dataframe)

    if (index + 1) / 7 == 0:
        time.sleep(60)  # let my computer cool down, or it may overheat

del dataframe
prediction /= num_base_classifiers

output_dataframe = pd.DataFrame({
    'TransactionID': transaction_id_data,
    'isFraud': pd.Series(prediction),
})
output_dataframe.to_csv(output_path, index=False)

output_dataframe.head()

doing inference using base classifier 0
doing inference using base classifier 1
doing inference using base classifier 2
doing inference using base classifier 3
doing inference using base classifier 4
doing inference using base classifier 5
doing inference using base classifier 6
doing inference using base classifier 7
doing inference using base classifier 8
doing inference using base classifier 9
doing inference using base classifier 10
doing inference using base classifier 11
doing inference using base classifier 12
doing inference using base classifier 13
doing inference using base classifier 14
doing inference using base classifier 15
doing inference using base classifier 16
doing inference using base classifier 17
doing inference using base classifier 18
doing inference using base classifier 19
CPU times: user 11h 9min 32s, sys: 1min 27s, total: 11h 11min
Wall time: 1h 5min 24s


Unnamed: 0,TransactionID,isFraud
0,3663549,0.000225
1,3663550,0.00019
2,3663551,0.000584
3,3663552,0.00065
4,3663553,0.000158


## AUC

- Kaggle public score: 0.943850
  - leaderboard rank: 2744 out of 6381 (~43.00%)
- Kaggle private score: 0.917212
  - leaderboard rank: 2451 out of 6381 (~38.41%)