# Average voting with multiple LightGBM (Light Gradient Boosting Machine) models

## import libraries

In [1]:
import joblib
import multiprocessing
import os
import lightgbm as lgb
import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## change working directory

In [4]:
# Switch to the parent directory; presumably so that the relative 'data/',
# 'models/' and 'output/' paths defined below resolve -- confirm against the
# repository layout.
# NOTE: this is a relative move, so re-running the cell goes up one more level
# every time it is executed.
os.chdir('..')

## define paths

In [2]:
# Directories for the saved models and for the prediction files.
# The trailing '/' matters: later cells build file paths by plain string
# concatenation with these values.
model_dir_path = 'models/lgbm/'
output_dir_path = 'output/lgbm/'

# Create both directories up front; exist_ok=True makes this safe to re-run.
os.makedirs(model_dir_path, exist_ok=True)
os.makedirs(output_dir_path, exist_ok=True)

# Locations of the raw CSV files (relative to the current working directory).
train_transaction_data_path = 'data/train_transaction.csv'
train_identity_data_path = 'data/train_identity.csv'
test_transaction_data_path = 'data/test_transaction.csv'
test_identity_data_path = 'data/test_identity.csv'

## define utility function to reduce memory usage

In [3]:
def reduce_mem_usage(df, verbose=True):
    """
    Reduce dataframe size by downcasting numeric columns

    Every column whose dtype name is listed in `numerics` is cast to the
    smallest integer / float dtype whose range contains the column's min and
    max. Other columns (object, bool, int8, unsigned ints, ...) are left
    untouched. The dataframe is modified in place and also returned.

    params:
    - df: dataframe to reduce the size of
    - verbose: prints the resulting memory usage if True

    return:
    - dataframe of reduced size (the same object that was passed in)
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'float128']
    # candidate target dtypes, smallest first
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32, np.float64)

    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes

        # compare by dtype name so numpy never has to resolve 'float128',
        # which does not exist on every platform
        if str(col_type) not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            candidates = [(target, np.iinfo(target)) for target in int_types]
        else:
            # NOTE: float16 keeps only about 3 significant decimal digits, so
            # values are rounded by this downcast; kept to preserve the
            # existing behaviour of this function.
            candidates = [(target, np.finfo(target)) for target in float_types]

        for target, limits in candidates:
            # inclusive bounds: a column that touches a dtype's limits
            # (e.g. -128..127) still fits that dtype.
            # An all-NaN column has NaN min/max, so no candidate matches and
            # the column is left unchanged.
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(target)
                break

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # guard against a zero starting size to avoid a meaningless ratio
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            'Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, reduction
        ))

    return df

## list the features to drop (identified as useless during feature selection)

In [4]:
# Columns removed from the data before modelling, grouped by feature family.
# The concatenation yields one flat list in the order written here.
useless_features = (
    ['TransactionID']  # not really a feature
    + ['dist2']  # transaction features
    + ['C3']  # C features
    + ['D6', 'D7', 'D8', 'D9', 'D12', 'D13', 'D14']  # D features
    + ['M1']  # M features
    + [  # id features
        'id_07', 'id_08', 'id_18', 'id_21', 'id_22', 'id_23',
        'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_35',
    ]
    + [  # V features, V1 - V99
        'V6', 'V8', 'V9', 'V10', 'V11', 'V14', 'V15', 'V16', 'V18', 'V21',
        'V22', 'V27', 'V28', 'V31', 'V32', 'V41', 'V42', 'V46', 'V50', 'V51',
        'V59', 'V65', 'V68', 'V71', 'V72', 'V79', 'V80', 'V84', 'V85', 'V88',
        'V89', 'V92', 'V93', 'V95', 'V98',
    ]
    + [  # V features, V100 - V199
        'V101', 'V104', 'V106', 'V107', 'V108', 'V109', 'V110', 'V111',
        'V112', 'V113', 'V114', 'V116', 'V117', 'V118', 'V119', 'V120',
        'V121', 'V122', 'V123', 'V125', 'V138', 'V141', 'V142', 'V144',
        'V146', 'V147', 'V148', 'V151', 'V153', 'V154', 'V155', 'V157',
        'V158', 'V159', 'V161', 'V163', 'V164', 'V166', 'V172', 'V173',
        'V174', 'V175', 'V176', 'V177', 'V178', 'V179', 'V180', 'V181',
        'V182', 'V183', 'V184', 'V185', 'V186', 'V190', 'V191', 'V192',
        'V193', 'V194', 'V195', 'V196', 'V197', 'V198', 'V199',
    ]
    + [  # V features, V200 - V299
        'V214', 'V216', 'V220', 'V225', 'V226', 'V227', 'V230', 'V233',
        'V235', 'V236', 'V237', 'V238', 'V239', 'V240', 'V241', 'V242',
        'V244', 'V246', 'V247', 'V248', 'V249', 'V250', 'V252', 'V254',
        'V255', 'V269', 'V276', 'V297',
    ]
    + [  # V features, V300 and above
        'V300', 'V302', 'V304', 'V305', 'V325', 'V327', 'V328', 'V329',
        'V330', 'V334', 'V335', 'V336', 'V337', 'V338', 'V339',
    ]
)


## define function to disregard OS versions

In [5]:
def ignore_os_version(df, verbose: bool=True):
    """
    Replace every id_30 value by its operating-system family

    A value becomes the first entry of the known OS list that it contains as
    a substring (case-sensitive). Values that are not strings (e.g. NaN) or
    that match no known OS become 'other'. The id_30 column of `df` is
    overwritten in place.

    params:
    - df (DataFrame): has id_30 as one of its columns
    - verbose (bool): prints information if True

    return: dataframe, after os versions have been ignored
    """
    os_list = [
        'Android',
        'iOS',
        'Mac OS X',
        'Windows',
    ]

    def to_os_family(operating_system):
        # non-string values (missing data) fall through to 'other'
        if isinstance(operating_system, str):
            for known_os in os_list:
                if known_os in operating_system:
                    return known_os

        return 'other'

    # Series.map replaces the former row-by-row iteritems()/df.at loop:
    # Series.iteritems() no longer exists in pandas >= 2.0, and a single
    # column assignment avoids one dataframe write per row.
    df['id_30'] = df['id_30'].map(to_os_family)

    if verbose:
        print('operating systems:', df.id_30.unique())

    return df

## define function to disregard browser versions

In [6]:
def ignore_browser_version(df, verbose: bool=True):
    """
    Replace every id_31 value by its browser family

    A value becomes the first entry of the known browser list that it
    contains as a substring (case-sensitive, list order decides ties). Values
    that are not strings (e.g. NaN) or that match no known browser become
    'other'. The id_31 column of `df` is overwritten in place.

    params:
    - df (DataFrame): has id_31 as one of its columns
    - verbose (bool): prints information if True

    return: dataframe, after browser versions have been ignored
    """
    # NOTE(review): matching is a plain substring test, so short names match
    # inside unrelated words -- e.g. 'ie' matches any value containing
    # "webview". Left unchanged on purpose: altering the mapping would change
    # the feature values this function has produced so far.
    browser_list = [
        'aol',
        'chrome',
        'chromium',
        'comodo',
        'cyberfox',
        'edge',
        'firefox',
        'icedragon',
        'ie',
        'iron',
        'maxthon',
        'opera',
        'palemoon',
        'puffin',
        'safari',
        'samsung',
        'seamonkey',
        'silk',
        'waterfox',
    ]

    def to_browser_family(browser):
        # non-string values (missing data) fall through to 'other'
        if isinstance(browser, str):
            for known_browser in browser_list:
                if known_browser in browser:
                    return known_browser

        return 'other'

    # Series.map replaces the former row-by-row iteritems()/df.at loop:
    # Series.iteritems() no longer exists in pandas >= 2.0, and a single
    # column assignment avoids one dataframe write per row.
    df['id_31'] = df['id_31'].map(to_browser_family)

    if verbose:
        print('browsers:', df.id_31.unique())

    return df

## define function for preprocessing data

In [7]:
def preprocess(df, verbose: bool=True):
    """
    Does the following preprocessing steps, in this order:
    - drop useless features
    - disregard os versions
    - disregard browser versions
    - convert object columns to string columns and label-encode them
    - imputation (for the other columns, fill missing values with the median)
    - reduce memory usage again

    params:   
    - df (DataFrame): dataframe to preprocess (has columns id_30 and id_31,
      and every column listed in useless_features)
    - verbose (bool): prints information if True

    return: dataframe, preprocessing is complete
    """
    # drop() returns a new dataframe, so the caller's dataframe keeps its columns
    df = df.drop(useless_features, axis=1)
    df = ignore_os_version(df, verbose)
    df = ignore_browser_version(df, verbose)

    for column in df.columns:
        if df[column].dtype == 'object':
            # astype(str) turns missing values into the literal string 'nan',
            # which then receives its own integer code.
            # NOTE(review): the encoder is fitted on this dataframe only, so
            # the integer codes depend on the values present here -- confirm
            # they agree with the encoding the models were trained on.
            df[column] = LabelEncoder().fit_transform(df[column].astype(str))
        else:
            # quantile() defaults to q=0.5, i.e. the column median
            df[column] = df[column].fillna(df[column].quantile())

    df = reduce_mem_usage(df, verbose)

    return df

## load and preprocess test data

In [8]:
%%time

# Load the two raw test tables and shrink them right away to keep peak
# memory low.
transaction_dataframe = pd.read_csv(test_transaction_data_path)
transaction_dataframe = reduce_mem_usage(transaction_dataframe)

identity_dataframe = pd.read_csv(test_identity_data_path)
identity_dataframe = reduce_mem_usage(identity_dataframe)
# Replace '-' by '_' in the identity column names; the rest of this notebook
# refers to these columns with underscores (id_30, id_31, ...).
identity_dataframe = identity_dataframe.rename(
    columns=lambda column: column.replace('-', '_')
)

# Left join: keep exactly the rows of the transaction table and attach
# identity data where it exists. An outer join would also add rows for
# identity entries that have no transaction, which do not belong in the output.
# The join key is inferred from the column names the two tables share.
dataframe = transaction_dataframe.merge(identity_dataframe, how='left')
transaction_id_data = dataframe['TransactionID']  # need it for output

# the merged frame is all that is needed from here on
del transaction_dataframe
del identity_dataframe

print(f'number of rows in test data: {len(dataframe)}')
dataframe = preprocess(dataframe)

dataframe.head()

Mem. usage decreased to 472.59 Mb (68.9% reduction)
Mem. usage decreased to 25.44 Mb (42.7% reduction)
number of rows in test data: 506691
operating systems: ['other' 'Android' 'iOS' 'Windows' 'Mac OS X']
browsers: ['other' 'chrome' 'ie' 'safari' 'edge' 'firefox' 'samsung' 'opera'
 'palemoon']
Mem. usage decreased to 315.73 Mb (21.6% reduction)
CPU times: user 1min 3s, sys: 31 s, total: 1min 34s
Wall time: 1min 34s


Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_30,id_31,id_32,id_33,id_34,id_36,id_37,id_38,DeviceType,DeviceInfo
0,18403224,31.953125,4,10409,111.0,150.0,4,226.0,2,170.0,...,4,5,24.0,390,2,2,2,2,2,2184
1,18403263,49.0,4,4272,111.0,150.0,4,226.0,2,299.0,...,4,5,24.0,390,2,2,2,2,2,2184
2,18403310,171.0,4,4476,574.0,150.0,4,226.0,2,472.0,...,4,5,24.0,390,2,2,2,2,2,2184
3,18403310,285.0,4,10989,360.0,150.0,4,166.0,2,205.0,...,4,5,24.0,390,2,2,2,2,2,2184
4,18403317,67.9375,4,18018,452.0,150.0,2,117.0,2,264.0,...,4,5,24.0,390,2,2,2,2,2,2184


## define number of base classifiers

In [9]:
# number of saved models whose predictions are averaged
num_base_classifiers = 20
# added to every model index when building the 'lgbm_<index>' name to load
offset = 0

## load models, do inference and get output

In [10]:
%%time

# Average the predictions of models lgbm_<offset> .. lgbm_<offset + n - 1>.
if num_base_classifiers < 1:
    raise ValueError('num_base_classifiers must be at least 1')

first_index = offset
last_index = offset + num_base_classifiers - 1
prediction = None

for index in range(first_index, last_index + 1):
    classifier_name = 'lgbm_' + str(index)
    print(f'doing inference using {classifier_name}')

    # joblib.load unpickles the file: only load model files you trust
    classifier = joblib.load(os.path.join(model_dir_path, classifier_name + '.joblib'))
    # force float so the running sum and the final division behave the same
    # whatever dtype predict() returns
    base_prediction = np.asarray(classifier.predict(dataframe), dtype=np.float64)

    if prediction is None:
        prediction = base_prediction
    else:
        prediction += base_prediction

prediction /= num_base_classifiers  # equal weightage given to each base classifier

# free the last model and the feature matrix before building the output
del classifier
del dataframe

# pair ids and predictions by position rather than by index label
output_dataframe = pd.DataFrame({
    'TransactionID': transaction_id_data.to_numpy(),
    'isFraud': prediction,
})

# name the file after the models actually used
# ('avg_vote_lgbm_0_to_19.csv' for offset 0 and 20 classifiers)
output_file_name = f'avg_vote_lgbm_{first_index}_to_{last_index}.csv'
output_dataframe.to_csv(os.path.join(output_dir_path, output_file_name), index=False)
output_dataframe.head()

doing inference using lgbm_0
doing inference using lgbm_1
doing inference using lgbm_2
doing inference using lgbm_3
doing inference using lgbm_4
doing inference using lgbm_5
doing inference using lgbm_6
doing inference using lgbm_7
doing inference using lgbm_8
doing inference using lgbm_9
doing inference using lgbm_10
doing inference using lgbm_11
doing inference using lgbm_12
doing inference using lgbm_13
doing inference using lgbm_14
doing inference using lgbm_15
doing inference using lgbm_16
doing inference using lgbm_17
doing inference using lgbm_18
doing inference using lgbm_19
CPU times: user 5h 55min 7s, sys: 34.6 s, total: 5h 55min 41s
Wall time: 33min 37s


Unnamed: 0,TransactionID,isFraud
0,3663549,0.006293
1,3663550,0.006006
2,3663551,0.008165
3,3663552,0.006538
4,3663553,0.006053


## Kaggle result

- public score: 0.941296
- public ranking: 3173 out of 6381 (~49.72%)
- private score: 0.916027
- private ranking: 2558 out of 6381 (~40.08%)