# check if normalisation improves performance

previous feature engineering:

- categorical features representation

## import libraries

In [0]:
import joblib
import os
import lightgbm as lgb
import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## change working directory

In [0]:
# move to the parent directory; the paths defined in this notebook are relative
# (presumably the notebook lives one level below the project root - confirm)
os.chdir('..')

## define paths

In [0]:
# input CSV files (relative to the current working directory)
train_transaction_data_path = 'data/train_transaction.csv'
train_identity_data_path = 'data/train_identity.csv'
test_transaction_data_path = 'data/test_transaction.csv'
test_identity_data_path = 'data/test_identity.csv'

# directories for saved models and submission files; the trailing slash
# matters because file names are appended with plain string concatenation
model_dir_path = 'models/fe/'
output_dir_path = 'output/fe/'

# create both directories up front (no error if they already exist)
os.makedirs(model_dir_path, exist_ok=True)
os.makedirs(output_dir_path, exist_ok=True)

## define utility function to reduce memory usage

In [0]:
def reduce_mem_usage(df, verbose=True):
    """
    Downcast numeric columns to the narrowest dtype that holds their range

    The dataframe is modified in place and is also returned.

    params:
    - df: dataframe to reduce the size of
    - verbose: if truthy, print the final size and the percentage saved

    return:
    - the same dataframe, with numeric columns downcast where possible

    notes:
    - range checks are strict, so a column whose min/max sits exactly on a
      dtype's limit is given the next wider dtype
    - float columns may end up as float16/float32, which can lose precision
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'float128']
    bytes_per_mb = 1024**2
    start_mem = df.memory_usage().sum() / bytes_per_mb

    for col in df.columns:
        col_type = df[col].dtypes

        # only the dtypes listed above are candidates for downcasting
        if col_type not in numerics:
            continue

        c_min, c_max = df[col].min(), df[col].max()

        if str(col_type).startswith('int'):
            limits_of = np.iinfo
            candidates = ('int8', 'int16', 'int32', 'int64')
        else:
            limits_of = np.finfo
            candidates = ('float16', 'float32', 'float64', 'float128')

        # try the candidates from narrowest to widest and stop at the first fit;
        # the numpy type is looked up by name only when it is actually tried
        for type_name in candidates:
            target = getattr(np, type_name)
            limits = limits_of(target)
            if c_min > limits.min and c_max < limits.max:
                df[col] = df[col].astype(target)
                break

    end_mem = df.memory_usage().sum() / bytes_per_mb

    if verbose:
        saved_percent = 100 * (start_mem - end_mem) / start_mem
        print(
            'Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
                end_mem, saved_percent
            )
        )

    return df

## list useless features (identified during feature selection)

In [0]:
# columns removed from both the training and the test data by the loaders;
# every name listed here must exist in the merged dataframe, because the
# loaders call DataFrame.drop with its default errors='raise'
useless_features = [
    'TransactionID',  # not really a feature
    'dist2',  # transaction features
    'C3',  # C features
    'D6', 'D7', 'D8', 'D9', 'D12', 'D13', 'D14',  # D features
    'M1',  # M features
    'id_07', 'id_08', 'id_18', 'id_21', 'id_22', 'id_23',  # id features
    'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_35',  # id features
    'V6', 'V8', 'V9', 'V10', 'V11', 'V14', 'V15', 'V16',  # V features
    'V18', 'V21', 'V22', 'V27', 'V28', 'V31', 'V32',  # V features
    'V41', 'V42', 'V46', 'V50', 'V51', 'V59', 'V65',  # V features
    'V68', 'V71', 'V72', 'V79', 'V80', 'V84', 'V85',  # V features
    'V88', 'V89', 'V92', 'V93', 'V95', 'V98', 'V101',  # V features
    'V104', 'V106', 'V107', 'V108', 'V109', 'V110',  # V features
    'V111', 'V112', 'V113', 'V114', 'V116', 'V117',  # V features
    'V118', 'V119', 'V120', 'V121', 'V122', 'V123',  # V features
    'V125', 'V138', 'V141', 'V142', 'V144', 'V146',  # V features
    'V147', 'V148', 'V151', 'V153', 'V154', 'V155',  # V features
    'V157', 'V158', 'V159', 'V161', 'V163', 'V164',  # V features
    'V166', 'V172', 'V173', 'V174', 'V175', 'V176',  # V features
    'V177', 'V178', 'V179', 'V180', 'V181', 'V182',  # V features
    'V183', 'V184', 'V185', 'V186', 'V190', 'V191',  # V features
    'V192', 'V193', 'V194', 'V195', 'V196', 'V197',  # V features
    'V198', 'V199', 'V214', 'V216', 'V220', 'V225',  # V features
    'V226', 'V227', 'V230', 'V233', 'V235', 'V236',  # V features
    'V237', 'V238', 'V239', 'V240', 'V241', 'V242',  # V features
    'V244', 'V246', 'V247', 'V248', 'V249', 'V250',  # V features
    'V252', 'V254', 'V255', 'V269', 'V276', 'V297',  # V features
    'V300', 'V302', 'V304', 'V305', 'V325', 'V327',  # V features
    'V328', 'V329', 'V330', 'V334', 'V335', 'V336',  # V features
    'V337', 'V338', 'V339',  # V features
]


## define function to load training data

- also drops useless features

In [0]:
def load_training_data() -> pd.DataFrame:
    """
    Load the training transaction and identity tables as one dataframe

    Each CSV is read and passed through reduce_mem_usage, the two tables are
    outer-merged on the columns they have in common (no explicit key is
    given), and the columns listed in useless_features are dropped.

    return:
    - merged training dataframe without the useless features
    """
    transactions = reduce_mem_usage(pd.read_csv(train_transaction_data_path))
    identities = reduce_mem_usage(pd.read_csv(train_identity_data_path))

    # outer merge keeps the rows of both tables, matched or not
    combined = transactions.merge(identities, how='outer')
    combined = combined.drop(columns=useless_features)

    row_count = len(combined)
    print(f'number of rows in training data: {row_count}')
    return combined

## define function to load test data

- also drops useless features

In [0]:
def load_test_data():
    """
    Load the test transaction and identity tables as one dataframe

    Each CSV is read and passed through reduce_mem_usage. Hyphens in the
    identity column names are replaced by underscores before the two tables
    are outer-merged on the columns they have in common. The TransactionID
    column is kept aside before the useless features are dropped.

    return:
    - merged test dataframe without the useless features
    - the TransactionID column of the merged dataframe (needed for output)
    """
    transactions = reduce_mem_usage(pd.read_csv(test_transaction_data_path))
    identities = reduce_mem_usage(pd.read_csv(test_identity_data_path))

    # e.g. 'id-01' -> 'id_01' (presumably to match the training column names)
    identities = identities.rename(
        columns=lambda column: column.replace('-', '_')
    )

    combined = transactions.merge(identities, how='outer')

    # TransactionID is itself in useless_features, so grab it before dropping
    transaction_id_data = combined['TransactionID']
    combined = combined.drop(columns=useless_features)

    row_count = len(combined)
    print(f'number of rows in test data: {row_count}')
    return combined, transaction_id_data

## define function to train LightGBM

- using [reference notebook parameters](https://www.kaggle.com/nroman/lgb-single-model-lb-0-9419)

In [0]:
def train_lgbm(df: pd.DataFrame) -> lgb.Booster:
    """
    Train a LightGBM binary classifier on a random 90/10 train/validation split

    params:
    - df (pd.DataFrame): data to be used to train lgbm; must contain the
      'isFraud' target column, every other column is used as a feature

    return:
    - trained lightgbm booster (as returned by lgb.train)

    notes:
    - the split is not seeded, so the validation AUC varies between runs
    - the validation AUC is printed before returning
    """
    # read from the `df` argument, not from a notebook-global dataframe
    is_fraud_data = df['isFraud']
    features_dataframe = df.drop('isFraud', axis=1)

    train_features, val_features, train_target, val_target = train_test_split(
        features_dataframe,
        is_fraud_data,
        test_size=0.1,
    )

    # drop local references that are no longer needed before training starts
    del features_dataframe
    del is_fraud_data

    train_data = lgb.Dataset(train_features, train_target)
    val_data = lgb.Dataset(val_features, val_target)

    del train_features
    del train_target

    params = {
        'num_leaves': 491,
        'min_child_weight': 0.03454472573214212,
        'feature_fraction': 0.3797454081646243,
        'bagging_fraction': 0.4181193142567742,
        'min_data_in_leaf': 106,
        'objective': 'binary',
        'max_depth': -1,
        'learning_rate': 0.006883242363721497,
        'boosting_type': 'gbdt',
        'bagging_seed': 11,
        'metric': 'auc',
        'verbosity': -1,
        'reg_alpha': 0.3899927210061127,
        'reg_lambda': 0.6485237330340494,
        'random_state': 47,
    }

    # NOTE: `verbose_eval` and `early_stopping_rounds` are keyword arguments of
    # lgb.train in LightGBM < 4.0; newer releases expect the
    # lgb.log_evaluation / lgb.early_stopping callbacks instead
    classifier = lgb.train(
        params,
        train_set=train_data,
        num_boost_round=10000,
        valid_sets=[train_data, val_data],
        verbose_eval=1000,
        early_stopping_rounds=500,
    )

    # val_features / val_target are kept alive for this final check
    prediction = classifier.predict(val_features)
    auc = roc_auc_score(val_target, prediction)
    print(f'AUC: {auc}')

    return classifier

## define function for performing inference

In [0]:
def inference(classifier, test_df, transaction_id_data) -> pd.DataFrame:
    """
    Predict on the test data and build the submission dataframe

    params:
    - classifier: used to perform inference on test data
    - test_df (pd.DataFrame): dataframe that contains test data
    - transaction_id_data: the TransactionID column, in the same row order
      as test_df

    return:
    - dataframe in the right format for Kaggle submission, with columns
      'TransactionID' and 'isFraud' and a fresh 0..n-1 index

    raises:
    - ValueError (from pandas) if the number of ids and predictions differ
    """
    prediction = classifier.predict(test_df)

    # pair ids and predictions by position: plain arrays avoid pandas' index
    # alignment, which would produce mismatched rows / NaNs whenever
    # transaction_id_data does not carry a default 0..n-1 index
    return pd.DataFrame({
        'TransactionID': np.asarray(transaction_id_data),
        'isFraud': np.asarray(prediction),
    })

## load training data

In [0]:
%%time

# load the merged training table (the loader already drops useless_features)
dataframe = load_training_data()
dataframe.head()

Mem. usage decreased to 542.35 Mb (69.4% reduction)
Mem. usage decreased to 25.86 Mb (42.7% reduction)
number of rows in training data: 590540
CPU times: user 54.4 s, sys: 36.9 s, total: 1min 31s
Wall time: 1min 31s


Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_30,id_31,id_32,id_33,id_34,id_36,id_37,id_38,DeviceType,DeviceInfo
0,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,
1,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
2,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,
3,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,
4,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


## no normalisation (for numerical values)

- along with label encoding for features with high cardinality

In [0]:
# object columns with at most this many distinct values become pandas
# 'category' columns; the others are label-encoded to integers
num_categories_cutoff = 100
le = LabelEncoder()  # one encoder object, re-fitted for every column it is used on

for column in dataframe.columns:
    # numeric columns are left untouched in this (no normalisation) variant
    if dataframe[column].dtype != 'object':
        continue

    if dataframe[column].nunique() <= num_categories_cutoff:
        dataframe[column]= dataframe[column].astype('category')
    else:
        # astype(str) also turns missing values into the string 'nan',
        # so they get their own integer label
        dataframe[column] = dataframe[column].astype(str)
        dataframe[column] = le.fit_transform(dataframe[column])

# the new integer label columns can be downcast as well
dataframe = reduce_mem_usage(dataframe)

Mem. usage decreased to 337.92 Mb (2.9% reduction)


## train model

In [0]:
# train on the un-normalised data and persist the booster to disk
classifier = train_lgbm(dataframe)
joblib.dump(classifier, model_dir_path + 'no_norm.joblib')  # save model

Training until validation scores don't improve for 500 rounds
[1000]	training's auc: 0.993329	valid_1's auc: 0.969582
[2000]	training's auc: 0.999289	valid_1's auc: 0.976216
[3000]	training's auc: 0.999912	valid_1's auc: 0.977733
[4000]	training's auc: 0.999991	valid_1's auc: 0.977855
Early stopping, best iteration is:
[3626]	training's auc: 0.999978	valid_1's auc: 0.977915
AUC: 0.9779150879543753


['models/fe/no_norm.joblib']

## load test data

In [0]:
%%time

# NOTE: rebinds `dataframe` to the test data; the training data is no longer
# reachable under this name after this cell
dataframe, transaction_id_data = load_test_data()
dataframe.head()

Mem. usage decreased to 472.59 Mb (68.9% reduction)
Mem. usage decreased to 25.44 Mb (42.7% reduction)
number of rows in test data: 506691
CPU times: user 44.3 s, sys: 30 s, total: 1min 14s
Wall time: 1min 14s


Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_30,id_31,id_32,id_33,id_34,id_36,id_37,id_38,DeviceType,DeviceInfo
0,18403224,31.953125,W,10409,111.0,150.0,visa,226.0,debit,170.0,...,,,,,,,,,,
1,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,299.0,...,,,,,,,,,,
2,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,472.0,...,,,,,,,,,,
3,18403310,285.0,W,10989,360.0,150.0,visa,166.0,debit,205.0,...,,,,,,,,,,
4,18403317,67.9375,W,18018,452.0,150.0,mastercard,117.0,debit,264.0,...,,,,,,,,,,


## no normalisation (for numerical values)

- along with label encoding for features with high cardinality

In [0]:
# same preprocessing as for the training data, applied to the test data;
# relies on `num_categories_cutoff` and `le` from the training cell
for column in dataframe.columns:
    if dataframe[column].dtype != 'object':
        continue

    # NOTE(review): the cardinality check uses the test data's own nunique(),
    # so a column could be treated differently here than during training
    if dataframe[column].nunique() <= num_categories_cutoff:
        dataframe[column]= dataframe[column].astype('category')
    else:
        dataframe[column] = dataframe[column].astype(str)
        # NOTE(review): fit_transform re-fits the encoder on the test values,
        # so the integer assigned to a given string is not guaranteed to be
        # the one the model saw during training
        dataframe[column] = le.fit_transform(dataframe[column])

dataframe = reduce_mem_usage(dataframe)

Mem. usage decreased to 296.22 Mb (2.9% reduction)


## load model

- if model is no longer in memory (e.g. due to restarting of notebook)

In [0]:
# reload the booster saved by the training cell
classifier = joblib.load(model_dir_path + 'no_norm.joblib')

## do inference and get output

In [0]:
%%time

# predict on the preprocessed test data and write the submission file
output_dataframe = inference(classifier, dataframe, transaction_id_data)
output_dataframe.to_csv(output_dir_path + 'no_norm.csv', index=False)

output_dataframe.head()

CPU times: user 26min 56s, sys: 3.27 s, total: 26min 59s
Wall time: 2min 40s


Unnamed: 0,TransactionID,isFraud
0,3663549,0.000623
1,3663550,0.000212
2,3663551,0.000566
3,3663552,0.001075
4,3663553,0.000203


## AUC (no normalisation)

- validation: 0.9779150879543753
- Kaggle public score: 0.937887
- Kaggle private score: 0.906541

## load training data

In [0]:
%%time

# reload the raw training table so the normalisation experiment starts from
# unprocessed data (`dataframe` currently holds the processed test data)
dataframe = load_training_data()
dataframe.head()

Mem. usage decreased to 542.35 Mb (69.4% reduction)
Mem. usage decreased to 25.86 Mb (42.7% reduction)
number of rows in training data: 590540
CPU times: user 53.3 s, sys: 35.7 s, total: 1min 28s
Wall time: 1min 29s


Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_30,id_31,id_32,id_33,id_34,id_36,id_37,id_38,DeviceType,DeviceInfo
0,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,
1,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
2,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,
3,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,
4,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


## normalisation (for numerical values)

- along with label encoding for features with high cardinality

In [0]:
# object columns with at most this many distinct values become pandas
# 'category' columns; the others are label-encoded to integers
num_categories_cutoff = 100
le = LabelEncoder()  # one encoder object, re-fitted for every column it is used on

for column in dataframe.columns:
    # the target must not be normalised
    if column == 'isFraud':
        continue
    
    if dataframe[column].dtype == 'object':
        if dataframe[column].nunique() <= num_categories_cutoff:
            dataframe[column]= dataframe[column].astype('category')
        else:
            # astype(str) also turns missing values into the string 'nan'
            dataframe[column] = dataframe[column].astype(str)
            dataframe[column] = le.fit_transform(dataframe[column])
    else:
        dataframe[column]= dataframe[column].astype(float)

        # normalisation: z-score using this dataframe's own mean and std;
        # a column with zero std would end up as NaN/inf
        temp = dataframe[column] - dataframe[column].mean()
        dataframe[column] = temp / dataframe[column].std() 

# downcasts the float columns produced above as well
dataframe = reduce_mem_usage(dataframe)

Mem. usage decreased to 295.12 Mb (73.8% reduction)


## train model

In [0]:
# train on the normalised data and persist the booster to disk
classifier = train_lgbm(dataframe)
joblib.dump(classifier, model_dir_path + 'norm.joblib')  # save model

Training until validation scores don't improve for 500 rounds
[1000]	training's auc: 0.99338	valid_1's auc: 0.966665
[2000]	training's auc: 0.999211	valid_1's auc: 0.973601
[3000]	training's auc: 0.999884	valid_1's auc: 0.975111
[4000]	training's auc: 0.999988	valid_1's auc: 0.9753
Early stopping, best iteration is:
[3906]	training's auc: 0.999985	valid_1's auc: 0.975335
AUC: 0.9753348359176897


['models/fe/norm.joblib']

## load test data

In [0]:
%%time

# NOTE: rebinds `dataframe` to the raw test data; the normalised training
# data (and its per-column mean/std) is not kept anywhere
dataframe, transaction_id_data = load_test_data()
dataframe.head()

Mem. usage decreased to 472.59 Mb (68.9% reduction)
Mem. usage decreased to 25.44 Mb (42.7% reduction)
number of rows in test data: 506691
CPU times: user 45.3 s, sys: 31.1 s, total: 1min 16s
Wall time: 1min 16s


Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_30,id_31,id_32,id_33,id_34,id_36,id_37,id_38,DeviceType,DeviceInfo
0,18403224,31.953125,W,10409,111.0,150.0,visa,226.0,debit,170.0,...,,,,,,,,,,
1,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,299.0,...,,,,,,,,,,
2,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,472.0,...,,,,,,,,,,
3,18403310,285.0,W,10989,360.0,150.0,visa,166.0,debit,205.0,...,,,,,,,,,,
4,18403317,67.9375,W,18018,452.0,150.0,mastercard,117.0,debit,264.0,...,,,,,,,,,,


## normalisation (for numerical values)

- along with label encoding for features with high cardinality

In [0]:
# same preprocessing as for the training data, applied to the test data;
# relies on `num_categories_cutoff` and `le` from the training cell
for column in dataframe.columns:
    # kept for symmetry with the training cell (the test preview above shows
    # no 'isFraud' column, so this is presumably a no-op here)
    if column == 'isFraud':
        continue
    
    if dataframe[column].dtype == 'object':
        if dataframe[column].nunique() <= num_categories_cutoff:
            dataframe[column]= dataframe[column].astype('category')
        else:
            dataframe[column] = dataframe[column].astype(str)
            # NOTE(review): fit_transform re-fits the encoder on the test
            # values, so labels need not match the ones used in training
            dataframe[column] = le.fit_transform(dataframe[column])
    else:
        dataframe[column]= dataframe[column].astype(float)

        # normalisation
        # NOTE(review): mean and std are computed from the TEST data, not
        # taken from the training data, so the same raw value maps to a
        # different normalised value than the model was trained on; this
        # confounds the comparison with the un-normalised run
        temp = dataframe[column] - dataframe[column].mean()
        dataframe[column] = temp / dataframe[column].std() 

dataframe = reduce_mem_usage(dataframe)

Mem. usage decreased to 252.73 Mb (73.8% reduction)


## load model

In [0]:
# reload the booster saved by the training cell of the normalisation run
classifier = joblib.load(model_dir_path + 'norm.joblib')

## do inference and get output

In [0]:
%%time

# predict on the normalised test data and write the submission file
output_dataframe = inference(classifier, dataframe, transaction_id_data)
output_dataframe.to_csv(output_dir_path + 'norm.csv', index=False)

output_dataframe.head()

CPU times: user 24min 45s, sys: 3.2 s, total: 24min 48s
Wall time: 2min 27s


Unnamed: 0,TransactionID,isFraud
0,3663549,0.00046
1,3663550,0.001501
2,3663551,0.000534
3,3663552,0.000672
4,3663553,0.000269


## AUC (normalisation)

- validation: 0.9753348359176897
- Kaggle public score: 0.906302
- Kaggle private score: 0.859245

## comparison

AUC|no normalisation|normalisation
-|-|-
validation|0.9779150879543753|0.9753348359176897
Kaggle public score|0.937887|0.906302
Kaggle private score|0.906541|0.859245

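conclusion: normalisation makes the model perform worse

caveat: in this notebook the test set is normalised with its own mean and standard deviation (and the label encoder is re-fitted on the test data), so the train and test features are not on the same scale. Since the validation AUC barely changes while the Kaggle scores drop sharply, this mismatch may account for much of the difference; re-using the training statistics on the test set would be needed to confirm the conclusion.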