# check which way of representing categorical features leads to the best performance

## import libraries

In [1]:
import joblib
import os
import lightgbm as lgb
import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## change working directory

In [2]:
# move one directory up so the relative 'data/', 'models/' and 'output/' paths
# used below resolve from the parent of the notebook's starting directory
os.chdir('..')

## define paths

In [3]:
# directories that receive the trained models and the submission files
model_dir_path, output_dir_path = 'models/fe/', 'output/fe/'

# create both directories up front; already-existing ones are left alone
for directory_path in (model_dir_path, output_dir_path):
    os.makedirs(directory_path, exist_ok=True)

# input CSV files, relative to the working directory
train_transaction_data_path, train_identity_data_path = (
    'data/train_transaction.csv',
    'data/train_identity.csv',
)
test_transaction_data_path, test_identity_data_path = (
    'data/test_transaction.csv',
    'data/test_identity.csv',
)

## define utility function to reduce memory usage

In [4]:
def reduce_mem_usage(df, verbose=True):
    """
    Reduce dataframe size by downcasting numeric columns to the narrowest
    integer / float type whose range contains the column's min and max

    Columns are replaced on `df` itself; non-numeric columns are left untouched.
    NOTE: range is the only criterion, so float columns cast to float16 / float32
    may lose precision (values get rounded to the narrower type).

    params:
    - df: dataframe to reduce the size of
    - verbose: if True, print the final memory usage and the percentage saved

    return:
    - dataframe of reduced size (the same object that was passed in)
    """
    numerics = {'int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'float128'}
    # candidate target types, narrowest first; the first one that fits wins
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32, np.float64)
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype

        if str(col_type) not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type).startswith('int'):
            candidates, limits = int_candidates, np.iinfo
        else:
            candidates, limits = float_candidates, np.finfo

        for candidate in candidates:
            bounds = limits(candidate)
            # inclusive bounds: a value equal to the type's min / max still fits
            # (all-NaN or infinite extremes fail every comparison -> column unchanged)
            if c_min >= bounds.min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # guard against a zero starting size to avoid a division error
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            'Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, reduction
        ))

    return df

## list down useless features (known from feature selection)

In [5]:
# columns dropped from the merged train / test data by the loader functions;
# every name listed here must exist in the merged dataframe, otherwise
# `DataFrame.drop` raises a KeyError
useless_features = [
    'TransactionID',  # not really a feature
    'dist2',  # transaction features
    'C3',  # C features
    'D6', 'D7', 'D8', 'D9', 'D12', 'D13', 'D14',  # D features
    'M1',  # M features
    'id_07', 'id_08', 'id_18', 'id_21', 'id_22', 'id_23',  # id features
    'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_35',  # id features
    'V6', 'V8', 'V9', 'V10', 'V11', 'V14', 'V15', 'V16',  # V features
    'V18', 'V21', 'V22', 'V27', 'V28', 'V31', 'V32',  # V features
    'V41', 'V42', 'V46', 'V50', 'V51', 'V59', 'V65',  # V features
    'V68', 'V71', 'V72', 'V79', 'V80', 'V84', 'V85',  # V features
    'V88', 'V89', 'V92', 'V93', 'V95', 'V98', 'V101',  # V features
    'V104', 'V106', 'V107', 'V108', 'V109', 'V110',  # V features
    'V111', 'V112', 'V113', 'V114', 'V116', 'V117',  # V features
    'V118', 'V119', 'V120', 'V121', 'V122', 'V123',  # V features
    'V125', 'V138', 'V141', 'V142', 'V144', 'V146',  # V features
    'V147', 'V148', 'V151', 'V153', 'V154', 'V155',  # V features
    'V157', 'V158', 'V159', 'V161', 'V163', 'V164',  # V features
    'V166', 'V172', 'V173', 'V174', 'V175', 'V176',  # V features
    'V177', 'V178', 'V179', 'V180', 'V181', 'V182',  # V features
    'V183', 'V184', 'V185', 'V186', 'V190', 'V191',  # V features
    'V192', 'V193', 'V194', 'V195', 'V196', 'V197',  # V features
    'V198', 'V199', 'V214', 'V216', 'V220', 'V225',  # V features
    'V226', 'V227', 'V230', 'V233', 'V235', 'V236',  # V features
    'V237', 'V238', 'V239', 'V240', 'V241', 'V242',  # V features
    'V244', 'V246', 'V247', 'V248', 'V249', 'V250',  # V features
    'V252', 'V254', 'V255', 'V269', 'V276', 'V297',  # V features
    'V300', 'V302', 'V304', 'V305', 'V325', 'V327',  # V features
    'V328', 'V329', 'V330', 'V334', 'V335', 'V336',  # V features
    'V337', 'V338', 'V339',  # V features
]


## define function to load training data

- also drops useless features

In [6]:
def load_training_data() -> pd.DataFrame:
    """
    Load the training transaction and identity CSV files, merge them and
    drop the columns listed in `useless_features`

    return:
    - merged training dataframe (memory-reduced before merging)
    """
    transactions = reduce_mem_usage(pd.read_csv(train_transaction_data_path))
    identities = reduce_mem_usage(pd.read_csv(train_identity_data_path))

    # outer merge on the columns the two frames share, then drop the useless ones
    merged = transactions.merge(identities, how='outer')
    dataframe = merged.drop(columns=useless_features)

    print(f'number of rows in training data: {len(dataframe)}')
    return dataframe

## define function to load test data

- also drops useless features

In [79]:
def load_test_data():
    """
    Load the test transaction and identity CSV files, merge them and
    drop the columns listed in `useless_features`

    return:
    - tuple of (test features dataframe, TransactionID column of the merged data)
    """
    transactions = reduce_mem_usage(pd.read_csv(test_transaction_data_path))
    identities = reduce_mem_usage(pd.read_csv(test_identity_data_path))

    # replace '-' with '_' in every identity column name
    # (presumably so they match the training column names - verify against the CSV)
    identities = identities.rename(columns=lambda name: name.replace('-', '_'))

    merged = transactions.merge(identities, how='outer')
    transaction_id_data = merged['TransactionID']  # need it for output

    dataframe = merged.drop(columns=useless_features)

    print(f'number of rows in test data: {len(dataframe)}')
    return dataframe, transaction_id_data

## define function to train LightGBM

- using [reference notebook parameters](https://www.kaggle.com/nroman/lgb-single-model-lb-0-9419)

In [8]:
def train_lgbm(df: pd.DataFrame) -> lgb.Booster:
    """
    Train a LightGBM binary classifier on a random 90/10 train/validation
    split of `df` and print the validation AUC

    params:
    - df (pd.DataFrame): data to be used to train lgbm; must contain the
      'isFraud' target column, every other column is used as a feature

    return:
    - trained lightgbm booster
    """
    # use the argument rather than the notebook-level `dataframe` global,
    # so the function trains on whatever the caller passes in
    is_fraud_data = df['isFraud']
    features_dataframe = df.drop('isFraud', axis=1)

    # NOTE: the split is not seeded, so the validation rows (and the reported
    # AUC) differ from one call to the next
    train_features, val_features, train_target, val_target = train_test_split(
        features_dataframe,
        is_fraud_data,
        test_size=0.1,
    )

    # drop local references that are no longer needed to keep peak memory down
    del features_dataframe
    del is_fraud_data

    train_data = lgb.Dataset(train_features, train_target)
    val_data = lgb.Dataset(val_features, val_target)

    del train_features
    del train_target

    params = {
        'num_leaves': 491,
        'min_child_weight': 0.03454472573214212,
        'feature_fraction': 0.3797454081646243,
        'bagging_fraction': 0.4181193142567742,
        'min_data_in_leaf': 106,
        'objective': 'binary',
        'max_depth': -1,
        'learning_rate': 0.006883242363721497,
        'boosting_type': 'gbdt',
        'bagging_seed': 11,
        'metric': 'auc',
        'verbosity': -1,
        'reg_alpha': 0.3899927210061127,
        'reg_lambda': 0.6485237330340494,
        'random_state': 47,
    }

    # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword arguments
    # of older LightGBM releases; newer releases expect callbacks instead -
    # confirm against the installed version before upgrading
    classifier = lgb.train(
        params,
        train_set=train_data,
        num_boost_round=10000,
        valid_sets=[train_data, val_data],
        verbose_eval=1000,
        early_stopping_rounds=500,
    )

    # score the held-out validation rows with the trained booster
    prediction = classifier.predict(val_features)
    auc = roc_auc_score(val_target, prediction)
    print(f'AUC: {auc}')

    return classifier

## define function for performing inference

In [56]:
def inference(classifier, test_df, transaction_id_data) -> pd.DataFrame:
    """
    Predict on the test data and pair each prediction with its TransactionID

    params:
    - classifier: used to perform inference on test data (its `predict` method
      is called with `test_df`)
    - test_df (pd.DataFrame): dataframe that contains test data
    - transaction_id_data: the TransactionID column, in the same row order
      as `test_df`

    return:
    - dataframe in the right format for Kaggle submission
      (columns 'TransactionID' and 'isFraud')
    """
    prediction = classifier.predict(test_df)

    # pair IDs and predictions by position; building the frame from plain arrays
    # avoids index-label alignment, which would produce NaNs / extra rows when
    # `transaction_id_data` does not carry a default 0..n-1 index
    return pd.DataFrame({
        'TransactionID': np.asarray(transaction_id_data),
        'isFraud': np.asarray(prediction),
    })

## load training data

In [9]:
%%time

# load the merged training data (useless features already dropped) for the
# first experiment, then preview the first rows
dataframe = load_training_data()
dataframe.head()

Mem. usage decreased to 542.35 Mb (69.4% reduction)
Mem. usage decreased to 25.86 Mb (42.7% reduction)
number of rows in training data: 590540
CPU times: user 54 s, sys: 35.2 s, total: 1min 29s
Wall time: 1min 29s


Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_30,id_31,id_32,id_33,id_34,id_36,id_37,id_38,DeviceType,DeviceInfo
0,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,
1,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
2,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,
3,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,
4,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


## let categorical features remain as categorical features

In [10]:
# convert every object-typed column to pandas' category dtype
object_columns = [name for name in dataframe.columns if dataframe[name].dtype == 'object']

for column in object_columns:
    dataframe[column] = dataframe[column].astype('category')

## train model

In [11]:
# train on the category-typed data and persist the booster for later inference
classifier = train_lgbm(dataframe)
joblib.dump(classifier, model_dir_path + 'cat_feat.joblib')

Training until validation scores don't improve for 500 rounds
[1000]	training's auc: 0.994119	valid_1's auc: 0.967902
[2000]	training's auc: 0.999442	valid_1's auc: 0.974553
[3000]	training's auc: 0.99994	valid_1's auc: 0.975963
[4000]	training's auc: 0.999994	valid_1's auc: 0.976258
Early stopping, best iteration is:
[4480]	training's auc: 0.999998	valid_1's auc: 0.976294
AUC: 0.9762942197546863


['models/fe/cat_feat.joblib']

## load test data

In [30]:
%%time

# rebinds `dataframe` to the test features (the training data is no longer
# referenced) and keeps the TransactionID column for the submission file
dataframe, transaction_id_data = load_test_data()
dataframe.head()

Mem. usage decreased to 472.59 Mb (68.9% reduction)
Mem. usage decreased to 25.44 Mb (42.7% reduction)
number of rows in test data: 506691
CPU times: user 45.8 s, sys: 32.7 s, total: 1min 18s
Wall time: 1min 18s


Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,3663549,18403224,31.953125,W,10409,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
1,3663550,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
2,3663551,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,...,,,,,,,,,,
3,3663552,18403310,285.0,W,10989,360.0,150.0,visa,166.0,debit,...,,,,,,,,,,
4,3663553,18403317,67.9375,W,18018,452.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,


## keep categorical features as category type

In [31]:
# convert every object-typed test column to pandas' category dtype
object_columns = [name for name in dataframe.columns if dataframe[name].dtype == 'object']

for column in object_columns:
    dataframe[column] = dataframe[column].astype('category')

## load model

- if model is no longer in memory (e.g. due to restarting of notebook)

In [32]:
# restore the model written by the 'cat_feat' training cell
classifier = joblib.load(model_dir_path + 'cat_feat.joblib')

## do inference and get output

In [33]:
%%time

# predict on the test data and write the submission file without the index column
output_dataframe = inference(classifier, dataframe, transaction_id_data)
output_dataframe.to_csv(output_dir_path + 'cat_feat.csv', index=False)

output_dataframe.head()

CPU times: user 38min 49s, sys: 6.67 s, total: 38min 56s
Wall time: 3min 42s


Unnamed: 0,TransactionID,isFraud
0,3663549,0.000471
1,3663550,0.000147
2,3663551,0.000318
3,3663552,0.000589
4,3663553,9.3e-05


## AUC (for using categorical features as they are)

- validation: 0.9762942197546863
- Kaggle public score: 0.936043
- Kaggle private score: 0.909225

## load training data

In [69]:
%%time

# reload the training data from disk so the label-encoding experiment starts
# from the raw object-typed columns again
dataframe = load_training_data()
dataframe.head()

Mem. usage decreased to 542.35 Mb (69.4% reduction)
Mem. usage decreased to 25.86 Mb (42.7% reduction)
number of rows in training data: 590540
CPU times: user 53.1 s, sys: 37.9 s, total: 1min 30s
Wall time: 1min 31s


Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_30,id_31,id_32,id_33,id_34,id_36,id_37,id_38,DeviceType,DeviceInfo
0,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,
1,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
2,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,
3,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,
4,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


## use label encoding

In [70]:
# a single encoder instance is reused: it is refit for each column in turn
le = LabelEncoder()

object_columns = [name for name in dataframe.columns if dataframe[name].dtype == 'object']

for column in object_columns:
    # cast to str first so missing values become a regular label
    dataframe[column] = le.fit_transform(dataframe[column].astype(str))

dataframe = reduce_mem_usage(dataframe)

# sanity check: list the columns that still have the category dtype
category_columns = dataframe.select_dtypes(include=['category']).columns
print(list(category_columns))

Mem. usage decreased to 337.91 Mb (23.0% reduction)
[]


## train model

In [72]:
# train on the label-encoded data and persist the booster for later inference
classifier = train_lgbm(dataframe)
joblib.dump(classifier, model_dir_path + 'le.joblib')

Training until validation scores don't improve for 500 rounds
[1000]	training's auc: 0.993064	valid_1's auc: 0.965126
[2000]	training's auc: 0.999142	valid_1's auc: 0.971808
[3000]	training's auc: 0.999866	valid_1's auc: 0.973546
[4000]	training's auc: 0.999987	valid_1's auc: 0.974166
[5000]	training's auc: 0.999999	valid_1's auc: 0.97459
[6000]	training's auc: 1	valid_1's auc: 0.974814
Early stopping, best iteration is:
[5948]	training's auc: 1	valid_1's auc: 0.974817
AUC: 0.9748172564228494


['models/fe/le.joblib']

## load test data

In [73]:
%%time

# rebinds `dataframe` to the test features and keeps the TransactionID column
# for the submission file
dataframe, transaction_id_data = load_test_data()
dataframe.head()

Mem. usage decreased to 472.59 Mb (68.9% reduction)
Mem. usage decreased to 25.44 Mb (42.7% reduction)
number of rows in test data: 506691
CPU times: user 44.2 s, sys: 32.1 s, total: 1min 16s
Wall time: 1min 16s


Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,3663549,18403224,31.953125,W,10409,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
1,3663550,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
2,3663551,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,...,,,,,,,,,,
3,3663552,18403310,285.0,W,10989,360.0,150.0,visa,166.0,debit,...,,,,,,,,,,
4,3663553,18403317,67.9375,W,18018,452.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,


## use label encoding for test data

In [74]:
# label-encode every object-typed test column, reusing the notebook-level `le`
# NOTE(review): `fit_transform` refits the encoder on the *test* values, so the
# integer assigned to a given string here is not guaranteed to match the
# integer it received in the training data; to keep codes consistent the
# per-column encoders fitted on the training data would have to be kept and
# applied with `transform` - confirm and fix together with the training cell
for column in dataframe.columns:
    if dataframe[column].dtype == 'object':
        # cast to str first so missing values become a regular label
        dataframe[column] = dataframe[column].astype(str)
        dataframe[column] = le.fit_transform(dataframe[column])

dataframe = reduce_mem_usage(dataframe)
# sanity check: list the columns that still have the category dtype
print(list(dataframe.select_dtypes(include=['category']).columns))

Mem. usage decreased to 296.21 Mb (22.6% reduction)
[]


## load model

- if model is no longer in memory (e.g. due to restarting of notebook)

In [75]:
# restore the model written by the 'le' training cell and show the category
# lists stored on the model
classifier = joblib.load(model_dir_path + 'le.joblib')
print(classifier.pandas_categorical)

[]


## do inference and get output

In [76]:
%%time

# predict on the test data and write the submission file without the index column
output_dataframe = inference(classifier, dataframe, transaction_id_data)
output_dataframe.to_csv(output_dir_path + 'le.csv', index=False)

output_dataframe.head()

CPU times: user 38min 15s, sys: 5.05 s, total: 38min 20s
Wall time: 3min 40s


Unnamed: 0,TransactionID,isFraud
0,3663549,0.000332
1,3663550,0.000113
2,3663551,0.000393
3,3663552,0.000223
4,3663553,6e-05


## AUC (for using label encoding)

- validation: 0.9748172564228494
- Kaggle public score: 0.937378
- Kaggle private score: 0.905007

## load training data

In [77]:
%%time

# reload the training data from disk so the mixed-representation experiment
# starts from the raw object-typed columns again
dataframe = load_training_data()
dataframe.head()

Mem. usage decreased to 542.35 Mb (69.4% reduction)
Mem. usage decreased to 25.86 Mb (42.7% reduction)
number of rows in training data: 590540
CPU times: user 53.7 s, sys: 39.9 s, total: 1min 33s
Wall time: 1min 34s


Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_30,id_31,id_32,id_33,id_34,id_36,id_37,id_38,DeviceType,DeviceInfo
0,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,
1,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
2,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,
3,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,
4,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


## check number of distinct values for each categorical feature

In [78]:
# report how many distinct (non-missing) values each object-typed column holds
object_columns = (name for name in dataframe.columns if dataframe[name].dtype == 'object')

for column in object_columns:
    print(f'{column} - {dataframe[column].nunique()}')

ProductCD - 5
card4 - 4
card6 - 4
P_emaildomain - 59
R_emaildomain - 60
M2 - 2
M3 - 2
M4 - 3
M5 - 2
M6 - 2
M7 - 2
M8 - 2
M9 - 2
id_12 - 2
id_15 - 3
id_16 - 2
id_29 - 2
id_30 - 75
id_31 - 130
id_33 - 260
id_34 - 4
id_36 - 2
id_37 - 2
id_38 - 2
DeviceType - 2
DeviceInfo - 1786


## use label encoding only for features with high cardinality

- [reference](https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html#categorical-feature-support)

In [80]:
# object columns with at most this many distinct values stay categorical;
# the rest are label-encoded
num_categories_cutoff = 10

object_columns = [name for name in dataframe.columns if dataframe[name].dtype == 'object']

for column in object_columns:
    values = dataframe[column]

    if values.nunique() > num_categories_cutoff:
        # high cardinality: integer codes from the shared encoder (refit per column)
        dataframe[column] = le.fit_transform(values.astype(str))
    else:
        # low cardinality: keep as a pandas category column
        dataframe[column] = values.astype('category')

dataframe = reduce_mem_usage(dataframe)

category_columns = dataframe.select_dtypes(include=['category']).columns
print('categorical features:', list(category_columns))

Mem. usage decreased to 337.91 Mb (6.1% reduction)
categorical features: ['ProductCD', 'card4', 'card6', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_15', 'id_16', 'id_29', 'id_34', 'id_36', 'id_37', 'id_38', 'DeviceType']


## train model

In [81]:
# train on the mixed (category + label-encoded) data and persist the booster
classifier = train_lgbm(dataframe)
joblib.dump(classifier, model_dir_path + 'cat_and_le.joblib')

Training until validation scores don't improve for 500 rounds
[1000]	training's auc: 0.993317	valid_1's auc: 0.964702
[2000]	training's auc: 0.999219	valid_1's auc: 0.971408
[3000]	training's auc: 0.99989	valid_1's auc: 0.972992
[4000]	training's auc: 0.999988	valid_1's auc: 0.973228
[5000]	training's auc: 0.999999	valid_1's auc: 0.973216
Early stopping, best iteration is:
[4770]	training's auc: 0.999998	valid_1's auc: 0.973303
AUC: 0.9733034625308092


['models/fe/cat_and_le.joblib']

## load test data

In [82]:
%%time

# rebinds `dataframe` to the test features and keeps the TransactionID column
# for the submission file
dataframe, transaction_id_data = load_test_data()
dataframe.head()

Mem. usage decreased to 472.59 Mb (68.9% reduction)
Mem. usage decreased to 25.44 Mb (42.7% reduction)
number of rows in test data: 506691
CPU times: user 45 s, sys: 31.2 s, total: 1min 16s
Wall time: 1min 16s


Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_30,id_31,id_32,id_33,id_34,id_36,id_37,id_38,DeviceType,DeviceInfo
0,18403224,31.953125,W,10409,111.0,150.0,visa,226.0,debit,170.0,...,,,,,,,,,,
1,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,299.0,...,,,,,,,,,,
2,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,472.0,...,,,,,,,,,,
3,18403310,285.0,W,10989,360.0,150.0,visa,166.0,debit,205.0,...,,,,,,,,,,
4,18403317,67.9375,W,18018,452.0,150.0,mastercard,117.0,debit,264.0,...,,,,,,,,,,


## use label encoding only for features with high cardinality

In [83]:
# apply the mixed representation to the test data, reusing the notebook-level
# `num_categories_cutoff` and `le` defined by the training cell
# NOTE(review): cardinality is measured on the *test* values here, so a column
# could end up on a different branch than it did for the training data, and
# `fit_transform` refits the encoder on test values, so label codes are not
# guaranteed to match the training codes - confirm and fix together with the
# training cell
for column in dataframe.columns:
    if dataframe[column].dtype != 'object':
        continue

    if dataframe[column].nunique() <= num_categories_cutoff:
        # low cardinality: keep as a pandas category column
        dataframe[column]= dataframe[column].astype('category')
    else:
        # high cardinality: cast to str (missing values become a label) and encode
        dataframe[column] = dataframe[column].astype(str)
        dataframe[column] = le.fit_transform(dataframe[column])

dataframe = reduce_mem_usage(dataframe)
# list the columns that ended up with the category dtype
print('categorical features:', list(dataframe.select_dtypes(include=['category']).columns))

Mem. usage decreased to 296.21 Mb (6.0% reduction)
categorical features: ['ProductCD', 'card4', 'card6', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_15', 'id_16', 'id_29', 'id_34', 'id_36', 'id_37', 'id_38', 'DeviceType']


## load model

- if model is no longer in memory (e.g. due to restarting of notebook)

In [84]:
# restore the model written by the 'cat_and_le' training cell and show the
# category lists stored on the model
classifier = joblib.load(model_dir_path + 'cat_and_le.joblib')
print('model categorical features:', classifier.pandas_categorical)

model categorical features: [['C', 'H', 'R', 'S', 'W'], ['american express', 'discover', 'mastercard', 'visa'], ['charge card', 'credit', 'debit', 'debit or credit'], ['F', 'T'], ['F', 'T'], ['M0', 'M1', 'M2'], ['F', 'T'], ['F', 'T'], ['F', 'T'], ['F', 'T'], ['F', 'T'], ['Found', 'NotFound'], ['Found', 'New', 'Unknown'], ['Found', 'NotFound'], ['Found', 'NotFound'], ['match_status:-1', 'match_status:0', 'match_status:1', 'match_status:2'], ['F', 'T'], ['F', 'T'], ['F', 'T'], ['desktop', 'mobile']]


## do inference and get output

In [85]:
%%time

# predict on the test data and write the submission file without the index column
output_dataframe = inference(classifier, dataframe, transaction_id_data)
output_dataframe.to_csv(output_dir_path + 'cat_and_le.csv', index=False)

output_dataframe.head()

CPU times: user 35min 18s, sys: 5.84 s, total: 35min 24s
Wall time: 3min 17s


Unnamed: 0,TransactionID,isFraud
0,3663549,0.000411
1,3663550,0.000201
2,3663551,0.000322
3,3663552,0.000456
4,3663553,0.000127


## AUC (for using a mixture of representations)

- validation: 0.9733034625308092
- Kaggle public score: 0.937547
- Kaggle private score: 0.907447

## comparison

AUC|categorical|label encoding|mixture
-|-|-|-
validation|0.9762942197546863|0.9748172564228494|0.9733034625308092
Kaggle public score|0.936043|0.937378|0.937547
Kaggle private score|0.909225|0.905007|0.907447

conclusion: I think the mixture is still the best approach, but it needs a higher cutoff on the number of categories above which label encoding is used