# feature engineering for TransactionDT for LightGBM

previous feature engineering:

- categorical features representation
- check if normalisation helps
- handling nan values
- keep or ignore os and/or browser versions

## import libraries

In [1]:
import joblib
import os
import lightgbm as lgb
import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## change working directory

In [2]:
os.chdir('..')

## define paths

In [3]:
model_dir_path = 'models/fe/'
output_dir_path = 'output/fe/'

os.makedirs(model_dir_path, exist_ok=True)
os.makedirs(output_dir_path, exist_ok=True)

train_transaction_data_path = 'data/train_transaction.csv'
train_identity_data_path = 'data/train_identity.csv'
test_transaction_data_path = 'data/test_transaction.csv'
test_identity_data_path = 'data/test_identity.csv'

## define utility function to reduce memory usage

In [4]:
def reduce_mem_usage(df, verbose=True):
    """
    Reduce dataframe size

    params:
    - df: dataframe to reduce the size of

    return:
    - dataframe of reduced size
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'float128']
    start_mem = df.memory_usage().sum() / 1024**2    

    for col in df.columns:
        col_type = df[col].dtypes

        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()

            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float64).min and c_max < np.finfo(np.float64).max:
                    df[col] = df[col].astype(np.float64)
                elif c_min > np.finfo(np.float128).min and c_max < np.finfo(np.float128).max:
                    df[col] = df[col].astype(np.float128)
                    
    end_mem = df.memory_usage().sum() / 1024**2

    if verbose: 
        print(
            'Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, 100 * (start_mem - end_mem) / start_mem
        ))

    return df

## list down useless features (known from feature selection)

In [5]:
useless_features = [
    'TransactionID',  # not really a feature
    'dist2',  # transaction features
    'C3',  # C features
    'D6', 'D7', 'D8', 'D9', 'D12', 'D13', 'D14',  # D features
    'M1',  # M features
    'id_07', 'id_08', 'id_18', 'id_21', 'id_22', 'id_23',  # id features
    'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_35',  # id features
    'V6', 'V8', 'V9', 'V10', 'V11', 'V14', 'V15', 'V16',  # V features
    'V18', 'V21', 'V22', 'V27', 'V28', 'V31', 'V32',  # V features
    'V41', 'V42', 'V46', 'V50', 'V51', 'V59', 'V65',  # V features
    'V68', 'V71', 'V72', 'V79', 'V80', 'V84', 'V85',  # V features
    'V88', 'V89', 'V92', 'V93', 'V95', 'V98', 'V101',  # V features
    'V104', 'V106', 'V107', 'V108', 'V109', 'V110',  # V features
    'V111', 'V112', 'V113', 'V114', 'V116', 'V117',  # V features
    'V118', 'V119', 'V120', 'V121', 'V122', 'V123',  # V features 
    'V125', 'V138', 'V141', 'V142', 'V144', 'V146',  # V features 
    'V147', 'V148', 'V151', 'V153', 'V154', 'V155',  # V features 
    'V157', 'V158', 'V159', 'V161', 'V163', 'V164',  # V features 
    'V166', 'V172', 'V173', 'V174', 'V175', 'V176',  # V features 
    'V177', 'V178', 'V179', 'V180', 'V181', 'V182',  # V features  
    'V183', 'V184', 'V185', 'V186', 'V190', 'V191',  # V features  
    'V192', 'V193', 'V194', 'V195', 'V196', 'V197',  # V features  
    'V198', 'V199', 'V214', 'V216', 'V220', 'V225',  # V features 
    'V226', 'V227', 'V230', 'V233', 'V235', 'V236',  # V features  
    'V237', 'V238', 'V239', 'V240', 'V241', 'V242',  # V features 
    'V244', 'V246', 'V247', 'V248', 'V249', 'V250',  # V features 
    'V252', 'V254', 'V255', 'V269', 'V276', 'V297',  # V features 
    'V300', 'V302', 'V304', 'V305', 'V325', 'V327',  # V features  
    'V328', 'V329', 'V330', 'V334', 'V335', 'V336',  # V features 
    'V337', 'V338', 'V339',  # V features 
]


## define function to load training data

- also drops useless features

In [6]:
def load_training_data() -> pd.DataFrame:
    transaction_dataframe = pd.read_csv(train_transaction_data_path)
    transaction_dataframe = reduce_mem_usage(transaction_dataframe)

    identity_dataframe = pd.read_csv(train_identity_data_path)
    identity_dataframe = reduce_mem_usage(identity_dataframe)

    dataframe = transaction_dataframe.merge(identity_dataframe, how='outer')
    dataframe = dataframe.drop(useless_features, axis=1)

    print(f'number of rows in training data: {len(dataframe)}')
    return dataframe

## define function to load test data

- also drops useless features

In [7]:
def load_test_data():
    transaction_dataframe = pd.read_csv(test_transaction_data_path)
    transaction_dataframe = reduce_mem_usage(transaction_dataframe)

    identity_dataframe = pd.read_csv(test_identity_data_path)
    identity_dataframe = reduce_mem_usage(identity_dataframe)
    identity_dataframe = identity_dataframe.rename(
        columns={
            column: column.replace('-', '_')
            for column in identity_dataframe.columns
        }
    )

    dataframe = transaction_dataframe.merge(identity_dataframe, how='outer')
    transaction_id_data = dataframe['TransactionID']  # need it for output

    dataframe = dataframe.drop(useless_features, axis=1)

    print(f'number of rows in test data: {len(dataframe)}')
    return dataframe, transaction_id_data

## define function to train LightGBM

- using [reference notebook parameters](https://www.kaggle.com/nroman/lgb-single-model-lb-0-9419)

In [8]:
def train_lgbm(df: pd.DataFrame) -> lgb.Booster:
    """
    params:
    - df (pd.DataFrame): data to be used to train lgbm

    return:
    - trained lightgbm classifier
    """
    is_fraud_data = dataframe['isFraud']
    features_dataframe = dataframe.drop('isFraud', axis=1)

    train_features, val_features, train_target, val_target = train_test_split(
        features_dataframe, 
        is_fraud_data, 
        test_size=0.1,
    )

    del features_dataframe
    del is_fraud_data

    train_data = lgb.Dataset(train_features, train_target)
    val_data = lgb.Dataset(val_features, val_target)

    del train_features
    del train_target

    params = {
        'num_leaves': 491,
        'min_child_weight': 0.03454472573214212,
        'feature_fraction': 0.3797454081646243,
        'bagging_fraction': 0.4181193142567742,
        'min_data_in_leaf': 106,
        'objective': 'binary',
        'max_depth': -1,
        'learning_rate': 0.006883242363721497,
        'boosting_type': 'gbdt',
        'bagging_seed': 11,
        'metric': 'auc',
        'verbosity': -1,
        'reg_alpha': 0.3899927210061127,
        'reg_lambda': 0.6485237330340494,
        'random_state': 47,
    }

    classifier = lgb.train(
        params, 
        train_set=train_data, 
        num_boost_round=10000, 
        valid_sets=[train_data, val_data],
        verbose_eval=1000,
        early_stopping_rounds=500,
    )

    prediction = classifier.predict(val_features)
    auc = roc_auc_score(val_target, prediction)
    print(f'AUC: {auc}')

    return classifier

## define function for performing inference

In [9]:
def inference(classifier, test_df, transaction_id_data) -> pd.DataFrame:
    """
    params:
    - classifier: used to perform inference on test data
    - test_df (pd.DataFrame): dataframe that contains test data
    - transaction_id_data: the TransactionID column

    return:
    - dataframe in the right format for Kaggle submission
    """
    prediction = classifier.predict(test_df)

    return pd.DataFrame({
        'TransactionID': transaction_id_data,
        'isFraud': pd.Series(prediction),
    })

## define function to disregard browser versions

In [10]:
def ignore_browser_version(df: pd.DataFrame, verbose: bool=True):
    """
    params:
    - df (pd.DataFrame): has id_31 as one of its columns
    - verbose (bool): prints information if True

    return: dataframe, after browser versions have been ignored
    """
    browser_list = [
        'aol',
        'chrome',
        'chromium',
        'comodo',
        'cyberfox',
        'edge',
        'firefox',
        'icedragon',
        'ie',
        'iron',
        'maxthon',
        'opera',
        'palemoon',
        'puffin',
        'safari',
        'samsung',
        'seamonkey',
        'silk',
        'waterfox',
    ]

    for index, browser in df.id_31.iteritems():
        if not isinstance(browser, str):
            continue  # nan remains as nan

        new_browser = 'other'

        for known_browser in browser_list:
            if known_browser in browser:
                new_browser = known_browser
                break

        df.at[index, 'id_31'] = new_browser

    if verbose:
        print('browsers:', df.id_31.unique())

    return df

## define function for preprocessing data

- except creating day and time features

In [11]:
def preprocess(df: pd.DataFrame, verbose: bool=True) -> pd.DataFrame:
    """
    params:
    - df (pd.DataFrame): dataframe that contains data for preprocessing
    - verbose (bool): prints information if True

    return:
    - processed dataframe
    """
    df = ignore_browser_version(df)

    num_categories_cutoff = 30
    le = LabelEncoder()

    for column in df.columns:
        if df[column].dtype == 'object':
            if df[column].nunique() <= num_categories_cutoff:
                df[column]= df[column].astype('category')
            else:
                df[column] = df[column].astype(str)
                df[column] = le.fit_transform(df[column])
        else:
            df[column] = df[column].fillna(df[column].quantile().mean())

    df = reduce_mem_usage(df)
    return df

## define function to create day (of the week) feature

- [reference](https://www.kaggle.com/fchmiel/day-and-time-powerful-predictive-feature)

In [12]:
def make_day_feature(df, offset=0.58, tname='TransactionDT'):
    """
    Creates a day of the week feature, encoded as 0-6. 
    
    params:
    - df (pd.DataFrame): dataframe to manipulate
    - offset (float): offset (in days) to shift the start/end of a day (default=0.58)
    - tname (str): name of the time column in df (default='TransactionDT')

    return:
    - day column
    """
    days = df[tname] / (3600*24)        
    encoded_days = np.floor(days-1 + offset) % 7

    return encoded_days

## define function to create hour (of the day) feature

- [reference](https://www.kaggle.com/fchmiel/day-and-time-powerful-predictive-feature)

In [13]:
def make_hour_feature(df, tname='TransactionDT'):
    """
    Creates an hour of the day feature, encoded as 0-23. 
    
    params:
    - df (pd.DataFrame): dataframe to manipulate
    - tname (str): name of the time column in df (default='TransactionDT')

    return:
    - hour column
    """
    hours = df[tname] / (3600)        
    encoded_hours = np.floor(hours) % 24

    return encoded_hours

## load training data

In [14]:
%%time

dataframe = load_training_data()
dataframe.head()

Mem. usage decreased to 542.35 Mb (69.4% reduction)
Mem. usage decreased to 25.86 Mb (42.7% reduction)
number of rows in training data: 590540
CPU times: user 53.1 s, sys: 36.2 s, total: 1min 29s
Wall time: 1min 29s


Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_30,id_31,id_32,id_33,id_34,id_36,id_37,id_38,DeviceType,DeviceInfo
0,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,
1,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
2,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,
3,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,
4,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


## use TransactionDT as it is

In [15]:
dataframe = preprocess(dataframe)

browsers: [nan 'samsung' 'safari' 'chrome' 'edge' 'firefox' 'ie' 'other' 'opera'
 'aol' 'silk' 'waterfox' 'puffin' 'cyberfox' 'palemoon' 'maxthon' 'iron'
 'seamonkey' 'comodo' 'chromium' 'icedragon']
Mem. usage decreased to 357.35 Mb (4.9% reduction)


## train model

In [16]:
classifier = train_lgbm(dataframe)
joblib.dump(classifier, model_dir_path + 'transaction_dt.joblib')  # save model

Training until validation scores don't improve for 500 rounds
[1000]	training's auc: 0.992411	valid_1's auc: 0.969014
[2000]	training's auc: 0.998925	valid_1's auc: 0.976098
[3000]	training's auc: 0.999804	valid_1's auc: 0.977935
[4000]	training's auc: 0.999977	valid_1's auc: 0.978499
[5000]	training's auc: 0.999998	valid_1's auc: 0.978765
[6000]	training's auc: 1	valid_1's auc: 0.978939
Early stopping, best iteration is:
[6019]	training's auc: 1	valid_1's auc: 0.978945
AUC: 0.978945247092173


['models/fe/transaction_dt.joblib']

## load test data

In [17]:
%%time

dataframe, transaction_id_data = load_test_data()
dataframe.head()

Mem. usage decreased to 472.59 Mb (68.9% reduction)
Mem. usage decreased to 25.44 Mb (42.7% reduction)
number of rows in test data: 506691
CPU times: user 46.5 s, sys: 31.6 s, total: 1min 18s
Wall time: 1min 18s


Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_30,id_31,id_32,id_33,id_34,id_36,id_37,id_38,DeviceType,DeviceInfo
0,18403224,31.953125,W,10409,111.0,150.0,visa,226.0,debit,170.0,...,,,,,,,,,,
1,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,299.0,...,,,,,,,,,,
2,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,472.0,...,,,,,,,,,,
3,18403310,285.0,W,10989,360.0,150.0,visa,166.0,debit,205.0,...,,,,,,,,,,
4,18403317,67.9375,W,18018,452.0,150.0,mastercard,117.0,debit,264.0,...,,,,,,,,,,


## use TransactionDT as it is

In [18]:
dataframe = preprocess(dataframe)

browsers: [nan 'chrome' 'ie' 'safari' 'edge' 'firefox' 'samsung' 'other' 'opera'
 'palemoon']
Mem. usage decreased to 315.73 Mb (4.8% reduction)


## do inference and get output

In [19]:
%%time

classifier = joblib.load(model_dir_path + 'transaction_dt.joblib')  # load model

output_dataframe = inference(classifier, dataframe, transaction_id_data)
output_dataframe.to_csv(output_dir_path + 'transaction_dt.csv', index=False)

output_dataframe.head()

CPU times: user 40min 20s, sys: 5.32 s, total: 40min 25s
Wall time: 3min 53s


Unnamed: 0,TransactionID,isFraud
0,3663549,0.000311
1,3663550,6.3e-05
2,3663551,0.000205
3,3663552,0.000396
4,3663553,6.3e-05


## AUC (using TransactionDT as it is)

- validation: 0.978945247092173
- Kaggle public score: 0.940459
- Kaggle private score: 0.914248

## load training data

In [20]:
%%time

dataframe = load_training_data()
dataframe.head()

Mem. usage decreased to 542.35 Mb (69.4% reduction)
Mem. usage decreased to 25.86 Mb (42.7% reduction)
number of rows in training data: 590540
CPU times: user 53.9 s, sys: 37 s, total: 1min 30s
Wall time: 1min 31s


Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_30,id_31,id_32,id_33,id_34,id_36,id_37,id_38,DeviceType,DeviceInfo
0,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,
1,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
2,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,
3,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,
4,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


## extract day from TransactionDT

In [21]:
dataframe['day'] = make_day_feature(dataframe, offset=0.58)
dataframe = dataframe.drop('TransactionDT', axis=1)
dataframe = preprocess(dataframe)

browsers: [nan 'samsung' 'safari' 'chrome' 'edge' 'firefox' 'ie' 'other' 'opera'
 'aol' 'silk' 'waterfox' 'puffin' 'cyberfox' 'palemoon' 'maxthon' 'iron'
 'seamonkey' 'comodo' 'chromium' 'icedragon']
Mem. usage decreased to 356.22 Mb (5.8% reduction)


## train model

In [22]:
classifier = train_lgbm(dataframe)
joblib.dump(classifier, model_dir_path + 'day.joblib')  # save model

Training until validation scores don't improve for 500 rounds
[1000]	training's auc: 0.991258	valid_1's auc: 0.970076
[2000]	training's auc: 0.998263	valid_1's auc: 0.975788
[3000]	training's auc: 0.999553	valid_1's auc: 0.977459
[4000]	training's auc: 0.999907	valid_1's auc: 0.977829
[5000]	training's auc: 0.999983	valid_1's auc: 0.97789
Early stopping, best iteration is:
[4677]	training's auc: 0.999971	valid_1's auc: 0.977912
AUC: 0.9779124794073768


['models/fe/day.joblib']

## load test data

In [23]:
%%time

dataframe, transaction_id_data = load_test_data()
dataframe.head()

Mem. usage decreased to 472.59 Mb (68.9% reduction)
Mem. usage decreased to 25.44 Mb (42.7% reduction)
number of rows in test data: 506691
CPU times: user 43.7 s, sys: 29.5 s, total: 1min 13s
Wall time: 1min 13s


Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_30,id_31,id_32,id_33,id_34,id_36,id_37,id_38,DeviceType,DeviceInfo
0,18403224,31.953125,W,10409,111.0,150.0,visa,226.0,debit,170.0,...,,,,,,,,,,
1,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,299.0,...,,,,,,,,,,
2,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,472.0,...,,,,,,,,,,
3,18403310,285.0,W,10989,360.0,150.0,visa,166.0,debit,205.0,...,,,,,,,,,,
4,18403317,67.9375,W,18018,452.0,150.0,mastercard,117.0,debit,264.0,...,,,,,,,,,,


## extract day from TransactionDT

In [24]:
dataframe['day'] = make_day_feature(dataframe, offset=0.58)
dataframe = dataframe.drop('TransactionDT', axis=1)
dataframe = preprocess(dataframe)

browsers: [nan 'chrome' 'ie' 'safari' 'edge' 'firefox' 'samsung' 'other' 'opera'
 'palemoon']
Mem. usage decreased to 314.77 Mb (5.6% reduction)


## do inference and get output

In [25]:
%%time

classifier = joblib.load(model_dir_path + 'day.joblib')  # load model

output_dataframe = inference(classifier, dataframe, transaction_id_data)
output_dataframe.to_csv(output_dir_path + 'day.csv', index=False)

output_dataframe.head()

CPU times: user 32min 6s, sys: 2.96 s, total: 32min 9s
Wall time: 3min 7s


Unnamed: 0,TransactionID,isFraud
0,3663549,0.000389
1,3663550,9.6e-05
2,3663551,0.000337
3,3663552,0.000201
4,3663553,0.00025


## AUC (extract day from TransactionDT)

- validation: 0.9779124794073768
- Kaggle public score: 0.938442
- Kaggle private score: 0.910745

## load training data

In [26]:
%%time

dataframe = load_training_data()
dataframe.head()

Mem. usage decreased to 542.35 Mb (69.4% reduction)
Mem. usage decreased to 25.86 Mb (42.7% reduction)
number of rows in training data: 590540
CPU times: user 53.4 s, sys: 37.3 s, total: 1min 30s
Wall time: 1min 30s


Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_30,id_31,id_32,id_33,id_34,id_36,id_37,id_38,DeviceType,DeviceInfo
0,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,
1,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
2,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,
3,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,
4,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


## extract hour from TransactionDT

In [27]:
dataframe['hour'] = make_hour_feature(dataframe)
dataframe = dataframe.drop('TransactionDT', axis=1)
dataframe = preprocess(dataframe)

browsers: [nan 'samsung' 'safari' 'chrome' 'edge' 'firefox' 'ie' 'other' 'opera'
 'aol' 'silk' 'waterfox' 'puffin' 'cyberfox' 'palemoon' 'maxthon' 'iron'
 'seamonkey' 'comodo' 'chromium' 'icedragon']
Mem. usage decreased to 356.22 Mb (5.8% reduction)


## train model

In [28]:
classifier = train_lgbm(dataframe)
joblib.dump(classifier, model_dir_path + 'hour.joblib')  # save model

Training until validation scores don't improve for 500 rounds
[1000]	training's auc: 0.991817	valid_1's auc: 0.960774
[2000]	training's auc: 0.998488	valid_1's auc: 0.969077
[3000]	training's auc: 0.999636	valid_1's auc: 0.971422
[4000]	training's auc: 0.99993	valid_1's auc: 0.972302
[5000]	training's auc: 0.999986	valid_1's auc: 0.972671
[6000]	training's auc: 0.999998	valid_1's auc: 0.97286
[7000]	training's auc: 1	valid_1's auc: 0.972879
Early stopping, best iteration is:
[6712]	training's auc: 1	valid_1's auc: 0.972903
AUC: 0.9729030850127821


['models/fe/hour.joblib']

## load test data

In [29]:
%%time

dataframe, transaction_id_data = load_test_data()
dataframe.head()

Mem. usage decreased to 472.59 Mb (68.9% reduction)
Mem. usage decreased to 25.44 Mb (42.7% reduction)
number of rows in test data: 506691
CPU times: user 44.6 s, sys: 30.5 s, total: 1min 15s
Wall time: 1min 15s


Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_30,id_31,id_32,id_33,id_34,id_36,id_37,id_38,DeviceType,DeviceInfo
0,18403224,31.953125,W,10409,111.0,150.0,visa,226.0,debit,170.0,...,,,,,,,,,,
1,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,299.0,...,,,,,,,,,,
2,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,472.0,...,,,,,,,,,,
3,18403310,285.0,W,10989,360.0,150.0,visa,166.0,debit,205.0,...,,,,,,,,,,
4,18403317,67.9375,W,18018,452.0,150.0,mastercard,117.0,debit,264.0,...,,,,,,,,,,


## extract hour from TransactionDT

In [30]:
dataframe['hour'] = make_hour_feature(dataframe)
dataframe = dataframe.drop('TransactionDT', axis=1)
dataframe = preprocess(dataframe)

browsers: [nan 'chrome' 'ie' 'safari' 'edge' 'firefox' 'samsung' 'other' 'opera'
 'palemoon']
Mem. usage decreased to 314.77 Mb (5.6% reduction)


## do inference and get output

In [31]:
%%time

classifier = joblib.load(model_dir_path + 'hour.joblib')  # load model

output_dataframe = inference(classifier, dataframe, transaction_id_data)
output_dataframe.to_csv(output_dir_path + 'hour.csv', index=False)

output_dataframe.head()

CPU times: user 48min 52s, sys: 4.25 s, total: 48min 56s
Wall time: 4min 37s


Unnamed: 0,TransactionID,isFraud
0,3663549,0.000239
1,3663550,6.5e-05
2,3663551,0.000161
3,3663552,0.000187
4,3663553,6.8e-05


## AUC (extract hour from TransactionDT)

- validation: 0.9729030850127821
- Kaggle public score: 0.937576
- Kaggle private score: 0.911587

## load training data

In [32]:
%%time

dataframe = load_training_data()
dataframe.head()

Mem. usage decreased to 542.35 Mb (69.4% reduction)
Mem. usage decreased to 25.86 Mb (42.7% reduction)
number of rows in training data: 590540
CPU times: user 50.4 s, sys: 36.2 s, total: 1min 26s
Wall time: 1min 26s


Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_30,id_31,id_32,id_33,id_34,id_36,id_37,id_38,DeviceType,DeviceInfo
0,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,
1,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
2,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,
3,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,
4,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


## extract day and hour from TransactionDT

In [33]:
dataframe['day'] = make_day_feature(dataframe, offset=0.58)
dataframe['hour'] = make_hour_feature(dataframe)
dataframe = dataframe.drop('TransactionDT', axis=1)

dataframe = preprocess(dataframe)

browsers: [nan 'samsung' 'safari' 'chrome' 'edge' 'firefox' 'ie' 'other' 'opera'
 'aol' 'silk' 'waterfox' 'puffin' 'cyberfox' 'palemoon' 'maxthon' 'iron'
 'seamonkey' 'comodo' 'chromium' 'icedragon']
Mem. usage decreased to 357.35 Mb (6.6% reduction)


## train model

In [34]:
classifier = train_lgbm(dataframe)
joblib.dump(classifier, model_dir_path + 'day_hour.joblib')  # save model

Training until validation scores don't improve for 500 rounds
[1000]	training's auc: 0.991803	valid_1's auc: 0.969626
[2000]	training's auc: 0.998717	valid_1's auc: 0.976773
[3000]	training's auc: 0.99975	valid_1's auc: 0.978331
[4000]	training's auc: 0.999966	valid_1's auc: 0.978793
[5000]	training's auc: 0.999995	valid_1's auc: 0.978891
Early stopping, best iteration is:
[4688]	training's auc: 0.999991	valid_1's auc: 0.978911
AUC: 0.9789106212510266


['models/fe/day_hour.joblib']

## load test data

In [35]:
%%time

dataframe, transaction_id_data = load_test_data()
dataframe.head()

Mem. usage decreased to 472.59 Mb (68.9% reduction)
Mem. usage decreased to 25.44 Mb (42.7% reduction)
number of rows in test data: 506691
CPU times: user 43.5 s, sys: 29.6 s, total: 1min 13s
Wall time: 1min 13s


Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_30,id_31,id_32,id_33,id_34,id_36,id_37,id_38,DeviceType,DeviceInfo
0,18403224,31.953125,W,10409,111.0,150.0,visa,226.0,debit,170.0,...,,,,,,,,,,
1,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,299.0,...,,,,,,,,,,
2,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,472.0,...,,,,,,,,,,
3,18403310,285.0,W,10989,360.0,150.0,visa,166.0,debit,205.0,...,,,,,,,,,,
4,18403317,67.9375,W,18018,452.0,150.0,mastercard,117.0,debit,264.0,...,,,,,,,,,,


## extract day and hour from TransactionDT

In [36]:
dataframe['day'] = make_day_feature(dataframe, offset=0.58)
dataframe['hour'] = make_hour_feature(dataframe)
dataframe = dataframe.drop('TransactionDT', axis=1)

dataframe = preprocess(dataframe)

browsers: [nan 'chrome' 'ie' 'safari' 'edge' 'firefox' 'samsung' 'other' 'opera'
 'palemoon']
Mem. usage decreased to 315.73 Mb (6.4% reduction)


## do inference and get output

In [37]:
%%time

classifier = joblib.load(model_dir_path + 'day_hour.joblib')  # load model

output_dataframe = inference(classifier, dataframe, transaction_id_data)
output_dataframe.to_csv(output_dir_path + 'day_hour.csv', index=False)

output_dataframe.head()

CPU times: user 34min 26s, sys: 3.67 s, total: 34min 30s
Wall time: 3min 16s


Unnamed: 0,TransactionID,isFraud
0,3663549,0.000437
1,3663550,9.3e-05
2,3663551,0.000327
3,3663552,0.000354
4,3663553,0.000297


## AUC (extract day and hour from TransactionDT)

- validation: 0.9789106212510266
- Kaggle public score: 0.939177
- Kaggle private score: 0.912190

## comparison

AUC|TransactionDT|day|hour|day and hour
-|-|-|-|-
validation|0.978945247092173|0.9779124794073768|0.9729030850127821|0.9789106212510266
Kaggle public score|0.940459|0.938442|0.937576|0.939177
Kaggle private score|0.914248|0.910745|0.911587|0.912190

conclusion: using TransactionDT as it is, instead of extracting day and/or hour, is still the best for performance