# Encoding

In [26]:
import os
import re
import sys
import warnings

import copy
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

warnings.filterwarnings('ignore')

# define functions

In [27]:
def ordinal_encode(df, df_test, testing = False):
    """Ordinal-encode every object-dtype column of the train and test frames.

    The category list of each column is collected from the rows of both
    frames stacked together (train first, then test), so a value that occurs
    in only one of them still receives a code. Codes follow the order in
    which the values first appear in that stacked data.

    Parameters
    ----------
    df : pandas.DataFrame
        Training features. Its categorical columns are overwritten in place.
    df_test : pandas.DataFrame
        Test features. Its categorical columns are overwritten in place.
    testing : bool, default False
        When True, print the number of categories per column, the list of
        encoded columns and the full column -> categories mapping.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same ``df`` and ``df_test`` objects that were passed in, with the
        object-dtype columns replaced by their ordinal codes.
    """
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows
    # of both frames in the same way.
    temp_merge = pd.concat([df, df_test])

    categories_dict = {}
    for cat in temp_merge.columns:
        if temp_merge[cat].dtypes == 'object':
            categories_dict[cat] = list(temp_merge[cat].unique())
            if testing:
                print("Numero de categorias para variavel '{}': {} ".format(cat, len(categories_dict[cat])))

    # Column order here matches the order of the category lists handed to the encoder.
    cat_cols = list(categories_dict.keys())

    if testing:
        print()
        print(cat_cols)

    # With no object-dtype column there is nothing to encode, so the frames
    # are returned untouched instead of fitting an encoder on zero columns.
    if cat_cols:
        enc = OrdinalEncoder(categories=list(categories_dict.values()))
        trained_encoder = enc.fit(df[cat_cols])

        # transform train and test with the same fitted encoder
        df[cat_cols] = trained_encoder.transform(df[cat_cols])
        df_test[cat_cols] = trained_encoder.transform(df_test[cat_cols])

    if testing:
        print(categories_dict)

    return df, df_test

def one_hot_encode(df):
    """One-hot encode a frame and tag the generated columns with a 'dummy_' prefix.

    The encoding is delegated to ``pd.get_dummies`` with ``drop_first=True``.
    Every column that did not exist in the input is renamed to
    ``'dummy_' + name`` with hyphens replaced by underscores. The column
    counts before and after encoding are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the encoded data with the renamed dummy columns.
    """
    original_columns = set(df.columns.to_list())
    print('Quantity of columns before one-hot encoding:', len(df.columns))

    encoded = pd.get_dummies(df, prefix_sep='_', drop_first=True)
    print('Quantity of columns after one-hot encoding:', len(encoded.columns))

    # Only columns created by the encoding get the 'dummy_' marker.
    renaming = {}
    for name in encoded.columns.to_list():
        if name not in original_columns:
            renaming[name] = 'dummy_' + name.replace('-', '_')

    return encoded.rename(columns=renaming)

# Define paths and capture data

In [28]:
# Relative locations of the intermediate inputs, the processed outputs and the reports.
data_root = os.path.join('..', 'data')
inputs = os.path.join(data_root, '02_intermediate')
outputs = os.path.join(data_root, '03_processed')
reports = os.path.join(data_root, '06_reporting')

# Feature frames destined for ordinal encoding, keyed by split name and indexed by 'id'.
ord_dict = {
    split: pd.read_csv(os.path.join(inputs, split + '.csv'), index_col='id')
    for split in ('X_train', 'X_test')
}

# Independent copy for the one-hot encoding, so the two encodings never share frames.
onehot_dict = copy.deepcopy(ord_dict)

# Targets, also indexed by 'id'.
y_train, y_test = (
    pd.read_csv(os.path.join(inputs, file_name), index_col='id')
    for file_name in ('y_train.csv', 'y_test.csv')
)

# Count categorical data unique values
Check both the train and test sets. Any inconsistency between them (for example, a category that appears in only one of the two sets) should be addressed.

In [29]:
# For each split, collect the distinct values of every object-dtype column
# and print how many there are.
for data in ('X_train', 'X_test'):
    categories_dict = {}
    print('\r\nchecking number of categories for {}'.format(data))
    for cat in ord_dict[data].columns:
        column = ord_dict[data][cat]
        if column.dtypes != 'object':
            continue  # only object-dtype columns are treated as categorical
        levels = column.unique()
        categories_dict[cat] = list(levels)
        print("Numero de categorias para variavel '{}': {} ".format(cat, levels.size))


checking number of categories for X_train
Numero de categorias para variavel 'productcd': 5 
Numero de categorias para variavel 'card4': 4 
Numero de categorias para variavel 'card6': 2 
Numero de categorias para variavel 'p_emaildomain': 54 
Numero de categorias para variavel 'm4': 3 

checking number of categories for X_test
Numero de categorias para variavel 'productcd': 5 
Numero de categorias para variavel 'card4': 4 
Numero de categorias para variavel 'card6': 3 
Numero de categorias para variavel 'p_emaildomain': 45 
Numero de categorias para variavel 'm4': 3 


# Ordinal Encoding

In [30]:
# Display the training features as loaded; the ordinal encoding is applied in a later cell.
ord_dict['X_train']

Unnamed: 0_level_0,transactiondt,transactionamt,productcd,card1,card2,card3,card4,card5,card6,addr1,...,d1,d2,d3,d4,d5,d10,d11,d15,m4,if_anomaly
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3486774,13107389.0,38.056,C,9633.0,130.0,185.0,visa,138.0,debit,299.0,...,0.0,99.0,8.0,0.0,11.0,0.0,43.0,0.0,M2,1
3062695,1650884.0,150.000,R,15063.0,514.0,150.0,visa,226.0,credit,194.0,...,0.0,99.0,8.0,27.0,11.0,14.0,43.0,54.0,M0,1
3273443,7048761.0,56.500,W,9006.0,555.0,143.0,mastercard,224.0,debit,502.0,...,1.0,99.0,1.0,2.0,1.0,2.0,1.0,2.0,M0,1
3384445,10011292.0,8.459,C,11201.0,103.0,185.0,visa,226.0,debit,299.0,...,0.0,99.0,8.0,0.0,11.0,0.0,43.0,0.0,M2,1
3489059,13159069.0,77.950,W,7919.0,194.0,150.0,mastercard,166.0,debit,315.0,...,0.0,99.0,8.0,168.0,0.0,0.0,168.0,0.0,M0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3522222,14086521.0,1331.000,W,18268.0,583.0,150.0,visa,226.0,credit,181.0,...,525.0,525.0,1.0,524.0,1.0,524.0,0.0,236.0,M0,1
2995416,252748.0,30.000,W,3507.0,361.0,150.0,visa,226.0,credit,204.0,...,29.0,29.0,1.0,105.0,28.0,29.0,43.0,29.0,M0,1
3336090,8616048.0,16.767,C,18250.0,545.0,185.0,visa,226.0,credit,284.0,...,0.0,99.0,8.0,27.0,11.0,1.0,43.0,54.0,M0,1
3533234,14417117.0,82.950,W,10972.0,111.0,150.0,mastercard,224.0,debit,181.0,...,436.0,436.0,14.0,42.0,42.0,421.0,628.0,628.0,M0,1


In [31]:
# Encode the categorical columns of both splits and store the results back under the same keys.
encoded_train, encoded_test = ordinal_encode(
    ord_dict['X_train'],
    ord_dict['X_test'],
    testing=False,
)
ord_dict['X_train'] = encoded_train
ord_dict['X_test'] = encoded_test

# Preview the first rows of the encoded training features.
ord_dict['X_train'].head()

Unnamed: 0_level_0,transactiondt,transactionamt,productcd,card1,card2,card3,card4,card5,card6,addr1,...,d1,d2,d3,d4,d5,d10,d11,d15,m4,if_anomaly
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3486774,13107389.0,38.056,0.0,9633.0,130.0,185.0,0.0,138.0,0.0,299.0,...,0.0,99.0,8.0,0.0,11.0,0.0,43.0,0.0,0.0,1
3062695,1650884.0,150.0,1.0,15063.0,514.0,150.0,0.0,226.0,1.0,194.0,...,0.0,99.0,8.0,27.0,11.0,14.0,43.0,54.0,1.0,1
3273443,7048761.0,56.5,2.0,9006.0,555.0,143.0,1.0,224.0,0.0,502.0,...,1.0,99.0,1.0,2.0,1.0,2.0,1.0,2.0,1.0,1
3384445,10011292.0,8.459,0.0,11201.0,103.0,185.0,0.0,226.0,0.0,299.0,...,0.0,99.0,8.0,0.0,11.0,0.0,43.0,0.0,0.0,1
3489059,13159069.0,77.95,2.0,7919.0,194.0,150.0,1.0,166.0,0.0,315.0,...,0.0,99.0,8.0,168.0,0.0,0.0,168.0,0.0,1.0,1


# One-Hot Encoding

In [32]:
# One-hot encode each split on its own; the encoded frame replaces the raw one.
for df in ('X_train', 'X_test'):
    encoded_split = one_hot_encode(onehot_dict[df])
    onehot_dict[df] = encoded_split

# Only the header is printed: the column listing itself is commented out.
print('\r\nColumns of the new database:')
# print(onehot_dict[df].columns.to_list())

Quantity of columns before one-hot encoding: 37
Quantity of columns after one-hot encoding: 95
Quantity of columns before one-hot encoding: 37
Quantity of columns after one-hot encoding: 87

Columns of the new database:


# report new data types

### data alignment
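If a category is missing from one of the sets (for example, it appears in the training set but not in the test set), the one-hot encoding will not create its column there. We need to account for that by adding the corresponding column, filled with zeros, to the set that lacks it.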

In [33]:
def fill_missing_cols(smaller, greater):
    """Add to ``smaller`` every column of ``greater`` it lacks, filled with 0.

    Missing columns are appended in the order they appear in ``greater``, so
    the resulting column order is the same on every run (iterating a set of
    names, as before, gave a hash-dependent order).

    Parameters
    ----------
    smaller : pandas.DataFrame
        Frame to complete. It is modified in place.
    greater : pandas.DataFrame
        Frame whose columns serve as the reference.

    Returns
    -------
    pandas.DataFrame
        The same ``smaller`` object, now containing all columns of ``greater``.
    """
    # Snapshot the existing names first: the frame grows while we add columns.
    existing = set(smaller.columns)
    missing_cols = [c for c in greater.columns if c not in existing]

    for c in missing_cols:
        smaller[c] = 0

    return smaller

In [34]:
# Give each split the dummy columns that exist only in the other one (filled with 0).
train_onehot = fill_missing_cols(onehot_dict['X_train'], onehot_dict['X_test'])
onehot_dict['X_train'] = train_onehot
test_onehot = fill_missing_cols(onehot_dict['X_test'], train_onehot)
onehot_dict['X_test'] = test_onehot

# Put the columns of both splits in the same positions
# (this only reorders columns, so it introduces no data leakage).
aligned_train, aligned_test = train_onehot.align(test_onehot, axis=1)
onehot_dict['X_train'], onehot_dict['X_test'] = aligned_train, aligned_test

# Save processed data

In [35]:
# Write both encodings of each feature split to the processed-data folder.
for df in ('X_train', 'X_test'):
    ordinal_path = os.path.join(outputs, df+'.csv')
    onehot_path = os.path.join(outputs, df+'_onehot.csv')
    ord_dict[df].to_csv(ordinal_path)
    onehot_dict[df].to_csv(onehot_path)

# Report the shape of each saved frame: ordinal first, then one-hot, per split.
for df in ('X_train', 'X_test'):
    for encoded_frames in (ord_dict, onehot_dict):
        print(encoded_frames[df].shape)

# Save the targets next to the features.
for target_file, target in (('y_train.csv', y_train), ('y_test.csv', y_test)):
    target.to_csv(os.path.join(outputs, target_file))

(7000, 37)
(7000, 99)
(3000, 37)
(3000, 99)


# save report over data types