In [1]:
# Data processing  
# -----------------------------------------------------------------------  
import pandas as pd  
import numpy as np  # NOTE(review): not referenced in the cells visible here

# Pandas options  
# -----------------------------------------------------------------------  
# None removes the column-width limit, so long cell values are shown in full
pd.options.display.max_colwidth = None

# Path configuration for custom module imports  
# -----------------------------------------------------------------------  
import sys  
sys.path.append('../')  # Adds the parent directory to the path for custom module imports  

# Ignore warnings  
# -----------------------------------------------------------------------  
# The "ignore" action with no category filter applies to every warning raised
# after this point, including deprecation warnings from the libraries below.
import warnings  
warnings.filterwarnings("ignore") 

# Encoders
# -----------------------------------------------------------------------
# NOTE(review): only OneHotEncoder and TargetEncoder are used in the cells
# visible here; OrdinalEncoder and LabelEncoder appear unused.
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from category_encoders import TargetEncoder

# Custom functions
# -----------------------------------------------------------------------
# Presumably resolved through the '../' entry appended to sys.path above, so
# this import has to stay after that line.

from src.support_preprocess import chi2_test, scale_df

In [2]:
# Load the transactions file. The first CSV column is read as the index and
# then discarded in favour of a fresh 0..n-1 index.
raw_transactions_path = '../data/output/financial_data.csv'
df = pd.read_csv(raw_transactions_path, index_col=0)
df = df.reset_index(drop=True)

In [3]:
# Preview the first five rows to check that the file was parsed as expected
df.head(5)

Unnamed: 0,customer_id,merchant_id,amount,is_fraudulent,card_type,location,purchase_category,customer_age
0,1082,2027,5758.59,0,MasterCard,City-30,Gas Station,43
1,1015,2053,1901.56,1,Visa,City-47,Online Shopping,61
2,1004,2035,1248.86,1,MasterCard,City-6,Gas Station,57
3,1095,2037,7619.05,1,Discover,City-6,Travel,59
4,1036,2083,1890.1,1,MasterCard,City-34,Retail,36


In [4]:
# The identifier columns are labels, not quantities: store them as categoricals
# so they are picked up together with the other categorical columns.
for id_column in ("merchant_id", "customer_id"):
    df[id_column] = df[id_column].astype("category")

In [5]:
# Response column of the dataset
target_variable = 'is_fraudulent'

# Names of all object- or category-typed columns, in their original column order
categorical_frame = df.select_dtypes(include=['O', 'category'])
categories = categorical_frame.columns.tolist()

In [6]:
# Run the project's chi2_test helper for every categorical column against the
# target; the value below is passed as the helper's `alpha` threshold.
significance_level = 0.05
chi2_test(df, categories, target_variable, alpha=significance_level)

We are evaluating the variable CUSTOMER_ID


is_fraudulent,0,1
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1001,44,42
1002,45,45
1003,37,44
1004,36,51
1005,47,45
...,...,...
1096,46,52
1097,45,50
1098,46,58
1099,60,56


For the category CUSTOMER_ID there are NO significant differences, p = 0.8013

--------------------------
We are evaluating the variable MERCHANT_ID


is_fraudulent,0,1
merchant_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,45,42
2002,59,52
2003,46,54
2004,49,52
2005,38,51
...,...,...
2096,59,48
2097,53,46
2098,44,52
2099,58,55


For the category MERCHANT_ID there are NO significant differences, p = 0.6912

--------------------------
We are evaluating the variable CARD_TYPE


is_fraudulent,0,1
card_type,Unnamed: 1_level_1,Unnamed: 2_level_1
American Express,1262,1232
Discover,1304,1329
MasterCard,1140,1243
Visa,1226,1264


For the category CARD_TYPE there are NO significant differences, p = 0.2861

--------------------------
We are evaluating the variable LOCATION


is_fraudulent,0,1
location,Unnamed: 1_level_1,Unnamed: 2_level_1
City-1,80,108
City-10,100,94
City-11,92,107
City-12,102,107
City-13,99,110
City-14,106,112
City-15,96,88
City-16,92,94
City-17,109,96
City-18,89,105


For the category LOCATION there are NO significant differences, p = 0.1815

--------------------------
We are evaluating the variable PURCHASE_CATEGORY


is_fraudulent,0,1
purchase_category,Unnamed: 1_level_1,Unnamed: 2_level_1
Gas Station,792,874
Groceries,796,896
Online Shopping,847,804
Restaurant,851,785
Retail,808,853
Travel,838,856


For the category PURCHASE_CATEGORY there are significant differences, p = 0.0208


is_fraudulent,0,1
purchase_category,Unnamed: 1_level_1,Unnamed: 2_level_1
Gas Station,822.0,844.0
Groceries,834.0,858.0
Online Shopping,814.0,837.0
Restaurant,807.0,829.0
Retail,819.0,842.0
Travel,835.0,859.0


--------------------------


---

### Encoding

The variable `card_type` has no inherent order and only 4 categories, so one-hot encoding (`OneHotEncoder`) is a reasonable choice.

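For the remaining categorical variables (the ID columns `customer_id` and `merchant_id`, plus `location` and `purchase_category`), we apply target encoding (`TargetEncoder`). Even though most of them do not show significant differences, we might be interested in using the average of `is_fraudulent` for each category.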

We could also remove these columns if we find they do not add value to the predictive model, although, for now, we will keep them.

In [7]:
# One-hot encode `card_type`: fit the encoder, then transform the same column
card_type_frame = df[["card_type"]]
onehot = OneHotEncoder()
onehot.fit(card_type_frame)
trans_one_hot = onehot.transform(card_type_frame)

# Densify the encoder output and label each indicator column with the feature
# name generated by the encoder.
oh_df = pd.DataFrame(
    data=trans_one_hot.toarray(),
    columns=onehot.get_feature_names_out(),
)
oh_df = oh_df.reset_index(drop=True)

In [8]:
# Target-encode everything except the response and `card_type`.
# NOTE(review): no `cols` argument is given, so which columns are actually
# encoded depends on TargetEncoder's default column selection; confirm for the
# installed category_encoders version.
target_encoder = TargetEncoder()
predictors = df.drop(columns = ["is_fraudulent", "card_type"])
response = df["is_fraudulent"]
encoded_cols_df = target_encoder.fit_transform(predictors, response)
encoded_cols_df = encoded_cols_df.reset_index(drop=True)

In [9]:
# Assemble the encoded table column-wise: response first, then the
# target-encoded predictors, then the one-hot card-type indicators.
frames_to_join = [df["is_fraudulent"], encoded_cols_df, oh_df]
df_encoded = pd.concat(frames_to_join, axis="columns")

Since the variable `is_fraudulent` is either 0 or 1, and the encodings we have used are either target encoding (based on the response variable) or one-hot encoding, we will rescale the rest of the variables to a 0-1 range. For this, we use a min-max scaler.

In [10]:
# Min-max scale every column except the response, using the project's
# scale_df helper.
feature_columns = df_encoded.columns.drop("is_fraudulent")
df_scaled, scaler = scale_df(df_encoded, feature_columns, method = "minmax")

# Put the unscaled response back in front of the scaled features.
# NOTE(review): assumes scale_df returns only the scaled columns; if it returns
# the whole frame, `is_fraudulent` would appear twice. Confirm against
# src.support_preprocess.
response_column = df_encoded["is_fraudulent"].reset_index(drop=True)
scaled_features = df_scaled.reset_index(drop=True)
df_scaled = pd.concat([response_column, scaled_features], axis = 1)

In [11]:
# Persist the preprocessed table. The DataFrame index is written as the first
# CSV column (pandas default).
preprocessed_path = '../data/output/financial_data_preprocessed.csv'
df_scaled.to_csv(preprocessed_path)