## Scaling & Encoding Data

In [1]:
import pandas as pd

# Load the feature-engineered transactions produced by the previous stage.
featured_path = "../0 - Data/featured_transactions.pq"
df = pd.read_parquet(featured_path)

# Preview the first rows to confirm the load.
df.head()

Unnamed: 0,Amount,Use Chip,MCC,Card Brand,Card Type,Has Chip,Cards Issued,Credit Limit,Acct Open Date,Year PIN last Changed,...,Transaction Hour,Transaction Day,Transaction Month,Transaction Day of Week,Is Weekend,Account Age (Days),Debt to Income Ratio,Credit Utilization,Age Group,Is Retired
0,130.95,Chip Transaction,5912,Visa,Debit,1,2,24295.0,2002-09-01,2008,...,6,2,1,1,0,5602,2.137714,0.00539,46-60,False
1,130.95,Chip Transaction,5912,Visa,Debit,1,2,21968.0,2014-04-01,2014,...,6,2,1,1,0,1372,2.137714,0.005961,46-60,False
2,130.95,Chip Transaction,5912,Visa,Debit,1,2,46414.0,2003-07-01,2004,...,6,2,1,1,0,5299,2.137714,0.002821,46-60,False
3,130.95,Chip Transaction,5912,Visa,Credit,0,1,12400.0,2003-01-01,2012,...,6,2,1,1,0,5480,2.137714,0.01056,46-60,False
4,130.95,Chip Transaction,5912,Mastercard,Debit (Prepaid),1,1,28.0,2008-09-01,2009,...,6,2,1,1,0,3410,2.137714,4.676786,46-60,False


#### Drop any Leftover Columns

In [2]:
# List the columns that are still datetime- or object-typed; these cannot be
# fed to the scaler as-is and must be dropped or encoded first.
column_dtypes = df.dtypes
datetime_columns = df.select_dtypes(include=['datetime']).columns
string_columns = column_dtypes[column_dtypes == 'object'].index

for label, columns in (
    ("Datetime columns:", datetime_columns),
    ("String columns:", string_columns),
):
    print(label, columns)

Datetime columns: Index(['Acct Open Date', 'Datetime'], dtype='object')
String columns: Index(['Use Chip', 'Card Brand', 'Card Type', 'Gender'], dtype='object')


In [3]:
# Remove the raw account-opening date column.
# NOTE(review): the dtype inspection also reported 'Datetime' as a datetime
# column, and it is not dropped here -- confirm it is meant to stay in the
# output (e.g. for a later time-based split).
df = df.drop(columns=["Acct Open Date"])
df.head()

Unnamed: 0,Amount,Use Chip,MCC,Card Brand,Card Type,Has Chip,Cards Issued,Credit Limit,Year PIN last Changed,Current Age,...,Transaction Hour,Transaction Day,Transaction Month,Transaction Day of Week,Is Weekend,Account Age (Days),Debt to Income Ratio,Credit Utilization,Age Group,Is Retired
0,130.95,Chip Transaction,5912,Visa,Debit,1,2,24295.0,2008,53,...,6,2,1,1,0,5602,2.137714,0.00539,46-60,False
1,130.95,Chip Transaction,5912,Visa,Debit,1,2,21968.0,2014,53,...,6,2,1,1,0,1372,2.137714,0.005961,46-60,False
2,130.95,Chip Transaction,5912,Visa,Debit,1,2,46414.0,2004,53,...,6,2,1,1,0,5299,2.137714,0.002821,46-60,False
3,130.95,Chip Transaction,5912,Visa,Credit,0,1,12400.0,2012,53,...,6,2,1,1,0,5480,2.137714,0.01056,46-60,False
4,130.95,Chip Transaction,5912,Mastercard,Debit (Prepaid),1,1,28.0,2009,53,...,6,2,1,1,0,3410,2.137714,4.676786,46-60,False


#### One-Hot Encoding Categorical Features

In [4]:
# Categorical features to expand into indicator columns. drop_first=True
# drops the first level of each feature, so that level becomes the implicit
# baseline.
categorical_columns = ['Card Brand', 'Card Type', 'Gender', 'Age Group', 'Use Chip']

df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
df.head()

Unnamed: 0,Amount,MCC,Has Chip,Cards Issued,Credit Limit,Year PIN last Changed,Current Age,Retirement Age,Birth Year,Birth Month,...,Card Brand_Visa,Card Type_Debit,Card Type_Debit (Prepaid),Gender_Male,Age Group_26-35,Age Group_36-45,Age Group_46-60,Age Group_60+,Use Chip_Online Transaction,Use Chip_Swipe Transaction
0,130.95,5912,1,2,24295.0,2008,53,66,1966,11,...,True,True,False,False,False,False,True,False,False,False
1,130.95,5912,1,2,21968.0,2014,53,66,1966,11,...,True,True,False,False,False,False,True,False,False,False
2,130.95,5912,1,2,46414.0,2004,53,66,1966,11,...,True,True,False,False,False,False,True,False,False,False
3,130.95,5912,0,1,12400.0,2012,53,66,1966,11,...,True,False,False,False,False,False,True,False,False,False
4,130.95,5912,1,1,28.0,2009,53,66,1966,11,...,False,False,True,False,False,False,True,False,False,False


#### Check for any Accidental Infinity in Extracted Features

In [5]:
import numpy as np

# Numeric columns only; this selection is reused later when scaling.
numeric_columns = df.select_dtypes(include=['number']).columns

# A column is unsafe when either of its extremes is not a finite number.
# Checking only the maximum would miss a column that contains -inf but no
# +inf, even though the clean-up step treats both signs as unsafe.
# The negated comparisons also flag a column whose max/min is NaN.
unsafe_cols = []
for column in numeric_columns:
    maximum = df[column].max()
    minimum = df[column].min()
    if not (maximum < np.inf and minimum > -np.inf):
        unsafe_cols.append(column)

print(f"Unsafe Columns: {unsafe_cols}")

Unsafe Columns: ['Credit Utilization']


In [6]:
# Handle unsafe values: overwrite +inf and -inf in "Credit Utilization" with 1.
# NOTE(review): the replacement value 1 is hard-coded; presumably it stands
# for "fully utilized" -- confirm this is the intended imputation.
non_finite_values = [np.inf, -np.inf]
credit_utilization = df["Credit Utilization"]
df["Credit Utilization"] = credit_utilization.replace(non_finite_values, 1)

#### Scaling

In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Save the target variable (we don't want to scale this one).
target = df["Is Fraud"]
# errors="ignore" keeps this cell re-runnable: on a second run "Is Fraud" has
# already been removed from numeric_columns, and a plain drop() would raise
# KeyError.
numeric_columns = numeric_columns.drop("Is Fraud", errors="ignore")

# Scale the numeric feature columns; non-numeric columns (e.g. the boolean
# one-hot indicators) are left untouched.
# NOTE(review): the scaler is fitted on the whole frame. If a train/test split
# happens downstream, this leaks test-set statistics -- confirm, or fit on the
# training portion only.
df_without_target = df.drop(columns=["Is Fraud"])
df_without_target[numeric_columns] = scaler.fit_transform(df_without_target[numeric_columns])

# Append the target column back (it ends up as the last column).
df_scaled = df_without_target.copy()
df_scaled["Is Fraud"] = target

df_scaled.head()

Unnamed: 0,Amount,MCC,Has Chip,Cards Issued,Credit Limit,Year PIN last Changed,Current Age,Retirement Age,Birth Year,Birth Month,...,Card Type_Debit,Card Type_Debit (Prepaid),Gender_Male,Age Group_26-35,Age Group_36-45,Age Group_46-60,Age Group_60+,Use Chip_Online Transaction,Use Chip_Swipe Transaction,Is Fraud
0,1.112768,0.389974,0.337145,0.963448,0.738314,-1.215831,-0.09811,-0.185577,0.087548,1.250378,...,True,False,False,False,False,True,False,False,False,0
1,1.112768,0.389974,0.337145,0.963448,0.551106,0.399826,-0.09811,-0.185577,0.087548,1.250378,...,True,False,False,False,False,True,False,False,False,0
2,1.112768,0.389974,0.337145,0.963448,2.517794,-2.292935,-0.09811,-0.185577,0.087548,1.250378,...,True,False,False,False,False,True,False,False,False,0
3,1.112768,0.389974,-2.966079,-0.967922,-0.218642,-0.138727,-0.09811,-0.185577,0.087548,1.250378,...,False,False,False,False,False,True,False,False,False,0
4,1.112768,0.389974,0.337145,-0.967922,-1.213973,-0.946555,-0.09811,-0.185577,0.087548,1.250378,...,False,True,False,False,False,True,False,False,False,0


#### Save the Scaled Data

In [8]:
# Persist the scaled frame. Writing `df` here would save the encoded but
# unscaled data under the "scaled" file name.
df_scaled.to_parquet("../0 - Data/scaled_transactions.pq")