## Scaling & Encoding Data (Strategy 1)

- Encoding MCCs each as own column

In [1]:
import pandas as pd

df = pd.read_parquet("../0 - Data/3 - featured/ft_strategy_1.pq")
df.head()

Unnamed: 0,Amount,Use Chip,MCC,Card Type,Has Chip,Cards Issued,Credit Limit,Current Age,Retirement Age,Gender,...,AmountToCreditLimitRatio,IncomeToSpendingRatioZip,IncomeToSpendingRatioPerson,DebtToIncomeRatio,CardUsageRatio,YearsToRetirement,Account Age (Days),Age Group,Is Retired,Bad PIN Error
6780,123.97,Swipe Transaction,5541,Debit,1,2,24295.0,53,66,Female,...,0.005103,236.170041,481.535855,2.137714,2.5,13,5601,46-60,False,False
6781,123.97,Swipe Transaction,5541,Debit,1,2,21968.0,53,66,Female,...,0.005643,236.170041,481.535855,2.137714,2.5,13,1371,46-60,False,False
6782,123.97,Swipe Transaction,5541,Debit,1,2,46414.0,53,66,Female,...,0.002671,236.170041,481.535855,2.137714,2.5,13,5298,46-60,False,False
6783,123.97,Swipe Transaction,5541,Credit,0,1,12400.0,53,66,Female,...,0.009998,236.170041,481.535855,2.137714,5.0,13,5479,46-60,False,False
6784,123.97,Swipe Transaction,5541,Debit (Prepaid),1,1,28.0,53,66,Female,...,4.4275,236.170041,481.535855,2.137714,5.0,13,3409,46-60,False,False


#### Find columns that needs to be encoded

In [2]:
columns_to_encode = df.select_dtypes(include=['category', 'object']).columns

print("Columns that need encoding:", columns_to_encode)

Columns that need encoding: Index(['Use Chip', 'Card Type', 'Gender', 'Age Group'], dtype='object')


#### One-Hot Encoding Categorical Features

In [3]:
df = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)
df.head(2)

Unnamed: 0,Amount,MCC,Has Chip,Cards Issued,Credit Limit,Current Age,Retirement Age,Latitude,Longitude,Per Capita Income - Zipcode,...,Bad PIN Error,Use Chip_Online Transaction,Use Chip_Swipe Transaction,Card Type_Debit,Card Type_Debit (Prepaid),Gender_Male,Age Group_26-35,Age Group_36-45,Age Group_46-60,Age Group_60+
6780,123.97,5541,1,2,24295.0,53,66,34.15,-117.76,29278.0,...,False,False,True,True,False,False,False,False,True,False
6781,123.97,5541,1,2,21968.0,53,66,34.15,-117.76,29278.0,...,False,False,True,True,False,False,False,False,True,False


#### One Hot Encode MCC

In [4]:
df = pd.get_dummies(df, columns=["MCC"], drop_first=True)
df.head(2)

Unnamed: 0,Amount,Has Chip,Cards Issued,Credit Limit,Current Age,Retirement Age,Latitude,Longitude,Per Capita Income - Zipcode,Yearly Income - Person,...,MCC_8011,MCC_8021,MCC_8041,MCC_8043,MCC_8049,MCC_8062,MCC_8099,MCC_8111,MCC_8931,MCC_9402
6780,123.97,1,2,24295.0,53,66,34.15,-117.76,29278.0,59696.0,...,False,False,False,False,False,False,False,False,False,False
6781,123.97,1,2,21968.0,53,66,34.15,-117.76,29278.0,59696.0,...,False,False,False,False,False,False,False,False,False,False


#### Check for any Accidental Infinity in Extracted Features

In [5]:
import numpy as np

numeric_columns = df.select_dtypes(include=['number']).columns

unsafe_cols = []
for column in numeric_columns:
    maximum = df[column].max()
    if not maximum < np.inf: unsafe_cols.append(column)

print(f"Unsafe Columns: {unsafe_cols}")

Unsafe Columns: []


In [6]:
# Handle unsafe values if any

#### Scale

In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Save the target variable (We dont want to scale this one)
target = df["Is Fraud"]
numeric_columns = numeric_columns.drop("Is Fraud")

# Scale the data
df_without_target = df.drop(["Is Fraud"], axis=1)
df_without_target[numeric_columns] = scaler.fit_transform(df_without_target[numeric_columns])

# Append the target column back
df_scaled = df_without_target.copy()
df_scaled["Is Fraud"] = target

df_scaled.head()

MemoryError: Unable to allocate 2.60 GiB for an array with shape (26, 13412376) and data type float64

#### Append Non Scaling Values

In [8]:
# Append the target and datetime columns back
df_scaled["Is Fraud"] = target.values

df_scaled.head()

Unnamed: 0,Amount,Has Chip,Cards Issued,Credit Limit,Current Age,Retirement Age,Latitude,Longitude,Per Capita Income - Zipcode,Yearly Income - Person,...,MCC_8021,MCC_8041,MCC_8043,MCC_8049,MCC_8062,MCC_8099,MCC_8111,MCC_8931,MCC_9402,Is Fraud
6780,1.025128,0.337145,0.963448,0.738314,-0.09811,-0.185577,-0.631234,-1.563208,0.447753,0.559483,...,False,False,False,False,False,False,False,False,False,0
6781,1.025128,0.337145,0.963448,0.551106,-0.09811,-0.185577,-0.631234,-1.563208,0.447753,0.559483,...,False,False,False,False,False,False,False,False,False,0
6782,1.025128,0.337145,0.963448,2.517794,-0.09811,-0.185577,-0.631234,-1.563208,0.447753,0.559483,...,False,False,False,False,False,False,False,False,False,0
6783,1.025128,-2.966079,-0.967922,-0.218642,-0.09811,-0.185577,-0.631234,-1.563208,0.447753,0.559483,...,False,False,False,False,False,False,False,False,False,0
6784,1.025128,0.337145,-0.967922,-1.213973,-0.09811,-0.185577,-0.631234,-1.563208,0.447753,0.559483,...,False,False,False,False,False,False,False,False,False,0


#### Save the Scaled Data

In [1]:
import os

clean_dir = "../0 - Data/4 - scaled"
if not os.path.exists(clean_dir):
    os.makedirs(clean_dir)

df_scaled.to_parquet("../0 - Data/4 - scaled/ft_strategy_1_scaled.pq")

NameError: name 'df_scaled' is not defined