### Add the scripts directory to the notebook's import path

In [24]:
import os, sys

# The notebook runs from the notebooks directory; show it so the paths derived below can be checked
current_dir = os.getcwd()
print(current_dir)

# The project root is the parent of the current working directory
parent_dir = os.path.dirname(current_dir)

# The helper modules live in <project root>/scripts
scripts_path = os.path.join(parent_dir, 'scripts')

# Make both the project root and the scripts directory importable.
# The scripts directory is inserted last so that it ends up ahead of the project root.
# Paths that are already present are skipped, so re-running this cell does not
# keep adding duplicate entries to sys.path.
for import_root in (parent_dir, scripts_path):
    if import_root not in sys.path:
        sys.path.insert(0, import_root)

d:\KifiyaAIM-Course\Week - 8&9\Adey-Innovations-Fraud-Detection\notebooks


### Import Statements

In [25]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [26]:
# apply seaborn's default theme to the plots drawn in this notebook
sns.set_theme()

### Load the Data

In [27]:
# locations of the raw CSV files, relative to the notebook's working directory
COUNTRY_IP_DATA = "../data/IpAddress_to_Country.csv"
FRAUD_DATA = "../data/Fraud_Data.csv"
CREDIT_DATA = "../data/creditcard.csv"

# read each CSV file into its own pandas dataframe
country_ip_mapping = pd.read_csv(filepath_or_buffer=COUNTRY_IP_DATA)
fraud_data = pd.read_csv(filepath_or_buffer=FRAUD_DATA)
credit_data = pd.read_csv(filepath_or_buffer=CREDIT_DATA)

### Preprocess data

In [28]:
from scripts.feature_engineering import FeatureEngineering

1. First, pass the data through the feature engineering pipeline. It consists of the following steps:

merge IP and fraud data -> calculate transaction velocity and frequency -> break down date features -> handle missing values

In [29]:
# run the fraud data, together with the IP-to-country mapping, through the feature engineering pipeline
# NOTE(review): the method name is spelled `feature_enginering_pipeline`; presumably this matches
# its definition in scripts/feature_engineering.py - confirm before renaming it here
data = FeatureEngineering.feature_enginering_pipeline(data=fraud_data, ip_mapping=country_ip_mapping)

2. Split the data into training and testing sets

In [30]:
from sklearn.model_selection import train_test_split
from scripts.utils import load_pickle, pickle_object

In [31]:
# the root directory that pickled objects (feature lists, targets, ...) are exported to / loaded from
EXPORT_PATH_ROOT = "../feature_store" 

First for the credit card data

In [32]:
# the label column of the credit card data
credit_card_target = "Class"

# every other column of the credit card data is used as a feature
credit_card_features = [
    column_name
    for column_name in credit_data.columns
    if column_name != credit_card_target
]

# write the feature list to a pickle file under EXPORT_PATH_ROOT
feature_save_path = os.path.join(EXPORT_PATH_ROOT, 'credit_features.pkl')
pickle_object(file_path=feature_save_path, object=credit_card_features)

# write the target name to a pickle file under EXPORT_PATH_ROOT
target_save_path = os.path.join(EXPORT_PATH_ROOT, 'credit_target.pkl')
pickle_object(file_path=target_save_path, object=credit_card_target)

In [33]:
# select the feature columns and the label column of the credit card data
credit_X = credit_data[credit_card_features]
credit_y = credit_data[credit_card_target]

# hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible
credit_train, credit_test, credit_y_train, credit_y_test = train_test_split(
    credit_X,
    credit_y,
    test_size=0.3,
    random_state=7,
)

Now for the fraud data

In [34]:
# define features: every column except the label and the user/device identifier columns
fraud_features = [column for column in data.columns if column not in ["class", "user_id", "device_id"]]

# define targets
fraud_targets = "class"

# save the fraud feature list into a pickle file
# (this must be the fraud list, not the credit card one, since the file is named fraud_features.pkl)
feature_save_path = os.path.join(EXPORT_PATH_ROOT, 'fraud_features.pkl')
pickle_object(file_path=feature_save_path, object=fraud_features)

# save the fraud target into a pickle file
# (likewise the fraud target "class", not the credit card target "Class")
target_save_path = os.path.join(EXPORT_PATH_ROOT, 'fraud_target.pkl')
pickle_object(file_path=target_save_path, object=fraud_targets)

In [35]:
# select the feature columns and the label column of the engineered fraud data
fraud_X = data[fraud_features]
fraud_y = data[fraud_targets]

# hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible
fraud_train, fraud_test, fraud_y_train, fraud_y_test = train_test_split(
    fraud_X,
    fraud_y,
    test_size=0.3,
    random_state=7,
)

3. Normalize the numerical features

In [36]:
# the root directory that the fitted numerical scalers are pickled into
ENCODERS_PATH_ROOT = "../scalers"

First, normalize the credit card numerical features

In [37]:
# normalize the numerical features of the credit card training split; the helper also
# returns the scaler it used, which is kept for the test split and for inference
# NOTE: credit_train and credit_test are rebound here, so re-running this cell without
# re-running the split cell first feeds already-normalized data back in
credit_train, credit_numerical_scaler = FeatureEngineering.normalize_numerical_features(
    data=credit_train
)

# apply that same scaler to the held-out split
credit_test = credit_numerical_scaler.transform(X=credit_test)

# save the scaler under ENCODERS_PATH_ROOT
credit_scaler_path = os.path.join(ENCODERS_PATH_ROOT, 'credit_scaler.pkl')
pickle_object(file_path=credit_scaler_path, object=credit_numerical_scaler)

Now normalize the fraud numerical features

In [38]:
# normalize the numerical features in the fraud data using training data and save the scaler to be used during inference
# the helper returns the scaled training values together with the scaler it used
fraud_train_numeric_scaled, fraud_numerical_scaler = FeatureEngineering.normalize_numerical_features(data=fraud_train)
# presumably the names of the numerical columns the scaler was fitted on - confirm against the helper
scaled_columns = fraud_numerical_scaler.get_feature_names_out()
# scale the same columns of the test split with the scaler obtained from the training split
fraud_test_numeric_scaled = fraud_numerical_scaler.transform(X=fraud_test[scaled_columns])

# replace the numerical columns with the scaled ones
# NOTE(review): fraud_train / fraud_test come straight from train_test_split; assigning columns
# on them may raise pandas' SettingWithCopyWarning - confirm, and take explicit copies if it does
fraud_train[scaled_columns] = fraud_train_numeric_scaled[scaled_columns]
fraud_test[scaled_columns] = fraud_test_numeric_scaled

# save the names of the numeric columns that were scaled into a pickle file
pickle_object(file_path=os.path.join(EXPORT_PATH_ROOT, 'scaled_numerical_features.pkl'), object=scaled_columns)

# save the scaler
pickle_object(file_path=os.path.join(ENCODERS_PATH_ROOT, 'fraud_scaler.pkl'), object=fraud_numerical_scaler)

4. Now encode the categorical features

In [39]:
from scripts.utils import use_label_encoder

The credit card data is skipped because all of its columns are already numerical, so there is nothing to encode.
Only the fraud data has categorical columns that need to be encoded.

In [40]:
# encode the categorical features of the training split; the helper also returns
# the encoders it used, indexed by categorical column
fraud_train, categorical_encoder = FeatureEngineering.encode_categorical_data(data=fraud_train)

# apply the matching training encoder to each categorical column of the testing split
for categorical_column in categorical_encoder:
    column_encoder = categorical_encoder[categorical_column]
    fraud_test[categorical_column] = use_label_encoder(
        data=fraud_test[categorical_column],
        encoder=column_encoder,
    )

# save the categorical column encoders into a pickle file under EXPORT_PATH_ROOT
categorical_encoder_path = os.path.join(EXPORT_PATH_ROOT, 'categorical_encoder.pkl')
pickle_object(file_path=categorical_encoder_path, object=categorical_encoder)