In [1]:
!pwd

/content


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd '/content/drive/MyDrive/Colab Notebooks/fraud_detection_project/notebooks'

/content/drive/MyDrive/Colab Notebooks/fraud_detection_project/notebooks


In [4]:
# Install and import mlflow
!pip install mlflow
import mlflow

# Import clear_output
from IPython.display import clear_output
clear_output()

In [5]:
# Enable autologging
mlflow.sklearn.autolog()

In [6]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split

# Hide warning
import warnings
warnings.filterwarnings('ignore')

import os, sys
# Add the 'src' directory to the Python path for module imports
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

# Custom class
from model_training import ModelPipeline

# Configure logging
from logger import SetupLogger
# Assuming this class is defined in src/
from data_preprocessing import LoadData

logger = SetupLogger(log_file='../logs/model_training.log').get_logger()

In [7]:
# Initialize the LoadData class
fraud_data_init = LoadData(filepath='../data/processed/processed_fraud_data.csv', logger=logger)
creditcard_data_init = LoadData(filepath='../data/raw/creditcard.csv', logger=logger)


# Load the datasets
fraud_data = fraud_data_init.load_dataset().set_index('user_id')
credit_data = creditcard_data_init.load_dataset()

# Display the first few rows of both datasets
print("Fraud_Data.csv Head:")
print("=========================")
display(fraud_data.head())

print("\nCreditcard.csv Head:")
print("=========================")
display(credit_data.head())

INFO:logger:Dataset loaded successfully.
INFO:logger:Dataset loaded successfully.


Fraud_Data.csv Head:


Unnamed: 0_level_0,Unnamed: 0,signup_time,purchase_time,purchase_value,device_id,age,ip_address,class,hour_of_day,day_of_week,...,user_transaction_frequency,device_transaction_frequency,user_transaction_velocity,source_Direct,source_SEO,browser_FireFox,browser_IE,browser_Opera,browser_Safari,sex_M
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22058,0,2015-02-24 22:55:49,2015-04-18 02:47:11,-0.160204,QVPSPJUOCKZAR,0.679914,732758400.0,0,-1.377455,0.99102,...,0.0,-0.261514,-0.230128,False,True,False,False,False,False,True
333320,1,2015-06-07 20:39:50,2015-06-08 01:38:54,-1.142592,EOGFQPIZPYXFZ,2.304476,350311400.0,0,-1.522122,-1.501259,...,0.0,-0.261514,-0.229874,False,False,False,False,False,False,False
1359,2,2015-01-01 18:52:44,2015-01-01 18:52:45,-1.197169,YSSKYOSJHPPLJ,2.304476,2621474000.0,1,0.937208,-0.005891,...,0.0,3.941861,4.345476,False,True,False,False,True,False,True
150084,3,2015-04-28 21:13:25,2015-05-04 13:54:50,0.385567,ATGTXKYKUDUQN,0.911994,3840542000.0,0,0.213876,-1.501259,...,0.0,-0.261514,-0.23012,False,True,False,False,False,True,True
221365,4,2015-07-21 07:09:52,2015-09-09 18:40:53,0.112681,NAUITBZFJKHWW,1.376155,415583100.0,0,0.937208,-0.504347,...,0.0,-0.261514,-0.230128,False,False,False,False,False,True,True



Creditcard.csv Head:


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [8]:
# Check for missing values
fraud_data.isnull().sum().sum()

0

In [9]:
credit_data.isnull().sum().sum()


0

In [10]:
# Convert signup_time and purchase_time to datetime
fraud_data['signup_time'] = pd.to_datetime(fraud_data['signup_time'])
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'])

# Feature engineering: Calculate the time difference between signup and purchase
fraud_data['time_diff'] = (fraud_data['purchase_time'] - fraud_data['signup_time']).dt.total_seconds()

# Drop unnecessary columns
fraud_data = fraud_data.drop(columns=['Unnamed: 0', 'signup_time', 'purchase_time', 'device_id', 'ip_address'])


In [11]:
# Define target and features
X_fraud = fraud_data.drop(columns=['class'])
y_fraud = fraud_data['class']

X_creditcard = credit_data.drop(columns=['Class'])
y_creditcard = credit_data['Class']

In [12]:
# Split for Fraud_Data.csv
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(X_fraud, y_fraud, test_size=0.2, random_state=42)

# Split for creditcard.csv
X_creditcard_train, X_creditcard_test, y_creditcard_train, y_creditcard_test = train_test_split(X_creditcard, y_creditcard, test_size=0.2, random_state=42)

# Check the shape of the splits (optional)
print(f"Fraud_Data Train/Test shapes: {X_fraud_train.shape}, {X_fraud_test.shape}")
print(f"Creditcard Train/Test shapes: {X_creditcard_train.shape}, {X_creditcard_test.shape}")


Fraud_Data Train/Test shapes: (120889, 16), (30223, 16)
Creditcard Train/Test shapes: (227845, 30), (56962, 30)


In [13]:
# Model Training for Fraud_Data.csv
# Create the pipeline for the Fraud_Data dataset
fraud_pipeline = ModelPipeline(X_train=X_fraud_train, X_test=X_fraud_test, y_train=y_fraud_train, y_test=y_fraud_test)

# Train and evaluate models
best_fraud_model, best_fraud_model_name = fraud_pipeline.train_and_evaluate()

# Save the best model
fraud_pipeline.save_best_model(best_fraud_model, best_fraud_model_name, dataset_name='fraud_data')

Hyperparameter tuning for Random Forest...


2024/10/23 13:48:42 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '8f8387c8ea06423aa4f307e3a56f014b', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/10/23 13:51:02 INFO mlflow.sklearn.utils: Logging the 5 best runs, one run will be omitted.
2024/10/23 13:51:05 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'd6283ed97dd8449faca043142deb0328', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Random Forest best parameters: {'classifier__max_depth': 5, 'classifier__n_estimators': 50}
Hyperparameter tuning for Gradient Boosting...


2024/10/23 13:53:38 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


Gradient Boosting best parameters: {'classifier__learning_rate': 0.01, 'classifier__n_estimators': 100}
Random Forest training time: 13.22 seconds


Registered model 'random_forest' already exists. Creating a new version of this model...
Created version '5' of model 'random_forest'.


Random Forest model logged in MLflow
Gradient Boosting training time: 33.15 seconds


Registered model 'gradient_boosting' already exists. Creating a new version of this model...
Created version '5' of model 'gradient_boosting'.


Gradient Boosting model logged in MLflow
[1m945/945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step




LSTM training time: 106.28 seconds


Registered model 'lstm' already exists. Creating a new version of this model...
Created version '4' of model 'lstm'.


LSTM model logged in MLflow
[1m945/945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step




CNN training time: 42.11 seconds


Registered model 'cnn' already exists. Creating a new version of this model...


CNN model logged in MLflow
Random Forest model saved as random_forest_fraud_data_best_model.pkl


Created version '4' of model 'cnn'.


In [14]:
# Model Training for creditcard.csv
# Create the pipeline for the Credit Card Fraud dataset
creditcard_pipeline = ModelPipeline(X_train=X_creditcard_train, X_test=X_creditcard_test, y_train=y_creditcard_train, y_test=y_creditcard_test)

# Train and evaluate models
best_creditcard_model, best_creditcard_model_name = creditcard_pipeline.train_and_evaluate()

# Save the best model
creditcard_pipeline.save_best_model(best_creditcard_model, best_creditcard_model_name, dataset_name='creditcard')

Hyperparameter tuning for Random Forest...


2024/10/23 13:57:23 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '78eb23890c714da0b297937578a122d8', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/10/23 14:20:05 INFO mlflow.sklearn.utils: Logging the 5 best runs, one run will be omitted.
2024/10/23 14:20:08 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'f458c7a9aff74d9ab0b01399995cbd2a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Random Forest best parameters: {'classifier__max_depth': None, 'classifier__n_estimators': 100}
Hyperparameter tuning for Gradient Boosting...


2024/10/23 14:58:54 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


Gradient Boosting best parameters: {'classifier__learning_rate': 0.01, 'classifier__n_estimators': 100}
Random Forest training time: 276.85 seconds


Registered model 'random_forest' already exists. Creating a new version of this model...
Created version '6' of model 'random_forest'.


Random Forest model logged in MLflow
Gradient Boosting training time: 424.11 seconds


Registered model 'gradient_boosting' already exists. Creating a new version of this model...
Created version '6' of model 'gradient_boosting'.


Gradient Boosting model logged in MLflow
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step




LSTM training time: 211.63 seconds


Registered model 'lstm' already exists. Creating a new version of this model...
Created version '5' of model 'lstm'.


LSTM model logged in MLflow
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step




CNN training time: 89.92 seconds


Registered model 'cnn' already exists. Creating a new version of this model...


CNN model logged in MLflow
LSTM model saved as lstm_creditcard_best_model.pkl


Created version '5' of model 'cnn'.


In [18]:
# Review evaluation metrics for both datasets

# Fraud_Data.csv results
fraud_metrics, fraud_probs = fraud_pipeline.get_results()
print("Fraud_Data.csv Model Metrics:")
fraud = pd.DataFrame(fraud_metrics)
fraud.head()

Fraud_Data.csv Model Metrics:


Unnamed: 0,Random Forest,Gradient Boosting,LSTM,CNN
Accuracy,0.956424,0.956424,0.956424,0.956159
Precision,1.0,1.0,1.0,0.991618
Recall,0.537895,0.537895,0.537895,0.539649
F1 Score,0.699521,0.699521,0.699521,0.698932
ROC AUC,0.849597,0.849551,0.77522,0.769936


In [19]:
# creditcard.csv results
creditcard_metrics, creditcard_probs = creditcard_pipeline.get_results()
print("\ncreditcard.csv Model Metrics:")


credit = pd.DataFrame(creditcard_metrics)
credit.head()



creditcard.csv Model Metrics:


Unnamed: 0,Random Forest,Gradient Boosting,LSTM,CNN
Accuracy,0.999579,0.999403,0.999403,0.99928
Precision,0.9625,0.984848,0.840426,0.890411
Recall,0.785714,0.663265,0.806122,0.663265
F1 Score,0.865169,0.792683,0.822917,0.760234
ROC AUC,0.962846,0.928369,0.975744,0.905492
