In [1]:
%load_ext autoreload


In [2]:
%autoreload 2

In [3]:

# Import necessary libraries
import sys
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

# Import custom scripts
sys.path.append('../scripts')  # Add the scripts directory to the path

from data_preparation import prepare_data
from model_selection import get_models, get_deep_learning_models
from model_training import train_and_evaluate_models, train_and_evaluate_deep_learning_models
from mlops_tracking import log_experiment


In [4]:

# Read the two preprocessed CSV exports in one pass; the generator variable
# does not leak into the notebook namespace.
fraud_data, creditcard_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/Fraud_Data_Preprocessed.csv',
        '../data/creditcard_Preprocessed.csv',
    )
)


In [5]:

# 1. Data Preparation
# prepare_data is a project helper (imported from ../scripts). It is called with a
# DataFrame and the name of the label column, and its result is unpacked into four
# objects: train features, test features, train labels, test labels.
# NOTE(review): presumably a train/test split — ratio, stratification and seeding
# are not visible from this notebook; confirm in data_preparation.py.

# For Fraud_Data (label column is lower-case 'class')
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = prepare_data(fraud_data, 'class')

# For creditcard_data (label column is capitalised 'Class')
X_train_cc, X_test_cc, y_train_cc, y_test_cc = prepare_data(creditcard_data, 'Class')


In [6]:

# 2. Model Selection
# Machine Learning Models
# get_models() returns a name -> estimator mapping (it is indexed by name and
# iterated with .items() later in the notebook).
models = get_models()

# Deep Learning Models
# input_shape is (number of feature columns, 1).
# NOTE(review): at this point X_train_fraud is still the uncleaned split. The cleaning
# cell further down removes 4 columns and adds 4 derived ones, so the width (21) stays
# the same only by coincidence — recompute input_shape after cleaning if the feature
# set ever changes.
input_shape = (X_train_fraud.shape[1], 1)  # Adjust input shape for deep learning models
deep_learning_models = get_deep_learning_models(input_shape)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)


In [None]:
# Inspect column dtypes of the fraud training features to spot non-numeric
# (object / bool) columns that still need conversion.
print(X_train_fraud.dtypes)

user_id                     int64
signup_time                object
purchase_time              object
purchase_value            float64
device_id                  object
age                       float64
ip_address                  int64
country                    object
lower_bound_ip_address    float64
upper_bound_ip_address    float64
hour_of_day                 int64
day_of_week                 int64
transaction_frequency       int64
transaction_velocity      float64
source_Direct                bool
source_SEO                   bool
browser_FireFox              bool
browser_IE                   bool
browser_Opera                bool
browser_Safari               bool
sex_M                        bool
dtype: object


In [9]:
# Preview the fraud training features (a bare expression renders the DataFrame repr).
# NOTE(review): the rendered rows expose user_id / device_id / ip_address values;
# consider X_train_fraud.head() and clearing the output before sharing the notebook.
X_train_fraud

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,age,ip_address,country,lower_bound_ip_address,upper_bound_ip_address,...,day_of_week,transaction_frequency,transaction_velocity,source_Direct,source_SEO,browser_FireFox,browser_IE,browser_Opera,browser_Safari,sex_M
106227,79808,2015-05-06 10:49:16,2015-08-19 15:48:15,-0.760758,TPGKSTJCXSJEF,-1.640085,1929714375,Korea Republic of,1.929380e+09,1.930428e+09,...,2,1,,False,False,False,False,False,False,False
34398,233270,2015-05-21 08:58:30,2015-08-04 06:52:45,0.875952,VBSZHEPABPPJL,1.725927,2221594327,United States,2.221408e+09,2.221670e+09,...,1,1,,True,False,False,False,False,False,True
82929,20678,2015-01-02 03:01:09,2015-04-17 21:53:06,-0.378859,GSSIEQZVNKCTX,-0.363322,324729046,United States,3.187671e+08,3.355443e+08,...,4,1,,False,False,False,False,False,False,True
36069,231475,2015-01-05 11:20:18,2015-01-26 06:39:49,-1.033542,JMJUSBBQIPNTC,0.333095,24886496,Australia,2.464154e+07,2.516582e+07,...,0,1,,True,False,False,False,False,False,False
68747,146633,2015-01-04 20:12:04,2015-01-04 20:12:05,-1.033542,ETNWOLYCFABSH,1.377719,3405297090,Japan,3.404726e+09,3.405775e+09,...,6,1,,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104832,176748,2015-01-14 10:12:11,2015-04-15 21:32:59,-1.088099,MLJUFTYKOSMDW,-0.015114,357953575,United States,3.523215e+08,3.690988e+08,...,2,1,,False,False,True,False,False,False,False
7742,276787,2015-05-23 10:13:28,2015-06-14 15:53:48,1.803420,GJMQMLSCCADTZ,0.797372,2672655150,United States,2.672624e+09,2.672689e+09,...,6,1,,False,False,False,False,False,False,True
109038,376881,2015-02-03 21:44:14,2015-03-03 00:36:14,-0.051517,BREDAKOTPNHMO,-0.827599,1702430668,China,1.702363e+09,1.702625e+09,...,1,1,,True,False,False,False,False,True,False
119137,67438,2015-07-15 05:26:27,2015-10-26 13:50:49,1.312408,FWFIFZZGMQQRY,1.029511,3125250589,Ecuador,3.125150e+09,3.125281e+09,...,0,1,,False,False,False,False,False,False,True


In [7]:
# Clean the fraud *training* features so that every remaining column is numeric.
# The cell rebinds X_train_fraud to a new frame instead of assigning columns in place
# on the frame returned by prepare_data, and the datetime step is guarded, so running
# the cell a second time is a no-op instead of a KeyError on 'signup_time'.
if {'signup_time', 'purchase_time'}.issubset(X_train_fraud.columns):
    # Convert datetime columns to datetime objects (parsed once, reused below)
    signup_ts = pd.to_datetime(X_train_fraud['signup_time'])
    purchase_ts = pd.to_datetime(X_train_fraud['purchase_time'])

    # Extract hour-of-day and day-of-week features, then drop the original datetime columns
    X_train_fraud = X_train_fraud.assign(
        signup_hour=signup_ts.dt.hour,
        signup_day=signup_ts.dt.dayofweek,
        purchase_hour=purchase_ts.dt.hour,
        purchase_day=purchase_ts.dt.dayofweek,
    ).drop(columns=['signup_time', 'purchase_time'])

# Convert boolean columns to integers (0 or 1)
boolean_columns = X_train_fraud.select_dtypes(include=['bool']).columns
X_train_fraud = X_train_fraud.astype({col: int for col in boolean_columns})

# Drop unnecessary string columns (errors='ignore' keeps the cell re-runnable)
X_train_fraud = X_train_fraud.drop(columns=['device_id', 'country'], errors='ignore')

# Fill missing values in 'transaction_velocity' with 0 (the test split must use the same value)
X_train_fraud = X_train_fraud.assign(
    transaction_velocity=X_train_fraud['transaction_velocity'].fillna(0)
)

# Verify the dataset
print(X_train_fraud.dtypes)


user_id                     int64
purchase_value            float64
age                       float64
ip_address                  int64
lower_bound_ip_address    float64
upper_bound_ip_address    float64
hour_of_day                 int64
day_of_week                 int64
transaction_frequency       int64
transaction_velocity      float64
source_Direct               int64
source_SEO                  int64
browser_FireFox             int64
browser_IE                  int64
browser_Opera               int64
browser_Safari              int64
sex_M                       int64
signup_hour                 int32
signup_day                  int32
purchase_hour               int32
purchase_day                int32
dtype: object


In [10]:

# Count missing values per column of the cleaned training features.
print(X_train_fraud.isnull().sum())

user_id                   0
purchase_value            0
age                       0
ip_address                0
lower_bound_ip_address    0
upper_bound_ip_address    0
hour_of_day               0
day_of_week               0
transaction_frequency     0
transaction_velocity      0
source_Direct             0
source_SEO                0
browser_FireFox           0
browser_IE                0
browser_Opera             0
browser_Safari            0
sex_M                     0
signup_hour               0
signup_day                0
purchase_hour             0
purchase_day              0
dtype: int64


In [8]:
# Clean the fraud *test* features with the same steps used for the training split, so
# that both frames end up with identical numeric columns.
# The cell rebinds X_test_fraud to a new frame instead of assigning columns in place
# on the frame returned by prepare_data, and the datetime step is guarded, so running
# the cell a second time is a no-op instead of a KeyError on 'signup_time'.
if {'signup_time', 'purchase_time'}.issubset(X_test_fraud.columns):
    # Convert datetime columns to datetime objects (parsed once, reused below)
    signup_ts = pd.to_datetime(X_test_fraud['signup_time'])
    purchase_ts = pd.to_datetime(X_test_fraud['purchase_time'])

    # Extract hour-of-day and day-of-week features, then drop the original datetime columns
    X_test_fraud = X_test_fraud.assign(
        signup_hour=signup_ts.dt.hour,
        signup_day=signup_ts.dt.dayofweek,
        purchase_hour=purchase_ts.dt.hour,
        purchase_day=purchase_ts.dt.dayofweek,
    ).drop(columns=['signup_time', 'purchase_time'])

# Convert boolean columns to integers (0 or 1)
boolean_columns = X_test_fraud.select_dtypes(include=['bool']).columns
X_test_fraud = X_test_fraud.astype({col: int for col in boolean_columns})

# Drop unnecessary string columns (errors='ignore' keeps the cell re-runnable)
X_test_fraud = X_test_fraud.drop(columns=['device_id', 'country'], errors='ignore')

# Fill missing values in 'transaction_velocity' with 0 (same value as used for X_train_fraud)
X_test_fraud = X_test_fraud.assign(
    transaction_velocity=X_test_fraud['transaction_velocity'].fillna(0)
)

# Verify the dataset
print(X_test_fraud.dtypes)


user_id                     int64
purchase_value            float64
age                       float64
ip_address                  int64
lower_bound_ip_address    float64
upper_bound_ip_address    float64
hour_of_day                 int64
day_of_week                 int64
transaction_frequency       int64
transaction_velocity      float64
source_Direct               int64
source_SEO                  int64
browser_FireFox             int64
browser_IE                  int64
browser_Opera               int64
browser_Safari              int64
sex_M                       int64
signup_hour                 int32
signup_day                  int32
purchase_hour               int32
purchase_day                int32
dtype: object


In [None]:

# Verify the dataset: count missing values per column of the cleaned test features.
print(X_test_fraud.isnull().sum())

user_id                   0
purchase_value            0
age                       0
ip_address                0
lower_bound_ip_address    0
upper_bound_ip_address    0
hour_of_day               0
day_of_week               0
transaction_frequency     0
transaction_velocity      0
source_Direct             0
source_SEO                0
browser_FireFox           0
browser_IE                0
browser_Opera             0
browser_Safari            0
sex_M                     0
signup_hour               0
signup_day                0
purchase_hour             0
purchase_day              0
dtype: int64


In [None]:
# Show the feature columns of both splits ...
print("X_train_fraud columns:", X_train_fraud.columns)
print("X_test_fraud columns:", X_test_fraud.columns)

# ... and fail fast if they differ in name or order, instead of relying on a visual
# comparison of the two printed indexes.
assert list(X_train_fraud.columns) == list(X_test_fraud.columns), (
    "train/test feature columns differ: "
    f"{sorted(set(X_train_fraud.columns) ^ set(X_test_fraud.columns))}"
)

X_train_fraud columns: Index(['user_id', 'purchase_value', 'age', 'ip_address',
       'lower_bound_ip_address', 'upper_bound_ip_address', 'hour_of_day',
       'day_of_week', 'transaction_frequency', 'transaction_velocity',
       'source_Direct', 'source_SEO', 'browser_FireFox', 'browser_IE',
       'browser_Opera', 'browser_Safari', 'sex_M', 'signup_hour', 'signup_day',
       'purchase_hour', 'purchase_day'],
      dtype='object')
X_test_fraud columns: Index(['user_id', 'purchase_value', 'age', 'ip_address',
       'lower_bound_ip_address', 'upper_bound_ip_address', 'hour_of_day',
       'day_of_week', 'transaction_frequency', 'transaction_velocity',
       'source_Direct', 'source_SEO', 'browser_FireFox', 'browser_IE',
       'browser_Opera', 'browser_Safari', 'sex_M', 'signup_hour', 'signup_day',
       'purchase_hour', 'purchase_day'],
      dtype='object')


In [9]:

# 3. Model Training and Evaluation
# Both helpers are project functions from ../scripts. Each receives a name -> model
# mapping plus the train/test features and labels, and returns a mapping that the
# following cells read as results[name]['precision' | 'recall' | 'f1_score' | 'roc_auc'].
# NOTE(review): presumably the helpers fit the passed model objects in place — the
# later save/log cells rely on that; confirm in model_training.py.

# Train and evaluate machine learning models
results_ml = train_and_evaluate_models(models, X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud)

# Train and evaluate deep learning models
# NOTE(review): the networks were built with input_shape (n_features, 1) but receive
# 2-D DataFrames here; assumes the helper reshapes the input — TODO confirm.
results_dl = train_and_evaluate_deep_learning_models(deep_learning_models, X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud)




[1m820/820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
[1m820/820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m820/820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step
[1m820/820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step
[1m820/820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 28ms/step
[1m820/820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step


In [10]:
# Report precision / recall / F1 / ROC-AUC for every fraud-data model,
# classical models first, then the deep learning models.
for section_title, section_results in (
    ("Machine Learning Models Results:", results_ml),
    ("Deep Learning Models Results:", results_dl),
):
    print(section_title)
    for model_name, metrics in section_results.items():
        print(f"{model_name}:")
        print(f"  Precision: {metrics['precision']:.4f}")
        print(f"  Recall: {metrics['recall']:.4f}")
        print(f"  F1-Score: {metrics['f1_score']:.4f}")
        print(f"  ROC-AUC: {metrics['roc_auc']:.4f}")
        print()

Machine Learning Models Results:
Logistic Regression:
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
  ROC-AUC: 0.5019

Decision Tree:
  Precision: 0.4687
  Recall: 0.5789
  F1-Score: 0.5180
  ROC-AUC: 0.7549

Random Forest:
  Precision: 0.9936
  Recall: 0.5561
  F1-Score: 0.7131
  ROC-AUC: 0.7743

Gradient Boosting:
  Precision: 0.8182
  Recall: 0.0036
  F1-Score: 0.0072
  ROC-AUC: 0.7350

MLP:
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
  ROC-AUC: 0.5000

Deep Learning Models Results:
CNN:
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
  ROC-AUC: 0.6227

RNN:
  Precision: 0.8529
  Recall: 0.5365
  F1-Score: 0.6586
  ROC-AUC: 0.7767

LSTM:
  Precision: 0.9065
  Recall: 0.5437
  F1-Score: 0.6797
  ROC-AUC: 0.7770



In [11]:
# Persist the best fraud model of each family, ranked by test-set F1.
# joblib and Path come from the notebook's top import cell.

# Create the output directory first: joblib.dump does not create missing parent
# directories, so a fresh checkout without ../models would otherwise fail here.
Path('../models').mkdir(parents=True, exist_ok=True)

# Save the best machine learning model
# NOTE(review): assumes models[...] still holds the estimator fitted on the fraud data,
# i.e. this cell runs before `models` is reused for another dataset.
best_ml_model = max(results_ml, key=lambda k: results_ml[k]['f1_score'])
joblib.dump(models[best_ml_model], f'../models/{best_ml_model.lower().replace(" ", "_")}_fraud.pkl')

# Save the best deep learning model using the recommended format
best_dl_model = max(results_dl, key=lambda k: results_dl[k]['f1_score'])
deep_learning_models[best_dl_model].save(f'../models/{best_dl_model.lower()}_fraud.keras')

In [13]:
# 3b. Model Training and Evaluation for Credit Card Data
# Train and evaluate machine learning models
# NOTE(review): this passes the same `models` mapping that was used for the fraud data.
# If train_and_evaluate_models fits the estimators in place (not visible here), the
# fraud-fitted estimators are overwritten by this call — any later cell that uses
# `models` with fraud data would then be using credit-card models. Consider a fresh
# get_models() for this dataset (and update the credit-card save cell to match).
# NOTE(review): the captured output shows a scikit-learn "iterations reached limit"
# convergence warning for this run; it suggests scaling the data or raising max_iter.
results_ml_cc = train_and_evaluate_models(models, X_train_cc, X_test_cc, y_train_cc, y_test_cc)

# Train and evaluate deep learning models
# A separate set of networks is built here because the credit-card feature count differs
# from the fraud data's; input_shape_cc is (number of feature columns, 1).
input_shape_cc = (X_train_cc.shape[1], 1)  # Adjust input shape for deep learning models
deep_learning_models_cc = get_deep_learning_models(input_shape_cc)
results_dl_cc = train_and_evaluate_deep_learning_models(deep_learning_models_cc, X_train_cc, X_test_cc, y_train_cc, y_test_cc)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)


[1m1774/1774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step
[1m1774/1774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step
[1m1774/1774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step
[1m1774/1774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step
[1m1774/1774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 10ms/step
[1m1774/1774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 9ms/step


In [14]:
# Report precision / recall / F1 / ROC-AUC for every credit-card model,
# classical models first, then the deep learning models.
for section_title, section_results in (
    ("Machine Learning Models Results for Credit Card Data:", results_ml_cc),
    ("Deep Learning Models Results for Credit Card Data:", results_dl_cc),
):
    print(section_title)
    for model_name, metrics in section_results.items():
        print(f"{model_name}:")
        print(f"  Precision: {metrics['precision']:.4f}")
        print(f"  Recall: {metrics['recall']:.4f}")
        print(f"  F1-Score: {metrics['f1_score']:.4f}")
        print(f"  ROC-AUC: {metrics['roc_auc']:.4f}")
        print()


Machine Learning Models Results for Credit Card Data:
Logistic Regression:
  Precision: 0.8788
  Recall: 0.6105
  F1-Score: 0.7205
  ROC-AUC: 0.9408

Decision Tree:
  Precision: 0.6947
  Recall: 0.6947
  F1-Score: 0.6947
  ROC-AUC: 0.8471

Random Forest:
  Precision: 0.9726
  Recall: 0.7474
  F1-Score: 0.8452
  ROC-AUC: 0.9243

Gradient Boosting:
  Precision: 0.8732
  Recall: 0.6526
  F1-Score: 0.7470
  ROC-AUC: 0.8539

MLP:
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
  ROC-AUC: 0.4954

Deep Learning Models Results for Credit Card Data:
CNN:
  Precision: 0.9706
  Recall: 0.3474
  F1-Score: 0.5116
  ROC-AUC: 0.9459

RNN:
  Precision: 0.8987
  Recall: 0.7474
  F1-Score: 0.8161
  ROC-AUC: 0.9486

LSTM:
  Precision: 0.8933
  Recall: 0.7053
  F1-Score: 0.7882
  ROC-AUC: 0.9472



In [15]:
# Persist the best credit-card model of each family, ranked by test-set F1.
# joblib and Path come from the notebook's top import cell.

# Create the output directory first: joblib.dump does not create missing parent
# directories, so a fresh checkout without ../models would otherwise fail here.
Path('../models').mkdir(parents=True, exist_ok=True)

# Save the best machine learning model for Credit Card Data
# NOTE(review): `models` is the mapping shared with the fraud experiment; this assumes
# its estimators were most recently fitted on the credit-card data.
best_ml_model_cc = max(results_ml_cc, key=lambda k: results_ml_cc[k]['f1_score'])
joblib.dump(models[best_ml_model_cc], f'../models/{best_ml_model_cc.lower().replace(" ", "_")}_creditcard.pkl')

# Save the best deep learning model using the recommended format for Credit Card Data
best_dl_model_cc = max(results_dl_cc, key=lambda k: results_dl_cc[k]['f1_score'])
deep_learning_models_cc[best_dl_model_cc].save(f'../models/{best_dl_model_cc.lower()}_creditcard.keras')

In [None]:
# 4. MLOps Steps
# Log experiments using MLflow
# log_experiment is a project helper; it is called once per model with the model object,
# its name, and the fraud test features/labels.
# NOTE(review): `models` was last passed to train_and_evaluate_models together with the
# credit-card data. If that helper fits in place, the estimators logged here are
# credit-card models being evaluated on the fraud test split (different feature set) —
# verify, and log fraud-fitted estimators instead.
for name, model in models.items():
    log_experiment(model, name, X_test_fraud, y_test_fraud)

# deep_learning_models is the fraud-specific set of networks (the credit-card run uses
# deep_learning_models_cc), so these match the fraud test split.
for name, model in deep_learning_models.items():
    log_experiment(model, name, X_test_fraud, y_test_fraud)