In [24]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Conv1D, Flatten
import mlflow
import mlflow.sklearn
from imblearn.over_sampling import SMOTE

In [2]:
fraud_data = pd.read_csv("fraud_data_processed.csv")
credit_data = pd.read_csv("creditcard.csv")

In [3]:
fraud_data.head(10)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,transaction_count,hour_of_day,day_of_week,country
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,-0.160204,QVPSPJUOCKZAR,2,0,1,39,732758368,0,1,2,5,84
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,-1.142592,EOGFQPIZPYXFZ,0,0,0,53,350311387,0,1,1,0,171
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,-1.197169,YSSKYOSJHPPLJ,2,3,1,53,2621473820,1,1,18,3,171
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,0.385567,ATGTXKYKUDUQN,2,4,1,41,3840542443,0,1,13,0,172
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,0.112681,NAUITBZFJKHWW,0,4,1,45,415583117,0,1,18,2,171
5,159135,2015-05-21 06:03:03,2015-07-09 08:05:14,0.276413,ALEYXFXINSXLZ,0,0,1,18,2809315199,0,1,8,3,32
6,50116,2015-08-01 22:40:52,2015-08-27 03:37:57,-1.415478,IWKVZHJOCLPUR,0,0,0,19,3987484328,0,1,3,3,172
7,360585,2015-04-06 07:35:45,2015-05-25 17:21:14,-0.542244,HPUCUYLMJBYFW,0,3,1,34,1692458727,0,1,17,0,171
8,159045,2015-04-21 23:38:34,2015-06-02 14:01:54,-0.378513,ILXYDOZIHOOHT,2,2,0,43,3719094257,0,1,14,1,36
9,182338,2015-01-25 17:49:49,2015-03-23 23:05:42,1.367955,NRFFPPHZYFUVC,0,2,1,31,341674739,0,1,23,0,171


In [4]:
fraud_data.isnull().sum()

Unnamed: 0,0
user_id,0
signup_time,0
purchase_time,0
purchase_value,0
device_id,0
source,0
browser,0
sex,0
age,0
ip_address,0


In [10]:
# Drop non-numeric columns
X_fraud = fraud_data.drop(columns=["user_id", "signup_time", "purchase_time", "device_id"])

# Convert remaining columns to float
X_fraud = X_fraud.astype(float)

# Verify data types
print(X_fraud.dtypes)

purchase_value       float64
source               float64
browser              float64
sex                  float64
age                  float64
ip_address           float64
class                float64
transaction_count    float64
hour_of_day          float64
day_of_week          float64
country              float64
dtype: object


In [11]:
# Features & Target Separation
X_fraud = X_fraud.drop(columns=["class"])
y_fraud = fraud_data["class"]

X_credit = credit_data.drop(columns=["Class"])
y_credit = credit_data["Class"]

# Train-Test Split (80-20)
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(X_fraud, y_fraud, test_size=0.2, random_state=42)
X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(X_credit, y_credit, test_size=0.2, random_state=42)

In [6]:
# Define models
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Gradient Boosting": GradientBoostingClassifier()
}

In [12]:
# Train and evaluate models
results = []
for name, model in models.items():
    model.fit(X_train_fraud, y_train_fraud)
    y_pred = model.predict(X_test_fraud)

    accuracy = accuracy_score(y_test_fraud, y_pred)
    precision = precision_score(y_test_fraud, y_pred)
    recall = recall_score(y_test_fraud, y_pred)
    f1 = f1_score(y_test_fraud, y_pred)

    results.append([name, accuracy, precision, recall, f1])

# Convert results to DataFrame
model_results = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-Score"])
print(model_results.to_markdown(index=False))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| Model               |   Accuracy |   Precision |      Recall |    F1-Score |
|:--------------------|-----------:|------------:|------------:|------------:|
| Logistic Regression |   0.905701 |    0        | 0           | 0           |
| Decision Tree       |   0.898124 |    0.467145 | 0.571228    | 0.51397     |
| Random Forest       |   0.956258 |    0.996104 | 0.538246    | 0.698861    |
| Gradient Boosting   |   0.905734 |    1        | 0.000350877 | 0.000701508 |


In [14]:
# Convert data to NumPy for deep learning models
X_train_fraud_np = X_train_fraud.to_numpy()
X_test_fraud_np = X_test_fraud.to_numpy()

In [15]:
# Define an MLP model
mlp_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_fraud_np.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

mlp_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train MLP
mlp_model.fit(X_train_fraud_np, y_train_fraud, epochs=10, batch_size=32, validation_data=(X_test_fraud_np, y_test_fraud))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3ms/step - accuracy: 0.8320 - loss: 1898390.8750 - val_accuracy: 0.9057 - val_loss: 4083920.7500
Epoch 2/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 3ms/step - accuracy: 0.8365 - loss: 999465.1875 - val_accuracy: 0.9057 - val_loss: 165960.1875
Epoch 3/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.8349 - loss: 508698.5625 - val_accuracy: 0.9057 - val_loss: 482425.9062
Epoch 4/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.8316 - loss: 319502.3125 - val_accuracy: 0.9057 - val_loss: 298311.2812
Epoch 5/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 3ms/step - accuracy: 0.8295 - loss: 279917.8750 - val_accuracy: 0.9057 - val_loss: 85871.8281
Epoch 6/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - accuracy: 0.8304 - loss: 150828.2

<keras.src.callbacks.history.History at 0x7a6c1e4fe090>

In [20]:
# Evaluate MLP
mlp_results = mlp_model.evaluate(X_test_fraud_np, y_test_fraud)
print("MLP Accuracy:", mlp_results[1])

[1m945/945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9092 - loss: 0.3044
MLP Accuracy: 0.9057009816169739


In [21]:
# Reshape data for CNN input
X_train_fraud_cnn = X_train_fraud.to_numpy().reshape(X_train_fraud.shape[0], X_train_fraud.shape[1], 1)
X_test_fraud_cnn = X_test_fraud.to_numpy().reshape(X_test_fraud.shape[0], X_test_fraud.shape[1], 1)

# CNN Model
cnn_model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train_fraud.shape[1], 1)),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train CNN
cnn_model.fit(X_train_fraud_cnn, y_train_fraud, epochs=10, batch_size=32, validation_data=(X_test_fraud_cnn, y_test_fraud))

# Evaluate CNN
cnn_results = cnn_model.evaluate(X_test_fraud_cnn, y_test_fraud)
print("CNN Accuracy:", cnn_results[1])


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 4ms/step - accuracy: 0.8964 - loss: 1032141.6875 - val_accuracy: 0.9057 - val_loss: 0.3137
Epoch 2/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3ms/step - accuracy: 0.9057 - loss: 0.3128 - val_accuracy: 0.9057 - val_loss: 0.3124
Epoch 3/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 4ms/step - accuracy: 0.9073 - loss: 0.3088 - val_accuracy: 0.9057 - val_loss: 0.3124
Epoch 4/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 4ms/step - accuracy: 0.9048 - loss: 0.3143 - val_accuracy: 0.9057 - val_loss: 0.3124
Epoch 5/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 5ms/step - accuracy: 0.9061 - loss: 0.3114 - val_accuracy: 0.9057 - val_loss: 0.3124
Epoch 6/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 3ms/step - accuracy: 0.9061 - loss: 0.3115 - val_accuracy: 0.9057 - val_loss: 0.3124
Epoch 7/10
[1m

In [22]:
# Reshape data for LSTM input (3D: [samples, time steps, features])
X_train_fraud_lstm = X_train_fraud.to_numpy().reshape(X_train_fraud.shape[0], 1, X_train_fraud.shape[1])
X_test_fraud_lstm = X_test_fraud.to_numpy().reshape(X_test_fraud.shape[0], 1, X_test_fraud.shape[1])

# LSTM Model
lstm_model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(1, X_train_fraud.shape[1])),
    LSTM(32, return_sequences=False),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train LSTM
lstm_model.fit(X_train_fraud_lstm, y_train_fraud, epochs=10, batch_size=32, validation_data=(X_test_fraud_lstm, y_test_fraud))

# Evaluate LSTM
lstm_results = lstm_model.evaluate(X_test_fraud_lstm, y_test_fraud)
print("LSTM Accuracy:", lstm_results[1])

  super().__init__(**kwargs)


Epoch 1/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 6ms/step - accuracy: 0.9022 - loss: 0.3430 - val_accuracy: 0.9057 - val_loss: 0.3125
Epoch 2/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 5ms/step - accuracy: 0.9063 - loss: 0.3170 - val_accuracy: 0.9057 - val_loss: 0.3131
Epoch 3/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 6ms/step - accuracy: 0.9070 - loss: 0.3118 - val_accuracy: 0.9057 - val_loss: 0.3124
Epoch 4/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 5ms/step - accuracy: 0.9071 - loss: 0.3099 - val_accuracy: 0.9057 - val_loss: 0.3125
Epoch 5/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 6ms/step - accuracy: 0.9074 - loss: 0.3088 - val_accuracy: 0.9057 - val_loss: 0.3124
Epoch 6/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 5ms/step - accuracy: 0.9058 - loss: 0.3123 - val_accuracy: 0.9057 - val_loss: 0.3124
Epoch 7/10

In [19]:
mlflow.set_experiment("Fraud Detection Experiment")

with mlflow.start_run():
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X_train_fraud, y_train_fraud)

    y_pred = model.predict(X_test_fraud)
    acc = accuracy_score(y_test_fraud, y_pred)

    mlflow.log_param("n_estimators", 100)
    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(model, "random_forest_fraud")

2025/02/11 06:55:07 INFO mlflow.tracking.fluent: Experiment with name 'Fraud Detection Experiment' does not exist. Creating a new experiment.


In [25]:
# Feature & Target Separation
X_credit = credit_data.drop(columns=["Class"])
y_credit = credit_data["Class"]

# Handle class imbalance using SMOTE
smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_credit_resampled, y_credit_resampled = smote.fit_resample(X_credit, y_credit)

# Train-Test Split
X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(X_credit_resampled, y_credit_resampled, test_size=0.2, random_state=42)

# Train Random Forest (Best Model So Far)
rf_credit = RandomForestClassifier(n_estimators=100)
rf_credit.fit(X_train_credit, y_train_credit)
y_pred_credit = rf_credit.predict(X_test_credit)

# Evaluate Model
accuracy = accuracy_score(y_test_credit, y_pred_credit)
precision = precision_score(y_test_credit, y_pred_credit)
recall = recall_score(y_test_credit, y_pred_credit)
f1 = f1_score(y_test_credit, y_pred_credit)

# Print Results
credit_results = pd.DataFrame([["Random Forest (Credit Data)", accuracy, precision, recall, f1]],
                              columns=["Model", "Accuracy", "Precision", "Recall", "F1-Score"])
print(credit_results.to_markdown(index=False))


| Model                       |   Accuracy |   Precision |   Recall |   F1-Score |
|:----------------------------|-----------:|------------:|---------:|-----------:|
| Random Forest (Credit Data) |   0.999871 |    0.999649 | 0.999965 |   0.999807 |
