In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, precision_recall_curve
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

2024-02-26 07:28:38.159159: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Load the dataset
df = pd.read_csv('/home/administrator/Documents/PROGRAMMING/ML/Datasets/Fraud.csv')
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [4]:
df.isnull().sum() #There are no missing values present in the dataset

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [5]:
df.shape

(6362620, 11)

In [6]:
print(df.dtypes)

step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
isFlaggedFraud      int64
dtype: object


In [7]:
df = df.drop(columns=['step', 'isFlaggedFraud', 'nameOrig', 'nameDest'], axis = 1) #These attributes are not required moving forward

In [8]:
df.sample(10)

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
3230979,PAYMENT,49129.37,0.0,0.0,0.0,0.0,0
2075316,CASH_OUT,461488.02,86094.0,0.0,271754.48,733242.5,0
164355,CASH_IN,174226.21,719034.92,893261.14,802530.53,628304.31,0
2338744,CASH_IN,183007.08,806268.9,989275.98,1263463.57,1080456.49,0
4203154,PAYMENT,77524.93,30949.0,0.0,0.0,0.0,0
71869,CASH_IN,217951.41,4892988.54,5110939.95,343788.42,1930417.62,0
708256,CASH_IN,97672.24,2596167.62,2693839.86,814120.92,509128.75,0
315490,CASH_IN,21236.66,9589223.81,9610460.48,193608.14,101260.97,0
1642442,CASH_OUT,156976.31,39442.0,0.0,94751.63,251727.94,0
3957327,CASH_OUT,70853.67,0.0,0.0,9067214.94,9138068.62,0


In [11]:
df['isFraud'].value_counts() #Checking if there is a class imbalance in the target variable

isFraud
0    6354407
1       8213
Name: count, dtype: int64

In [12]:
le = LabelEncoder()
df['type'] = le.fit_transform(df['type']) #Label Encoding attribute 'type' so that we can apply SMOTE

In [13]:
df.sample(10)

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
2549264,4,123389.79,0.0,0.0,387362.83,510752.62,0
4763088,3,7648.29,51516.0,43867.71,0.0,0.0,0
2580472,3,3395.35,0.0,0.0,0.0,0.0,0
853077,1,121825.05,0.0,0.0,2014607.33,2136432.39,0
3053512,4,148606.12,487698.0,339091.88,328257.32,476863.44,0
1700883,1,42385.1,0.0,0.0,319588.31,361973.4,0
3391739,0,173538.19,8904943.0,9078481.19,991262.53,817724.35,0
2428212,3,4697.05,0.0,0.0,0.0,0.0,0
1830914,1,230409.31,28.0,0.0,20567.0,250976.31,0
1305892,4,477078.52,0.0,0.0,1020952.73,1082189.73,0


In [14]:
from imblearn.over_sampling import SMOTE
#There is a lot of class imbalance, hence we are using SMOTE for upsampling

x = df.drop(columns = ['isFraud'], axis=1)
y = df['isFraud']

smote = SMOTE(random_state=42)
X, Y = smote.fit_resample(x, y)

In [15]:
print(Y.value_counts()) #class imbalance fixed

isFraud
0    6354407
1    6354407
Name: count, dtype: int64


In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42) #Dividing the dataset into training and testing set in the ratio 70:30

In [17]:
#scaling the numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [18]:
# Define the MLP model architecture
model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

2024-02-26 07:36:29.780603: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [19]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model
model.fit(X_train_scaled, Y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
 39203/222405 [====>.........................] - ETA: 13:18 - loss: 0.0848 - accuracy: 0.9690

In [None]:
# Evaluate the model
Y_pred = model.predict(X_test_scaled)
Y_pred_binary = (Y_pred > 0.5).astype(int)

In [None]:
#Evaluation metrics
accuracy = accuracy_score(Y_test, Y_pred_binary)
precision = precision_score(Y_test, Y_pred_binary)
recall = recall_score(Y_test, Y_pred_binary)
f1 = f1_score(Y_test, Y_pred_binary)
roc_auc = roc_auc_score(Y_test, Y_pred)
conf_matrix = confusion_matrix(Y_test, Y_pred_binary)

In [None]:
#Printing the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)

In [None]:
# Plotting Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_test, y_pred)
plt.plot(recall, precision, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()