# Libraries

In [1]:
from datetime import datetime

# Wall-clock timestamp marking the start of the whole notebook run.
current_time = datetime.now()
print(f"Main Process Start date and time: {current_time}")

Main Process Start date and time: 2024-10-09 16:27:19.299885


In [2]:
# Uncomment to install imbalanced-learn (provides SMOTE) into the kernel's environment:
# %pip install imbalanced-learn

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import tensorflow as tf
from tensorflow.keras import layers, models

import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

import seaborn as sns
from pylab import rcParams
#from matplotlib import pyplot as plt
%matplotlib inline
# Global plotting defaults applied to every figure in the notebook.
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
rcParams['figure.figsize'] = 14, 8

# Seed for reproducibility.
RANDOM_SEED = 42
# Seeds Python's `random`, NumPy and TensorFlow in one call so that Keras
# weight initialisation and batch shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(RANDOM_SEED)

# Tick labels for the two transaction classes in the class-distribution plot.
LABELS = ["Normal", "Fraud"]

# Load Dataset

In [None]:
# Location of the credit-card fraud CSV, relative to the notebook's working directory.
# Alternative dataset: "Kaggle-credit-card-fraud-detection_18Sep23/creditcard_2023.csv"
DATA_PATH = "Kaggle-credit-card-fraud-detection-US_13Jul24/creditcard.csv"
df = pd.read_csv(DATA_PATH)


In [None]:
# Preview the first rows. A bare last expression uses the DataFrame's rich
# table rendering in Jupyter instead of the wrapped plain text from print().
df.head()

# Exploration

In [None]:
# (rows, columns) of the loaded dataset.
df.shape

In [None]:
# True if at least one cell anywhere in the frame is missing.
df.isna().to_numpy().any()

In [None]:
# Transactions per class. `pd.value_counts(...)` is deprecated, so the Series
# method is used instead. Sorting by class value (0, 1) keeps the bars aligned
# with LABELS regardless of which class happens to be more frequent.
count_classes = df['Class'].value_counts().sort_index()

fig, ax = plt.subplots()
count_classes.plot(kind='bar', rot=0, ax=ax)
ax.set_title("Transaction class distribution")
ax.set_xticks(range(len(LABELS)))
ax.set_xticklabels(LABELS)
ax.set_xlabel("Class")
ax.set_ylabel("Frequency");

We have a highly imbalanced dataset on our hands. Normal transactions overwhelm the fraudulent ones by a large margin. Let's look at the two types of transactions:

In [None]:
# Partition the rows by label value: Class == 1 -> frauds, Class == 0 -> normal.
is_fraud = df['Class'] == 1
is_normal = df['Class'] == 0
frauds = df[is_fraud]
normal = df[is_normal]

In [None]:
# (rows, columns) of the Class == 1 subset.
frauds.shape

In [None]:
# (rows, columns) of the Class == 0 subset.
normal.shape

# 1. Data Preprocessing

### Separate features and labels

In [None]:
# Target vector: the 'Class' column is the fraud label.
y = df['Class']
# Feature matrix: every remaining column.
X = df.drop(columns=['Class'])

In [None]:
# Confirm features and labels have the same number of rows.
print(f"X.shape: {X.shape}")
print(f"y.shape: {y.shape}")

### Split the data into training and testing sets
### (X_train, X_test, y_train, y_test)

In [None]:
# Hold out 30% of the rows for testing. `stratify=y` keeps the class ratio the
# same in both partitions; the seed comes from the shared RANDOM_SEED constant
# rather than a second hard-coded literal.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=RANDOM_SEED,
    stratify=y,
)

In [None]:
# Report the size of each partition produced by the split.
for _name, _part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{_name}.shape: {_part.shape}")

# 2. Handling Imbalanced Data with SMOTE (Moving SMOTE before Autoencoder)
### (X_train_balanced, y_train_balanced)

1. Over-sampling: SMOTE doesn't just duplicate existing minority class instances. Instead, it creates new synthetic samples by interpolating between existing minority class examples.
2. Synthetic Samples: For each instance in the minority class (fraud cases), SMOTE randomly selects one of its nearest neighbors and generates a synthetic point along the line connecting the two instances.

In [None]:
# Oversample on the training partition only; X_test / y_test are not passed
# to SMOTE, so the test set keeps its original class distribution.
sm = SMOTE(random_state=RANDOM_SEED)
X_train_balanced, y_train_balanced = sm.fit_resample(X_train, y_train)

# Sanity check: after resampling every class should have the same row count.
assert y_train_balanced.value_counts().nunique() == 1, "SMOTE did not balance the classes"

In [None]:
# (rows, columns) of the training features after SMOTE resampling.
X_train_balanced.shape

### Standardize the features after applying SMOTE
### (X_train_scaled, X_test_scaled)

Feature scaling is applied using StandardScaler, which standardizes the features by removing the mean and scaling them to unit variance.

In [None]:
# Learn the per-feature mean/variance from the balanced training data only,
# then apply that same transformation to both partitions.
scaler = StandardScaler().fit(X_train_balanced)
X_train_scaled = scaler.transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Feature count of the scaled training matrix and full shape of the scaled test matrix.
print(f"X_train_scaled.shape[1]: {X_train_scaled.shape[1]}")
print(f"X_test_scaled.shape: {X_test_scaled.shape}")

# 3. Unsupervised Learning with Autoencoder for Anomaly Detection
### (X_train_scaled2, X_test_scaled2)

In [None]:
def build_autoencoder(input_dim):
    """Build and compile a small dense autoencoder.

    Architecture: input_dim -> 16 -> 8 -> 16 -> input_dim, with ReLU on the
    hidden layers, compiled with the Adam optimizer and mean-squared-error loss.

    The output layer is linear. The notebook feeds this model standardized
    features, which are zero-centred and unbounded; a sigmoid output is
    confined to (0, 1) and could never reproduce negative or large values,
    which would inflate the reconstruction error of every sample.

    Args:
        input_dim: Number of features per sample (width of the input matrix).

    Returns:
        A compiled Keras ``Model`` mapping a feature vector to its reconstruction.
    """
    input_layer = layers.Input(shape=(input_dim,))

    # Encoder: compress down to an 8-unit bottleneck.
    encoder = layers.Dense(16, activation="relu")(input_layer)
    encoder = layers.Dense(8, activation="relu")(encoder)

    # Decoder: expand back to the original width with an unbounded output.
    decoder = layers.Dense(16, activation="relu")(encoder)
    decoder = layers.Dense(input_dim, activation="linear")(decoder)

    autoencoder = models.Model(inputs=input_layer, outputs=decoder)
    autoencoder.compile(optimizer='adam', loss='mse')
    return autoencoder

In [None]:
# Timestamp taken just before autoencoder training. `datetime` is already
# imported in the notebook's first cell, so it is not re-imported here.
current_time = datetime.now()
print("20 EPOCHS Start date and time:", current_time)

In [None]:
# Width of the scaled training matrix, i.e. the autoencoder's input size.
print(f"input_dim: {X_train_scaled.shape[1]}")

In [None]:
# One input unit per scaled feature column.
input_dim = X_train_scaled.shape[1]
autoencoder = build_autoencoder(input_dim)

# The autoencoder learns to reproduce its own input, so the features serve as
# both inputs and targets. The scaled test set supplies the validation loss.
autoencoder.fit(
    X_train_scaled,
    X_train_scaled,
    epochs=20,
    batch_size=256,
    validation_data=(X_test_scaled, X_test_scaled),
    verbose=1,
)

In [None]:
# Timestamp taken right after autoencoder training. `datetime` is already
# imported in the notebook's first cell, so it is not re-imported here.
current_time = datetime.now()
print("20 EPOCHS End date and time:", current_time)

### Generate reconstruction errors
### (train_reconstructions, train_mse, test_reconstructions, test_mse)

In [None]:
# Per-row reconstruction error on the training matrix: squared difference
# between input and autoencoder output, averaged across the feature axis.
train_reconstructions = autoencoder.predict(X_train_scaled)
train_mse = np.square(X_train_scaled - train_reconstructions).mean(axis=1)

In [None]:
# Per-row reconstruction error on the test matrix: squared difference
# between input and autoencoder output, averaged across the feature axis.
test_reconstructions = autoencoder.predict(X_test_scaled)
test_mse = np.square(X_test_scaled - test_reconstructions).mean(axis=1)

### Add the reconstruction error as a feature to the original dataset
### (X_train_scaled2, X_test_scaled2)

In [None]:
# Wrap the scaled training matrix in a DataFrame and append the per-row
# autoencoder reconstruction error as an extra feature column.
X_train_scaled2 = pd.DataFrame(X_train_scaled).assign(reconstruction_error=train_mse)

In [None]:
# Wrap the scaled test matrix in a DataFrame and append the per-row
# autoencoder reconstruction error as an extra feature column.
X_test_scaled2 = pd.DataFrame(X_test_scaled).assign(reconstruction_error=test_mse)

In [None]:
# Both frames should now be one column wider than the scaled matrices.
print(f"X_train_scaled2.shape: {X_train_scaled2.shape}")
print(f"X_test_scaled2.shape: {X_test_scaled2.shape}")

# 4. Unsupervised Learning with KMeans Clustering

In [None]:
# Fit on a plain NumPy array. X_train_scaled2 has integer column names plus
# the string 'reconstruction_error', and scikit-learn (>= 1.2) raises a
# TypeError for DataFrames whose column names mix types.
# 'cluster_label' is excluded (if already present) so this cell can be re-run
# after the labels have been attached to the frame.
kmeans_train_features = (
    X_train_scaled2
    .drop(columns=['cluster_label'], errors='ignore')
    .to_numpy()
)

# n_init is pinned because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=2, random_state=RANDOM_SEED, n_init=10)
kmeans.fit(kmeans_train_features)

### Add the cluster labels as features
### (X_train_scaled2, X_test_scaled2)

In [None]:
# Cluster assignments of the training rows come straight from the fitted model.
X_train_scaled2['cluster_label'] = kmeans.labels_

# Predict on every column except 'cluster_label', as a NumPy array:
#  * the frame's column names mix integers and strings, which scikit-learn
#    (>= 1.2) rejects with a TypeError when given a DataFrame;
#  * excluding 'cluster_label' keeps the feature count unchanged when this
#    cell is re-run after the column has already been added.
test_cluster_features = (
    X_test_scaled2
    .drop(columns=['cluster_label'], errors='ignore')
    .to_numpy()
)
X_test_scaled2['cluster_label'] = kmeans.predict(test_cluster_features)

# Concatenate the scaled features (Step 2) with the unsupervised features: reconstruction error (Step 3) and cluster label (Step 4)

In [None]:
# Final design matrices: the scaled features followed by the two engineered
# columns (autoencoder reconstruction error, KMeans cluster label).
extra_feature_columns = ['reconstruction_error', 'cluster_label']
X_train_combined = np.hstack(
    [X_train_scaled, X_train_scaled2[extra_feature_columns].to_numpy()]
)
X_test_combined = np.hstack(
    [X_test_scaled, X_test_scaled2[extra_feature_columns].to_numpy()]
)

# 5. Supervised Learning using Deep learning model

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Number of feature columns in the SMOTE-balanced training frame.
# NOTE(review): this rebinds `input_dim`, but the classifier defined below is
# sized from X_train_combined.shape[1], not from this value. Confirm whether
# this cell is still needed or should display the combined width instead.
input_dim=X_train_balanced.shape[1]
input_dim

In [None]:
# Shape of the training frame that carries the engineered columns.
print(f"X_train_scaled2.shape: {X_train_scaled2.shape}")


### Model architecture

In [None]:
# Feed-forward binary classifier over the combined feature matrix.
# An explicit Input layer declares the input width instead of passing
# `input_dim=` to the first Dense layer, which Keras 3 flags as deprecated
# for Sequential models. This matches how build_autoencoder declares its input.
model = Sequential([
    layers.Input(shape=(X_train_combined.shape[1],)),  # width of the combined features
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),  # Binary classification output
])

In [None]:
# Adam optimizer with binary cross-entropy loss; accuracy is reported per epoch.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Timestamp taken just before classifier training. `datetime` is already
# imported in the notebook's first cell, so it is not re-imported here.
current_time = datetime.now()
print("Model Training start:", current_time)

In [None]:
# Train
model.fit(X_train_combined, 
          y_train_balanced, 
          epochs=20, 
          batch_size=256, 
          validation_data=(X_test_combined, y_test), verbose=1)

In [None]:
# Timestamp taken right after classifier training. `datetime` is already
# imported in the notebook's first cell, so it is not re-imported here.
current_time = datetime.now()
print("Model Training end:", current_time)

### Make predictions

In [None]:
# Sigmoid outputs for the test rows, turned into hard 0/1 labels at a 0.5 cut-off.
test_probabilities = model.predict(X_test_combined)
y_pred = (test_probabilities > 0.5).astype("int32")

# 6. Model Evaluation

In [None]:
# Per-class precision / recall / F1 of the thresholded predictions.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

### ROC AUC Score

In [None]:
# ROC AUC is a ranking metric and must be computed from continuous scores.
# Thresholded 0/1 predictions collapse the ROC curve to a single operating
# point and misreport the area, so the model's probabilities are used here.
y_scores = model.predict(X_test_combined).ravel()
roc_score = roc_auc_score(y_test, y_scores)
print(f"ROC AUC Score: {roc_score}")

### Confusion Matrix

In [None]:
# Counts of true vs. predicted labels for the thresholded predictions.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

# 7. Plotting ROC Curve (Optional)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

In [None]:
# Score the matrix that has the layout the classifier was trained on:
# X_test_combined (scaled features + reconstruction error + cluster label).
# X_test_scaled lacks the two engineered columns, so its width does not match
# the model's input and predict() cannot run on it.
y_probas = model.predict(X_test_combined)
fpr, tpr, thresholds = roc_curve(y_test, y_probas)

In [None]:
# Draw the ROC curve on an explicit Axes; the legend shows the AUC value.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='blue', label=f'ROC curve (area = {roc_score:.2f})')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC)')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Timestamp marking the end of the whole notebook run. `datetime` is already
# imported in the notebook's first cell, so it is not re-imported here.
current_time = datetime.now()
print("Main Process End date and time:", current_time)