In [None]:
# Import required libraries
# pandas: For data manipulation and analysis
# numpy: For numerical computations
# tensorflow: For building and training the autoencoder model
import pandas as pd
import numpy as np
import tensorflow as tf

In [None]:
# Load the raw credit card transactions into a DataFrame.
# The CSV is looked up relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='creditcard.csv')

In [None]:
# Preview the first rows of the DataFrame.
# A short preview is enough to inspect the columns and value ranges without
# rendering the whole frame into the notebook output.
df.head()

In [None]:
# Check the dimensions of the dataset.
# `df.shape` is a tuple (number of rows, number of columns); as the last
# expression in the cell it is rendered as the cell's output.
df.shape

In [None]:
# Remove the 'Time' and 'Class' columns from the dataset.
# Time: treated here as not relevant for anomaly detection
# Class: labels are not used in unsupervised anomaly detection
#
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# have already been removed is a no-op instead of raising a KeyError.
df = df.drop(columns=['Time', 'Class'], errors='ignore')

In [None]:
# Preview the DataFrame after removing the Time and Class columns.
# The column headers of the preview are enough to verify that both columns
# are gone, without rendering the whole frame.
df.head()

In [None]:
# Import required preprocessing tools from scikit-learn
# StandardScaler: For standardizing the features by removing the mean and scaling to unit variance
# train_test_split: For splitting the dataset into training and testing sets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Preprocess the data:
# 1. Remove any rows with missing values
# 2. Split data into training (80%) and testing (20%) sets
# 3. Standardize the features (zero mean and unit variance)
#
# The scaler is fitted on the training rows only and then applied to the test
# rows, so the test set does not influence the mean/variance used for scaling
# (fitting on the full dataset before splitting leaks test statistics).
# random_state makes the split reproducible across runs.
# `df` stays a DataFrame (no in-place mutation, no rebinding to an array), so
# this cell can be re-run safely.
df = df.dropna()
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

scaler = StandardScaler()
x_train = scaler.fit_transform(train_df)   # learn mean/std from training data
x_test = scaler.transform(test_df)         # reuse the training statistics

In [None]:
# Check the shapes of the training and testing data.
# Showing both is what allows the split ratio to be verified; the second
# entry of each tuple is the number of features.
x_train.shape, x_test.shape

In [None]:
# Import necessary layers from Keras for building the autoencoder
# Sequential: For creating the model layer by layer
# Dense: Fully connected layer
# Dropout: For regularization to prevent overfitting
# Input: For specifying input shape
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout, Conv2D, MaxPooling2D, Input

In [None]:
# Build and train the autoencoder model

# Seed Python, NumPy and the Keras/TensorFlow backend so that weight
# initialisation, dropout masks and batch shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(42)

# Number of input features; the output layer must reproduce the same width.
n_features = x_train.shape[1]

# Create an autoencoder with symmetric encoder and decoder architecture
model = Sequential([
    # Encoder part
    Input(shape=(n_features,)),            # Input layer matching feature dimensions
    Dense(32, activation='relu'),          # Compress to 32 dimensions
    Dropout(0.2),                          # Regularization
    Dense(16, activation='relu'),          # Further compress to 16 dimensions
    Dropout(0.2),
    Dense(8, activation='relu'),           # Bottleneck layer - 8 dimensions

    # Decoder part - mirror the encoder architecture
    Dense(16, activation='relu'),          # Start expanding back
    Dropout(0.2),
    Dense(32, activation='relu'),          # Continue expanding
    Dense(n_features, activation='linear')  # Output layer matching input dimensions
])

# Configure the model for training
model.compile(optimizer='adam',            # Adam optimizer
              loss='mean_squared_error')   # MSE loss for reconstruction error

# Train the autoencoder
# Note: Input = Output in autoencoders
# NOTE(review): x_test doubles as the validation set here and is reused later
# for scoring; consider a separate validation split.
history = model.fit(x_train,
                    x_train,                # Same data for input and target
                    epochs=20,              # Number of passes over the training data
                    validation_data=(x_test, x_test),  # Validation data
                    batch_size=30,          # Number of samples per gradient update
                    shuffle=True)           # Shuffle data each epoch

# Plot training history
# `history.history` maps each metric name to its per-epoch values; passing it
# explicitly as `data=` plots one line per metric regardless of how the
# installed seaborn version interprets positional arguments.
import seaborn as sns
ax = sns.lineplot(data=history.history)
ax.set(xlabel='Epoch', ylabel='Loss (MSE)', title='Autoencoder Training History');

In [None]:
# Score every test sample by how badly the autoencoder reconstructs it.
# 1. Run the test data through the model to obtain the reconstructions
# 2. Take the element-wise difference between the inputs and reconstructions
# 3. Average the squared differences per sample (axis=1) to get one MSE per row
# A larger MSE marks a sample as a more likely anomaly.
predictions = model.predict(x_test)
residuals = x_test - predictions
mse = np.power(residuals, 2).mean(axis=1)

In [None]:
# Set the anomaly threshold at the 95th percentile of the reconstruction
# errors measured on the TRAINING data.
# Deriving the cut-off from the same test errors it is later applied to would
# flag ~5% of the test samples by construction, whatever the data looks like.
# Taking it from the training errors makes the number of flagged test samples
# an actual measurement.
# Adjust THRESHOLD_PERCENTILE based on the expected anomaly rate in your data.
THRESHOLD_PERCENTILE = 95

train_predictions = model.predict(x_train)
train_mse = np.mean(np.power(x_train - train_predictions, 2), axis=1)

threshold = np.percentile(train_mse, THRESHOLD_PERCENTILE)
threshold

In [None]:
# Build a boolean mask over the test samples:
#   True  -> reconstruction error strictly above the threshold (anomaly)
#   False -> reconstruction error at or below the threshold (normal)
anomalies = np.greater(mse, threshold)

In [None]:
# Report how many test transactions were flagged.
# Summing the boolean mask counts its True entries.
num_anomalies = anomalies.sum()
print(f"Number of Anomalies: {num_anomalies}")

In [None]:
# Plot each test sample's reconstruction error (MSE) against its index.
# - Dots: MSE for each sample (markers only, no connecting line)
# - Dashed red line: anomaly threshold
# Samples lying above the red line are the ones flagged as anomalies.
import matplotlib.pyplot as plt

dot_style = dict(marker='o', linestyle='', markersize=3)

plt.title('Anomaly Detection Results')
plt.plot(mse, label='MSE', **dot_style)
plt.axhline(y=threshold, color='r', linestyle='--', label='Anomaly Threshold')
plt.ylabel('MSE')
plt.xlabel('Sample Index')
plt.legend()
plt.show()

In [None]:
# Plot comparison of original vs reconstructed data for a normal transaction.
# "Normal" here means "not flagged by the threshold". The first test sample is
# not guaranteed to be normal, so the first unflagged sample is selected
# explicitly instead of hard-coding index 0.
normal_indices = np.flatnonzero(~anomalies)
assert normal_indices.size > 0, "No unflagged transactions available to plot"
normal_idx = normal_indices[0]

plt.figure(figsize=(12, 6))
plt.plot(x_test[normal_idx], label='Original Transaction')
plt.plot(predictions[normal_idx], label='Reconstructed Transaction')
plt.xlabel('Feature Index')
plt.ylabel('Normalized Value')
plt.legend()
plt.title(f'Normal Transaction (test sample {normal_idx}): Original vs Reconstructed')
plt.show()

In [None]:
# Display how many test transactions were flagged vs. not flagged.
# The 'Class' column was dropped before training, so no ground-truth labels
# are available in this cell: the matrix compares the threshold flags with
# themselves and is therefore always diagonal. The axes are labelled
# accordingly rather than as "True"/"Predicted" labels.
# NOTE(review): to obtain a real confusion matrix, keep the labels and split
# them together with the features, then pass the test labels as the first
# argument.
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

flag_names = ['Normal', 'Anomaly']
# Pin the label order so the matrix is always 2x2, even if nothing is flagged.
flag_counts = confusion_matrix(anomalies, anomalies, labels=[False, True])

sns.heatmap(flag_counts, annot=True, fmt='d',
            xticklabels=flag_names, yticklabels=flag_names)
plt.xlabel("Flagged by threshold", fontsize=14)
plt.ylabel("Flagged by threshold", fontsize=14)
plt.title("Flag Counts (no ground-truth labels)", fontsize=14)
plt.show()