In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Example dataset: creditcard.csv (Kaggle)
data = pd.read_csv("dataset/creditcardfraud-csv/creditcard.csv")

print("Dataset shape:", data.shape)
print(data["Class"].value_counts())

# Separate features and labels
X = data.drop("Class", axis=1)
y = data["Class"]

# Scale data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train only on NORMAL (Class = 0) samples
X_normal = X_scaled[y == 0]

# Split normal data for training & validation
X_train, X_val = train_test_split(X_normal, test_size=0.2, random_state=42)

print("Normal train shape:", X_train.shape)
print("Normal val shape:", X_val.shape)

Dataset shape: (284807, 31)
Class
0    284315
1       492
Name: count, dtype: int64
Normal train shape: (227452, 30)
Normal val shape: (56863, 30)


In [3]:
# c. Encoder converts input → latent representation

input_dim = X_train.shape[1]
encoding_dim = 16   # latent dimension

input_layer = layers.Input(shape=(input_dim,))

# Encoder network
encoder = layers.Dense(32, activation="relu")(input_layer)
encoder = layers.Dense(16, activation="relu")(encoder)
latent = layers.Dense(encoding_dim, activation="relu")(encoder)

In [4]:
# d. Decoder converts latent → reconstruct original

decoder = layers.Dense(16, activation="relu")(latent)
decoder = layers.Dense(32, activation="relu")(decoder)
output_layer = layers.Dense(input_dim, activation="linear")(decoder)

# Autoencoder model
autoencoder = models.Model(inputs=input_layer, outputs=output_layer)

autoencoder.summary()

In [5]:
# e. Compile the model with Optimizer, Loss, Metrics

autoencoder.compile(
    optimizer="adam",
    loss="mse",           # reconstruction loss
    metrics=["mae"]
)

# Train the Autoencoder
history = autoencoder.fit(
    X_train, X_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, X_val),
    verbose=1
)

Epoch 1/10
[1m3554/3554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 0.4288 - mae: 0.4353 - val_loss: 0.2838 - val_mae: 0.3511
Epoch 2/10
[1m3554/3554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 0.2252 - mae: 0.3079 - val_loss: 0.1925 - val_mae: 0.2720
Epoch 3/10
[1m3554/3554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 0.1727 - mae: 0.2588 - val_loss: 0.1599 - val_mae: 0.2364
Epoch 4/10
[1m3554/3554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 0.1364 - mae: 0.2220 - val_loss: 0.1287 - val_mae: 0.1973
Epoch 5/10
[1m3554/3554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - loss: 0.1149 - mae: 0.2002 - val_loss: 0.1026 - val_mae: 0.1953
Epoch 6/10
[1m3554/3554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - loss: 0.0951 - mae: 0.1863 - val_loss: 0.0860 - val_mae: 0.1766
Epoch 7/10
[1m3554/3554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12

In [6]:

# Evaluate reconstruction error on test data

# 1. Compute threshold using ONLY normal validation data
val_recon = autoencoder.predict(X_val)
val_mse = np.mean(np.power(X_val - val_recon, 2), axis=1)

# Correct threshold
threshold = np.percentile(val_mse, 95)
print("\nCorrect Reconstruction Error Threshold:", threshold)

# 2. Compute reconstruction error for ALL samples
reconstructions = autoencoder.predict(X_scaled)
mse = np.mean(np.power(X_scaled - reconstructions, 2), axis=1)

# 3. Add reconstruction error to dataset
data["Reconstruction_Error"] = mse

# 4. Predict anomalies (fraud = 1)
data["Predicted_Class"] = (mse > threshold).astype(int)

# 5. Confusion Matrix
print("\nConfusion Matrix:")
print(pd.crosstab(data["Class"], data["Predicted_Class"]))


[1m1777/1777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 943us/step

Correct Reconstruction Error Threshold: 0.2548403146955442
[1m8901/8901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 902us/step

Confusion Matrix:
Predicted_Class       0      1
Class                         
0                270052  14263
1                    78    414
