In [1]:
import pandas as pd #for data-wrangling(drop columns, filter rows, etc.)
import numpy as np #compute reconstruction errors (MSE)
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score
import tensorflow as tf #Base TensorFlow import; Keras runs on top of it
from tensorflow.keras.models import Model #lets you define the autoencoder as a graph
from tensorflow.keras.layers import Input, Dense 
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt #Matplotlib for plots

In [2]:
df = pd.read_csv(r"C:\Users\hp\OneDrive\Desktop\LP4\creditcard.csv")

X = df.drop(['Time', 'Class'], axis = 1).values
y = df['Class']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [3]:
input_dim = X_scaled.shape[1] #29
X_normal = X_scaled[y == 0]
X_train_normal, X_val_normal = train_test_split(X_normal, test_size=0.2, random_state=42)
print(f"Data ready. Input dimension: {input_dim} features.")
print(f"Training Autoencoder on {X_train_normal.shape[0]} normal transactions.")

Data ready. Input dimension: 29 features.
Training Autoencoder on 227452 normal transactions.


In [None]:
# #dataset has time, V1–V28 — PCA-transformed features for privacy, amount, class(label)

# # Load the dataset
# df = pd.read_csv(r"C:\Users\hp\OneDrive\Desktop\LP4\creditcard.csv")

# # 1. Separate features (X) and target (y)
# X = df.drop(['Time', 'Class'], axis=1)
# y = df['Class']

# # 2. Scale the data
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)
# input_dim = X_scaled.shape[1] # Number of features = 29 (V1 to V28 + Amount)
# # X_scaled.shape gives (rows, columns) = (number of samples, number of features)
# # X_scaled.shape[1] extracts the number of columns/features (here it’s 29).
# # You’ll use input_dim later to define the input layer of your autoencoder.

# # 3. Isolate NORMAL (non-fraudulent) transactions for training and validation
# X_normal = X_scaled[y == 0] # selects only the column where there is no fraud
# X_train_normal, X_val_normal = train_test_split(
#     X_normal,
#     test_size=0.2,
#     random_state=42
# )

# print(f"Data ready. Input dimension: {input_dim} features.")
# print(f"Training Autoencoder on {X_train_normal.shape[0]} normal transactions.")

Data ready. Input dimension: 29 features.
Training Autoencoder on 227452 normal transactions.


In [18]:
# Encoding
latent_dim = 14
intermediate_dim = 24
input_layer = Input(shape = (input_dim,), name = "Input_Layer")
encoded = Dense(latent_dim, activation = "relu", name = "Encoded_L1")(input_layer)
latent_representation = Dense(intermediate_dim, activation = "relu", name = "Latent_Representation")(encoded)

# Decoding
decoded = Dense(intermediate_dim, activation = "relu", name = "Decoder_L1")(latent_representation)
output_layer = Dense(input_dim, activation = "linear", name = "Output_Layer")(decoded)

autoencoder = Model(inputs = input_layer, outputs = output_layer, name = "Anomaly_Autoencoder")

In [22]:
adam = Adam(0.001)
autoencoder.compile(loss = "mse", optimizer = adam, metrics = ["accuracy"])
autoencoder.summary()

In [None]:
# Note that the input and output are identical (X_train_normal, X_train_normal),
# as the goal is self-reconstruction.
H = autoencoder.fit(X_train_normal, X_train_normal, epochs = 20, validation_data = (X_val_normal, X_val_normal), batch_size = 128, shuffle = True)

Epoch 1/20
[1m1777/1777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - accuracy: 0.3447 - loss: 0.4929 - val_accuracy: 0.4460 - val_loss: 0.3262
Epoch 2/20
[1m1777/1777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.4741 - loss: 0.2820 - val_accuracy: 0.5045 - val_loss: 0.2475
Epoch 3/20
[1m1777/1777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.5270 - loss: 0.2249 - val_accuracy: 0.5428 - val_loss: 0.2079
Epoch 4/20
[1m1777/1777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.5416 - loss: 0.2043 - val_accuracy: 0.5480 - val_loss: 0.1959
Epoch 5/20
[1m1777/1777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.5509 - loss: 0.1944 - val_accuracy: 0.5611 - val_loss: 0.1876
Epoch 6/20
[1m1777/1777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.5730 - loss: 0.1833 - val_accuracy: 0.5953 - val_loss: 0.1718
Epoch 7/20
[1m

In [None]:
# latent_dim = 14     # Bottleneck size (29 / 2)
# # This is where the model must store the most essential information to reconstruct the 
# # input later.
# intermediate_dim = 24 # a hidden layer that maps 29 features to 24

# # Define the ENCODER Network
# # Input Layer 
# input_layer = Input(shape=(input_dim,), name='Input_Layer') # note the comma
# # Declares the model will receive a 29-dimensional vector per transaction

# # Compressed Layer 1
# # A fully connected layer for compression: 29 → 24 with ReLU nonlinearity
# encoded = Dense(intermediate_dim, activation='relu', name='Encoder_L1')(input_layer)

# # Latent Representation (Bottleneck) 
# # selects the data that trully matters- noise is ignored
# latent_representation = Dense(latent_dim, activation='relu', name='Latent_Representation')(encoded)

# # 29 → 24 → 14
# print("Encoder defined.")

Encoder defined.


In [None]:
# # Define the DECODER Network
# # Decompressed Layer 1 (14->24) (Symmetrical to Encoder_L1)
# decoded = Dense(intermediate_dim, activation='relu', name='Decoder_L1')(latent_representation)

# # Output Layer (24->29) (Must match the Input Dimension)
# output_layer = Dense(input_dim, activation='linear', name='Output_Reconstruction')(decoded)

# # ---------------------------------------

# # Create the Full Autoencoder Model
# autoencoder = Model(inputs=input_layer, outputs=output_layer, name='Anomaly_Autoencoder')

# print("Decoder and Full Autoencoder Model defined.")

Decoder and Full Autoencoder Model defined.


In [None]:

# autoencoder.compile(
#     optimizer=Adam(learning_rate=0.001),
#     loss='mse', # Mean Squared Error is the metric for reconstruction quality
#     metrics=['accuracy']
# )

# # Display the model architecture
# autoencoder.summary()

In [None]:
# Get reconstructions for the entire scaled dataset (normal and fraud)
# returns models reconstructed data
reconstructions = autoencoder.predict(X_scaled)

# Calculate the Mean Squared Error (MSE) for each transaction
mse = np.mean(np.square(X_scaled - reconstructions), axis = 1)

# Store results in a DataFrame for easy analysis
error_df = pd.DataFrame({"Reconstruction_Error": mse, 'True_Class': y})

[1m8901/8901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1ms/step


In [None]:
# to decide how much error is too much
fraud_errors = error_df[error_df['True_Class'] == 1]
normal_errors = error_df[error_df['True_Class'] == 0]
print(fraud_errors.tail())
print("\n")
print(normal_errors.tail())

        Reconstruction_Error  True_Class
279863              5.554705           1
280143              2.921362           1
280149              2.993596           1
281144              5.711470           1
281674              0.024913           1


        Reconstruction_Error  True_Class
284802              0.167559           0
284803              0.139643           0
284804              0.020840           0
284805              0.589458           0
284806              0.122020           0


In [None]:
# Extract the normal (non-fraudulent) reconstruction errors
normal_error = error_df[error_df['True_Class'] == 0].Reconstruction_Error
# print(normal_error)

# 1. Set Anomaly Threshold
# Use the 95th percentile of the reconstruction error from NORMAL transactions
threshold = np.percentile(normal_error, 95)
print(f"Calculated Anomaly Threshold: {threshold:.6f}")

# 2. Predict anomalies for the entire dataset
# The prediction is TRUE (1 or Fraud) if the error is above the threshold
predicted_anomalies = error_df['Reconstruction_Error'] > threshold

Calculated Anomaly Threshold: 0.424017


In [36]:
print("\nConfusion Matrix")
print(confusion_matrix(error_df['True_Class'], predicted_anomalies))


Confusion Matrix
[[270099  14216]
 [    78    414]]


In [None]:
# Calculate and print Precision for the minority class
precision = precision_score(error_df['True_Class'], predicted_anomalies, pos_label = 1)

# Calculate and print Recall for the minority class (pos_label=1)
recall = recall_score(error_df['True_Class'], predicted_anomalies, pos_label = 1)

print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")

Precision: 2.83%
Recall: 84.15%


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [3]:
df = pd.read_csv(r"C:\Users\hp\OneDrive\Desktop\LP4\creditcard.csv")

X = df.drop(['Time', 'Class'], axis = 1).values
y = df['Class']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
input_dim = X_scaled.shape[1]
X_normal = X_scaled[y == 0]
X_train_normal, X_val_normal = train_test_split(X_normal, train_size=0.2, random_state=42)

In [12]:
latent_dim = 14
intermediate_dim = 24
input_layer = Input(shape=(input_dim,), name = "Input_Layer")
encoded = Dense(latent_dim, activation="relu", name = "Encoded_L1")(input_layer)
latent_representation = Dense(intermediate_dim, activation="relu", name = "Latent_Representation")(encoded)

decoded = Dense(intermediate_dim, activation="relu", name = "Decoded_L1")(latent_representation)
output_layer = Dense(input_dim, activation="linear", name = "Output_Layer")(decoded)

In [None]:
autoencoder = Model(inputs = input_layer, outputs = output_layer, name = "AnomalyAutoencoder")

adam = Adam(0.001)
autoencoder.compile(loss = "mse", optimizer=adam, metrics=["accuracy"])

H = autoencoder.fit(X_train_normal, X_train_normal, validation_data=(X_val_normal, X_val_normal), epochs = 20, batch_size = 128)

Epoch 1/20
[1m1777/1777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.3888 - loss: 0.4929 - val_accuracy: 0.5166 - val_loss: 0.3269
Epoch 2/20
[1m1777/1777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5417 - loss: 0.2912 - val_accuracy: 0.5504 - val_loss: 0.2748
Epoch 3/20
[1m1777/1777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.5580 - loss: 0.2544 - val_accuracy: 0.5774 - val_loss: 0.2266
Epoch 4/20
[1m1777/1777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5843 - loss: 0.2162 - val_accuracy: 0.5932 - val_loss: 0.2090
Epoch 5/20
[1m1777/1777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5917 - loss: 0.2070 - val_accuracy: 0.5911 - val_loss: 0.2057
Epoch 6/20
[1m1777/1777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5967 - loss: 0.1988 - val_accuracy: 0.5899 - val_loss: 0.1947
Epoch 7/20
[1m1

In [25]:
reconstructions = autoencoder.predict(X_scaled)
mse = np.mean(np.square(X_scaled - reconstructions), axis=1)
error_df = pd.DataFrame({"Reconstruction_Error": mse, "True_Class": y})

[1m8901/8901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 444us/step


In [26]:
fraud_errors = error_df[error_df["True_Class"] == 1]
normal_errors = error_df[error_df["True_Class"] == 0]

In [None]:
normal_error = error_df[error_df["True_Class"] == 0].Reconstruction_Error

In [29]:
threshold = np.percentile(normal_error, 95)
predicted_anomalies = error_df['Reconstruction_Error'] > threshold

In [30]:
print(confusion_matrix(error_df["True_Class"], predicted_anomalies))

[[270099  14216]
 [    71    421]]


In [32]:
precision = precision_score(error_df["True_Class"], predicted_anomalies)
print(f"Precision: {precision * 100:.4f}")

Precision: 2.8763


In [33]:
recall = recall_score(error_df["True_Class"], predicted_anomalies)
print(f"Recall: {recall*100:.2f}")

Recall: 85.57


In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input, Dense

In [16]:
df = pd.read_csv(r"C:\Users\hp\OneDrive\Desktop\LP4\creditcard.csv")

X = df.drop(['Time', 'Class'], axis = 1).values
y = df['Class']

scaled = StandardScaler()
X_scaled = scaled.fit_transform(X)

In [19]:
input_dim = X_scaled.shape[1]
X_normal = X_scaled[y == 0]
X_train_normal, X_val_normal = train_test_split(X_normal, train_size = 0.2, random_state = 42)

In [25]:
latent_dim = 14
intermediate_dim = 24
input_layer = Input(shape = (input_dim, ), name = "Input_Layer")
encoded = Dense(latent_dim, activation = 'relu', name = "Encoded_L1")(input_layer)
latent_representation = Dense(intermediate_dim, activation = "relu", name = "Latent_Representation")(encoded)
decoded = Dense(intermediate_dim, activation = "relu", name = "Decoded_L1")(latent_representation)
output_layer = Dense(input_dim, activation = "linear", name = "Output_Layer")(decoded)

In [29]:
autoencoder = Model(inputs = input_layer, outputs = output_layer)
adam = Adam(0.001)
autoencoder.compile(loss = "mse", optimizer = adam, metrics = ["accuracy"])
H = autoencoder.fit(X_train_normal, X_train_normal, validation_data = (X_val_normal, X_val_normal), epochs = 20, batch_size = 128)

Epoch 1/20
[1m445/445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.2428 - loss: 0.7334 - val_accuracy: 0.3694 - val_loss: 0.5426
Epoch 2/20
[1m445/445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.4140 - loss: 0.4737 - val_accuracy: 0.4500 - val_loss: 0.4111
Epoch 3/20
[1m445/445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.4626 - loss: 0.3752 - val_accuracy: 0.4806 - val_loss: 0.3394
Epoch 4/20
[1m445/445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.4891 - loss: 0.3197 - val_accuracy: 0.4972 - val_loss: 0.3010
Epoch 5/20
[1m445/445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.5153 - loss: 0.2880 - val_accuracy: 0.5290 - val_loss: 0.2764
Epoch 6/20
[1m445/445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.5381 - loss: 0.2656 - val_accuracy: 0.5414 - val_loss: 0.2594
Epoch 7/20
[1m445/445[0m 

In [36]:
reconstructions = autoencoder.predict(X_scaled)
mse = np.mean(np.square(X_scaled - reconstructions), axis = 1)
error_df = pd.DataFrame({"Reconstruction_Error": mse, "True_Class": y})

[1m8901/8901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 545us/step


In [39]:
fraud_errors = error_df[error_df["True_Class"] == 1]
normal_errors = error_df[error_df["True_Class"] == 0]
normal_error = error_df[error_df["True_Class"] == 0].Reconstruction_Error

In [42]:
threshold = np.percentile(normal_error, 95)
predicted_anomalies = error_df["Reconstruction_Error"] > threshold

In [44]:
print(confusion_matrix(error_df["True_Class"], predicted_anomalies))

[[270099  14216]
 [    71    421]]
