In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score, precision_score

# --- Configuration -----------------------------------------------------------
RANDOM_SEED = 2021            # seed value for reproducibility (the train/test split uses the same value)
TEST_PCT = 0.3                # NOTE(review): the split further down passes test_size=0.2 directly, not this value
LABELS = ['Normal', 'Fraud']  # display names for Class 0 and Class 1
DATA_PATH = "Learning.csv"    # resolved relative to the notebook's working directory

# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation, dropout masks and batch shuffling repeat across runs.
tf.keras.utils.set_random_seed(RANDOM_SEED)

# Load dataset
dataset = pd.read_csv(DATA_PATH)

# Check for nulls
print("Any nulls in the dataset:", dataset.isnull().values.any())

# Check unique labels (dropna=False keeps a missing label counted as its own value)
print("No. of unique labels:", dataset["Class"].nunique(dropna=False))

# Breakdown of transaction types.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
print("Breakdown of the Normal and Fraud Transactions:")
print(dataset["Class"].value_counts(sort=True))

# Class distribution plot
# Bars are drawn in value_counts order (most frequent class first), so the tick
# labels must come from the same index; Series.unique() returns values in order
# of first appearance and could attach the wrong label to a bar.
count_classes = dataset["Class"].value_counts(sort=True)
count_classes.plot(kind='bar', rot=0)
plt.xticks(range(len(count_classes)), count_classes.index)
plt.title("Frequency by Observation Number")
plt.xlabel("Class")
plt.ylabel("Number of Observations")
plt.show()

# Partition the rows by class so the two populations can be compared
normal_dataset = dataset.loc[dataset["Class"] == 0]
fraud_dataset = dataset.loc[dataset["Class"] == 1]

# Overlay density-normalised histograms of the transaction amount.
# 100 evenly spaced bin edges between 200 and 2500; amounts outside that
# range are not drawn.
bins = np.linspace(200, 2500, 100)
amount_fig, amount_ax = plt.subplots(figsize=(10, 6))
for frame, opacity, tag in (
    (normal_dataset, 1, 'Normal'),
    (fraud_dataset, 0.5, 'Fraud'),
):
    amount_ax.hist(frame["Amount"], bins=bins, alpha=opacity, density=True, label=tag)
amount_ax.legend(loc='upper right')
amount_ax.set_title('Transaction Amount vs Percentage of Transactions')
amount_ax.set_xlabel('Transaction Amount USD')
amount_ax.set_ylabel('Percentage of Transactions')
plt.show()

# Prepare feature and label arrays.
# Columns are selected by name so the code does not depend on "Class" being the
# last column. float64 is forced so the in-place scaling below cannot be
# truncated by an integer dtype. `dataset` itself is left unmodified, which
# keeps this step safe to re-run.
features = dataset.drop(columns="Class")
data = features.to_numpy(dtype=np.float64)
labels = dataset["Class"].to_numpy()

# Train/test split (20% held out). RANDOM_SEED keeps the split reproducible;
# TEST_PCT from the configuration is not used here.
train_data, test_data, train_labels, test_labels = train_test_split(
    data, labels, test_size=0.2, random_state=RANDOM_SEED
)

# Standardize Time and Amount.
# Each scaler is fitted on the TRAINING rows only and then applied to both
# splits, so no statistics from the test set leak into preprocessing.
for column_name in ("Time", "Amount"):
    column = [features.columns.get_loc(column_name)]  # list index keeps the 2-D shape sklearn expects
    sc = StandardScaler().fit(train_data[:, column])
    train_data[:, column] = sc.transform(train_data[:, column])
    test_data[:, column] = sc.transform(test_data[:, column])

# Normalize data.
# min_val / max_val are single scalars taken over the whole training matrix
# (all rows and all feature columns); the test split reuses them.
min_val = tf.reduce_min(train_data)
max_val = tf.reduce_max(train_data)
train_data = (train_data - min_val) / (max_val - min_val)
test_data = (test_data - min_val) / (max_val - min_val)
train_data = tf.cast(train_data, tf.float32)
test_data = tf.cast(test_data, tf.float32)

# Turn the labels into boolean masks: True marks a fraud row, False a normal one
train_labels = train_labels.astype(bool)
test_labels = test_labels.astype(bool)

# Slice each split into its normal and fraud rows using the masks
normal_train_data, fraud_train_data = train_data[~train_labels], train_data[train_labels]
normal_test_data, fraud_test_data = test_data[~test_labels], test_data[test_labels]

# Report the size of every subset
for caption, subset in (
    ("No. of records in Fraud Train Data:", fraud_train_data),
    ("No. of records in Normal Train Data:", normal_train_data),
    ("No. of records in Fraud Test Data:", fraud_test_data),
    ("No. of records in Normal Test Data:", normal_test_data),
):
    print(caption, len(subset))

# Autoencoder model definition
# Layer widths: input -> 14 -> 32 -> 16 -> 32 -> 14 -> input, with dropout
# after the first encoder layer and after the first decoder layer.
Dense = tf.keras.layers.Dense
Dropout = tf.keras.layers.Dropout

input_dim = data.shape[1]
encoding_dim, hidden_dim1, hidden_dim2 = 14, 32, 16

input_layer = tf.keras.layers.Input(shape=(input_dim,))

# Encoder half: thread the input through each layer in turn
encoder = input_layer
for layer in (
    Dense(encoding_dim, activation="tanh"),
    Dropout(0.2),
    Dense(hidden_dim1, activation="relu"),
    Dense(hidden_dim2, activation=tf.nn.leaky_relu),
):
    encoder = layer(encoder)

# Decoder half: starts from the encoder output and ends at the input width
decoder = encoder
for layer in (
    Dense(hidden_dim1, activation="relu"),
    Dropout(0.2),
    Dense(encoding_dim, activation="relu"),
    Dense(input_dim, activation="tanh"),
):
    decoder = layer(decoder)

autoencoder = tf.keras.Model(inputs=input_layer, outputs=decoder)
autoencoder.summary()

# Callbacks: persist the best model seen so far and stop once val_loss stalls.
# NOTE(review): with epochs=10 below, patience=10 looks unreachable, so early
# stopping is unlikely to ever fire — confirm the intended values.
cp = tf.keras.callbacks.ModelCheckpoint(
    filepath="autoencoder_fraud.h5",
    monitor='val_loss',
    mode='min',
    verbose=2,
    save_best_only=True,
)
earlystop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.0001,
    patience=10,
    verbose=1,
    mode='min',
    restore_best_weights=True,
)

autoencoder.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=[tf.keras.metrics.MeanSquaredError()],
)

# Train on normal transactions only: the network learns to reconstruct them.
# NOTE(review): the full test split (normal and fraud rows) doubles as the
# validation set, so it also drives the checkpoint / early-stopping decisions.
training_run = autoencoder.fit(
    normal_train_data,
    normal_train_data,
    epochs=10,
    batch_size=32,
    shuffle=True,
    validation_data=(test_data, test_data),
    verbose=1,
    callbacks=[cp, earlystop],
)
history = training_run.history

# Training vs. validation loss per epoch
loss_fig, loss_ax = plt.subplots()
for history_key, curve_label in (("loss", "Train"), ("val_loss", "Test")):
    loss_ax.plot(history[history_key], linewidth=2, label=curve_label)
loss_ax.legend(loc="upper right")
loss_ax.set_title("Model Loss")
loss_ax.set_ylabel("Loss")
loss_ax.set_xlabel("Epoch")
plt.show()

# Reconstruct the test split and score each row by its mean squared error
test_x_predictions = autoencoder.predict(test_data)
squared_residuals = np.square(test_data - test_x_predictions)
mse = np.mean(squared_residuals, axis=1)  # one error value per test row
error_df = pd.DataFrame({'Reconstruction_error': mse, 'True_class': test_labels})

# Scatter the per-row reconstruction error, coloured by true class, with the
# decision threshold drawn as a horizontal line.
threshold_fixed = 0.00005
fig, ax = plt.subplots(figsize=(12, 6))
for is_fraud, members in error_df.groupby('True_class'):
    class_label = "Fraud" if is_fraud else "Normal"
    ax.scatter(
        members.index,
        members["Reconstruction_error"],
        marker="o",
        s=20,
        label=class_label,
        alpha=0.6,
    )
ax.axhline(y=threshold_fixed, color="green", linewidth=2, linestyle='--', label="Threshold")
ax.legend()
ax.set_title('Reconstruction Error for Normal and Fraud Data', fontsize=14)
ax.set_ylabel('Reconstruction Error', fontsize=12)
ax.set_xlabel('Data Point Index', fontsize=12)
# Leave 10% headroom above whichever is larger: the worst error or the threshold
y_upper = max(error_df["Reconstruction_error"].max(), threshold_fixed) * 1.1
ax.set_ylim([0, y_upper])
plt.show()

# Classify each row: error strictly above the threshold -> 1 (fraud), else 0
error_df["pred"] = [
    int(e > threshold_fixed) for e in error_df["Reconstruction_error"].to_numpy()
]

# Confusion matrix heatmap (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(error_df["True_class"], error_df["pred"])
cm_fig, cm_ax = plt.subplots(figsize=(4, 4))
sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt="d", ax=cm_ax)
cm_ax.set_title("Confusion matrix")
cm_ax.set_ylabel("True class")
cm_ax.set_xlabel("Predicted class")
plt.show()

# Summary metrics at the chosen threshold
for metric_caption, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Recall:", recall_score),
    ("Precision:", precision_score),
):
    print(metric_caption, metric_fn(error_df["True_class"], error_df["pred"]))


FileNotFoundError: [Errno 2] No such file or directory: 'Learning.csv'

In [2]:
!pip install seaborn 

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
