In [None]:
# Install seaborn library
# seaborn: Statistical data visualization library
# - Built on top of matplotlib
# - Provides enhanced visualization capabilities
# - Used for creating informative statistical graphics
pip install seaborn

In [None]:
# Import required libraries for ECG anomaly detection
# Purpose: Set up all necessary Python libraries for building an autoencoder

# pandas (pd): Data manipulation and analysis
# - Used for reading and processing ECG data

# numpy (np): Numerical computing library
# - Used for array operations and mathematical calculations

# matplotlib.pyplot (plt): Plotting library
# - Used for visualizing ECG signals and results

# tensorflow (tf): Deep learning framework
# - Core library for building neural networks

# seaborn (sns): Statistical visualization
# - Enhanced plotting for statistical graphics

# Keras Components:
# Sequential: For creating layer-by-layer neural network
# Dense: Fully connected neural network layer
# Input: For specifying input shape
# Dropout: For preventing overfitting

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout

In [None]:
# Load the ECG recordings and tidy them up.
# The chain below performs, in order:
#   - read 'ecg.csv' into a DataFrame
#   - discard every row that contains a missing value
#   - remove the column named '1'
df = (
    pd.read_csv('ecg.csv')
    .dropna()
    .drop(columns=['1'])
)
df

In [None]:
# Standardize the ECG data
# Purpose: scale every feature to zero mean and unit variance, which helps
# neural-network training stability.
#
# The scaling statistics are learned from the training rows only. Fitting the
# scaler on the full dataset would let test-set statistics leak into training
# and into the validation loss reported while fitting the model.

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Preview the split to find out which rows will be used for training.
# NOTE: test_size and random_state must stay identical to the
# train_test_split call in the split cell that follows, so that both calls
# select exactly the same rows.
train_rows = train_test_split(df, test_size=0.2, random_state=42)[0]

scaler = StandardScaler()
scaler.fit(train_rows)       # learn mean / variance from the training rows only
df = scaler.transform(df)    # apply that same transformation to every row
df

In [None]:
# Partition the data into a training part and a testing part.
# - TEST_FRACTION: share of rows held out for testing (the remainder trains)
# - SPLIT_SEED: fixed seed so the partition is the same on every run
TEST_FRACTION = 0.2
SPLIT_SEED = 42
x_train, x_test = train_test_split(
    df,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
# Report the dimensions of both subsets (training first, then testing) to
# confirm the split ratio and the number of features per sample.
for subset in (x_train, x_test):
    print(subset.shape)

In [None]:
# Build Autoencoder Model
# Purpose: Create a symmetric autoencoder for anomaly detection

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialization and batch shuffling are repeatable across
# "Restart Kernel -> Run All". (Some GPU kernels may still be
# non-deterministic; this only fixes the random seeds.)
tf.keras.utils.set_random_seed(42)

# Width of one sample; the output layer must reproduce exactly this many values.
n_features = x_train.shape[1]

model = Sequential([
    # Input Layer
    Input(shape=(n_features,)),  # Match input shape to data dimensions

    # Encoder Layers
    Dense(64, activation='relu'),  # Compress to 64 dimensions
    Dense(32, activation='relu'),  # Further compress to 32 dimensions
    Dense(16, activation='relu'),  # Bottleneck layer - 16 dimensions

    # Decoder Layers - Mirror the encoder architecture
    Dense(32, activation='relu'),  # Start expanding back
    Dense(64, activation='relu'),  # Continue expanding
    Dense(n_features, activation='linear')  # Output layer matching input dimensions
])

# Compile the model:
# optimizer='adam': Adaptive learning rate optimization
# loss='mse': Mean Squared Error loss for reconstruction
model.compile(optimizer='adam', loss='mse')

# Display model architecture summary
model.summary()

In [None]:
# Fit the autoencoder.
# Input and target are both x_train: the network is trained to reproduce
# its own input.
fit_options = dict(
    epochs=50,                          # complete passes over the training data
    batch_size=30,                      # samples per gradient update
    validation_data=(x_test, x_test),   # test data, used to monitor the loss
    shuffle=True,                       # reshuffle the training data each epoch
)
model.fit(x_train, x_train, **fit_options)

In [None]:
# Visualize Training History
# Purpose: Plot training and validation loss over epochs
#
# model.history.history maps each metric name to its per-epoch values.
# It is wrapped in a DataFrame (one column per metric, one row per epoch) and
# passed through the explicit `data=` keyword, so the call does not depend on
# which parameter seaborn treats as the first positional argument.
loss_curves = pd.DataFrame(model.history.history)

ax = sns.lineplot(data=loss_curves)
ax.set(xlabel='Epoch', ylabel='Loss (MSE)', title='Training History')
plt.show()

In [None]:
# Reconstruction error and anomaly threshold.

# Reconstruct the test samples with the trained autoencoder.
predictions = model.predict(x_test)

# Per-sample mean squared error: square the element-wise residuals, then
# average over the feature axis (axis=1) to get one value per sample.
residuals = x_test - predictions
mse = np.power(residuals, 2).mean(axis=1)

# Cut-off at the 95th percentile of these errors: samples whose MSE exceeds
# it (the top 5%) are treated as anomalies.
threshold = np.percentile(mse, 95)
threshold

In [None]:
# Flag anomalous ECG samples.
# The element-wise comparison yields a boolean array:
#   True  -> reconstruction error above the threshold (anomaly)
#   False -> reconstruction error at or below the threshold (normal)
anomalies = np.greater(mse, threshold)

In [None]:
# Tally the flagged samples: every True in the boolean mask counts as one,
# so the sum is the number of anomalous samples.
num_anomalies = anomalies.sum()
print(f"Number of Anomalies: {num_anomalies}")

In [None]:
# Scatter of per-sample reconstruction error against the anomaly cut-off.
#   - dots: MSE of each sample
#   - dashed red line: anomaly threshold
#   - dots above the line are the samples flagged as anomalies

import matplotlib.pyplot as plot  # alias `plot` is not used in this cell; drawing goes through `plt`

fig, ax = plt.subplots()
ax.plot(mse, marker='o', linestyle='', markersize=3, label='MSE')
ax.axhline(threshold, color='r', linestyle='--', label='Anomaly Threshold')
ax.set_xlabel('Sample Index')
ax.set_ylabel('MSE')
ax.set_title('Anomaly Detection Results')
ax.legend()
plt.show()

In [None]:
# Visualize Normal ECG Reconstruction
# Purpose: Compare an original ECG signal with its reconstruction
#
# The plotted sample is the first test sample that was NOT flagged as an
# anomaly by the threshold, so the 'Normal ECG' title matches the data shown.
# Plotting x_test[0] unconditionally would mislabel the figure whenever
# sample 0 is itself flagged. "Normal" here means "not flagged by the
# detector"; it is not a ground-truth label.

normal_indices = np.flatnonzero(~anomalies)
# Fall back to sample 0 only if every sample was flagged.
sample_idx = int(normal_indices[0]) if normal_indices.size else 0

plt.figure(figsize=(12, 6))  # set figure dimensions
plt.plot(x_test[sample_idx], label='Original ECG')
plt.plot(predictions[sample_idx], label='Reconstructed ECG')
plt.xlabel('Time')
plt.ylabel('Amplitude')
plt.legend()
plt.title('Normal ECG')
plt.show()

In [None]:
# Create Confusion Matrix
# Purpose: Visualize the classification results
#
# NOTE(review): both arguments are the detector's own flags, so every sample
# lands on the diagonal; the matrix only shows how many samples were flagged
# versus not flagged. Measuring detection quality would require ground-truth
# labels as the first argument.

from sklearn.metrics import confusion_matrix, classification_report

# Fix the class order so the matrix is always 2x2 (index 0 = not flagged,
# index 1 = flagged), even if one of the two classes happens to be absent.
class_flags = [False, True]
class_names = ['Normal', 'Anomaly']
matrix = confusion_matrix(anomalies, anomalies, labels=class_flags)

# Create and plot confusion matrix
sns.heatmap(
    matrix,
    annot=True,
    fmt='d',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted label", fontsize=14)
plt.ylabel("True label", fontsize=14)
plt.title("Confusion Matrix", fontsize=14)
plt.show()