ANOMALY DETECTION PROJECT TEMPLATE
===================================
Use Cases: Fraud Detection, Network Intrusion, Equipment Failure Prediction

# 1. PROJECT SETUP & ENVIRONMENT

## 1.1 Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# ML libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.svm import OneClassSVM
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import (
    classification_report, confusion_matrix, 
    precision_recall_curve, roc_auc_score, roc_curve,
    f1_score, precision_score, recall_score
)

# Deep Learning
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [None]:
# Make runs repeatable: seed the NumPy and PyTorch random number generators
torch.manual_seed(42)
np.random.seed(42)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"Using device: {device}")

## 1.2 Configuration

In [None]:
# Settings shared by the cells below (data location, model and training knobs)
CONFIG = dict(
    data_path='data.csv',
    contamination=0.05,  # Expected proportion of anomalies
    test_size=0.2,
    batch_size=128,
    learning_rate=0.001,
    num_epochs=100,
    latent_dim=8,
    threshold_percentile=95,
    random_seed=42,
)

# 2. DATA LOADING & EXPLORATION

## 2.1 Load Data

In [None]:
# Read the CSV named in the configuration into a DataFrame
csv_path = CONFIG['data_path']
df = pd.read_csv(csv_path)

# Quick orientation: overall size, per-column dtypes, then a preview of the rows
print(f"Dataset shape: {df.shape}")
print("\nColumn types:", df.dtypes, sep="\n")
print("\nFirst few rows:")
df.head()

## 2.2 Exploratory Data Analysis

In [None]:
# Summary statistics as computed by DataFrame.describe()
summary_stats = df.describe()
print(summary_stats)

# Number of missing entries in each column
missing_per_column = df.isnull().sum()
print(f"\nMissing values:\n{missing_per_column}")

# Number of rows that exactly repeat an earlier row
duplicate_count = df.duplicated().sum()
print(f"\nDuplicate rows: {duplicate_count}")

In [None]:
# Distribution of features: one histogram per numeric column, four per row
numeric_cols = df.select_dtypes(include=[np.number]).columns

# Round the row count up and size the figure from that same row count, so every
# row gets 5 inches of height. At least one row is always requested: with no
# numeric columns the figure would otherwise have zero height, which
# matplotlib rejects.
plots_per_row = 4
n_plot_rows = max(1, (len(numeric_cols) + plots_per_row - 1) // plots_per_row)

fig, axes = plt.subplots(
    n_plot_rows, plots_per_row,
    figsize=(20, 5 * n_plot_rows),
    squeeze=False,  # always a 2-D array, so ravel() below is uniform
)
axes = axes.ravel()

for ax, col in zip(axes, numeric_cols):
    ax.hist(df[col].dropna(), bins=50, edgecolor='black')
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

# Hide the unused cells at the end of the grid
for ax in axes[len(numeric_cols):]:
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as a heatmap
correlation_matrix = df[numeric_cols].corr()

plt.figure(figsize=(12, 10))
heatmap_ax = sns.heatmap(correlation_matrix, cmap='coolwarm', center=0, annot=False)
heatmap_ax.set_title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

In [None]:
# Box plots for outlier visualization: one per numeric column, four per row

# Round the row count up and size the figure from that same row count, so every
# row gets 5 inches of height. At least one row is always requested: with no
# numeric columns the figure would otherwise have zero height, which
# matplotlib rejects.
plots_per_row = 4
n_plot_rows = max(1, (len(numeric_cols) + plots_per_row - 1) // plots_per_row)

fig, axes = plt.subplots(
    n_plot_rows, plots_per_row,
    figsize=(20, 5 * n_plot_rows),
    squeeze=False,  # always a 2-D array, so ravel() below is uniform
)
axes = axes.ravel()

for ax, col in zip(axes, numeric_cols):
    ax.boxplot(df[col].dropna())
    ax.set_title(f'Box Plot: {col}')
    ax.set_ylabel(col)

# Hide the unused cells at the end of the grid
for ax in axes[len(numeric_cols):]:
    ax.axis('off')

plt.tight_layout()
plt.show()

# 3. DATA PREPROCESSING

## 3.1 Handle Missing Values

In [None]:
# Impute each numeric column with its own median; columns without a numeric
# median are left as they are
numeric_medians = df.median(numeric_only=True)
df_clean = df.fillna(value=numeric_medians)

# Alternative for time series: carry the last observation forward
# df_clean = df.ffill()

remaining_missing = df_clean.isnull().sum().sum()
print(f"Missing values after cleaning: {remaining_missing}")

## 3.2 Feature Engineering

In [None]:
def create_features(df, exclude=('label',)):
    """Add rolling, lag and difference features for every numeric column.

    For each numeric column ``c`` that is not listed in ``exclude`` the
    following columns are added:

    * ``c_rolling_mean_3`` / ``c_rolling_std_3`` - mean and standard deviation
      over a window of up to 3 rows ending at the current row
    * ``c_lag_1`` - the value from the previous row
    * ``c_diff`` - the change from the previous row

    Args:
        df: Input DataFrame. It is not modified; a copy is returned.
        exclude: Column name, or iterable of column names, to skip. Defaults
            to ``('label',)`` so that no features are derived from the target
            column, which would leak the answer into the feature matrix.

    Returns:
        A new DataFrame with the original columns plus the derived ones.
        Every remaining NaN (for example the first row of the lag, diff and
        rolling-std columns) is replaced with 0.
    """
    df = df.copy()

    # Treat a bare string as a single column name, not as a set of characters
    if isinstance(exclude, str):
        exclude = (exclude,)
    skipped = set(exclude)

    # Fixed up front so the derived columns are not themselves expanded
    numeric_cols = [
        col for col in df.select_dtypes(include=[np.number]).columns
        if col not in skipped
    ]

    for col in numeric_cols:
        series = df[col]

        # Rolling statistics
        window = series.rolling(window=3, min_periods=1)
        df[f'{col}_rolling_mean_3'] = window.mean()
        df[f'{col}_rolling_std_3'] = window.std()

        # Lag feature
        df[f'{col}_lag_1'] = series.shift(1)

        # Difference
        df[f'{col}_diff'] = series.diff()

    return df.fillna(0)

# Derive the extra features from the cleaned frame and report the new column count
df_engineered = create_features(df=df_clean)
engineered_width = df_engineered.shape[1]
print(f"Features after engineering: {engineered_width}")

## 3.3 Scaling

In [None]:
# Separate features and (if available) labels
feature_cols = df_engineered.select_dtypes(include=[np.number]).columns.tolist()

# If you have labels (supervised case)
if 'label' in df_engineered.columns:
    # Keep the target, and the rolling/lag/diff columns that feature
    # engineering may have derived from it, out of the feature matrix: they
    # encode the answer the detectors are later scored against. Filtering the
    # list (rather than dropping 'label' from the frame) also works when the
    # label column is non-numeric and therefore never entered feature_cols.
    label_related = {
        'label',
        'label_rolling_mean_3',
        'label_rolling_std_3',
        'label_lag_1',
        'label_diff',
    }
    feature_cols = [col for col in feature_cols if col not in label_related]
    X = df_engineered[feature_cols]
    y = df_engineered['label']
    supervised = True
else:
    X = df_engineered[feature_cols]
    y = None
    supervised = False

# Scale features
# NOTE(review): the scaler is fitted on every row, including rows that end up
# in the test split. Fit it on the training rows only if strict separation
# between train and test is required.
scaler = RobustScaler()  # Robust to outliers
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

print(f"Feature matrix shape: {X_scaled.shape}")

# 4. TRAIN-TEST SPLIT

In [None]:
if not supervised:
    # No labels to stratify on: keep the row order and cut once, so the
    # leading (1 - test_size) share of rows trains and the remainder tests
    n_train_rows = int(len(X_scaled) * (1 - CONFIG['test_size']))
    X_train, X_test = X_scaled.iloc[:n_train_rows], X_scaled.iloc[n_train_rows:]
    y_train = y_test = None
else:
    # Labelled data: stratified split on y with a fixed random state
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled,
        y,
        stratify=y,
        random_state=CONFIG['random_seed'],
        test_size=CONFIG['test_size'],
    )

print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")

# 5. UNSUPERVISED ANOMALY DETECTION METHODS

## 5.1 Isolation Forest

In [None]:
# Build the forest and fit it on the training split
iso_forest = IsolationForest(
    n_estimators=100,
    contamination=CONFIG['contamination'],
    random_state=CONFIG['random_seed'],
)
iso_forest.fit(X_train)

# predict() answers +1 for inliers and -1 for outliers; recode that to the
# 0 = normal / 1 = anomaly convention used in the rest of the notebook
iso_pred_train = np.where(iso_forest.predict(X_train) == -1, 1, 0)
iso_pred_test = np.where(iso_forest.predict(X_test) == -1, 1, 0)

print(f"Isolation Forest - Training anomalies: {iso_pred_train.sum()}")
print(f"Isolation Forest - Test anomalies: {iso_pred_test.sum()}")

## 5.2 One-Class SVM

In [None]:
# RBF One-Class SVM; nu is set from the configured contamination rate
ocsvm = OneClassSVM(kernel='rbf', gamma='auto', nu=CONFIG['contamination'])
ocsvm.fit(X_train)

# predict() answers +1 (inlier) or -1 (outlier); recode to 0 = normal, 1 = anomaly
ocsvm_pred_train = np.where(ocsvm.predict(X_train) == -1, 1, 0)
ocsvm_pred_test = np.where(ocsvm.predict(X_test) == -1, 1, 0)

print(f"One-Class SVM - Training anomalies: {ocsvm_pred_train.sum()}")
print(f"One-Class SVM - Test anomalies: {ocsvm_pred_test.sum()}")

## 5.3 Local Outlier Factor

In [None]:
# novelty=True makes predict()/score_samples() usable on data the model was
# not fitted on
lof = LocalOutlierFactor(contamination=CONFIG['contamination'], novelty=True)
lof.fit(X_train)

# In novelty mode predict() must not be called on the training samples
# themselves: each point would count as its own neighbour and the result is
# wrong. The fitted model already stores the training scores, so flag training
# anomalies the way scikit-learn does internally: score below the fitted offset.
lof_pred_train = (lof.negative_outlier_factor_ < lof.offset_).astype(int)

# Test rows are unseen data, so predict() is valid here.
# Recode -1 (outlier) / +1 (inlier) to 1 = anomaly / 0 = normal.
lof_pred_test = (lof.predict(X_test) == -1).astype(int)

print(f"LOF - Training anomalies: {lof_pred_train.sum()}")
print(f"LOF - Test anomalies: {lof_pred_test.sum()}")

# 6. AUTOENCODER FOR ANOMALY DETECTION

## 6.1 Autoencoder Architecture

In [None]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder.

    The encoder narrows ``input_dim -> 128 -> 64 -> 32 -> latent_dim`` and the
    decoder mirrors it back to ``input_dim``. ReLU follows every hidden layer
    and dropout (p=0.2) follows the 128- and 64-unit layers on both sides.
    """

    def __init__(self, input_dim, latent_dim=8):
        super().__init__()

        # Compress the input down to the latent code
        encoder_layers = [
            nn.Linear(input_dim, 128), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(128, 64), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(64, 32), nn.ReLU(),
            nn.Linear(32, latent_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # Expand the latent code back to the input width (no final activation)
        decoder_layers = [
            nn.Linear(latent_dim, 32), nn.ReLU(),
            nn.Linear(32, 64), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(64, 128), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(128, input_dim),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the reconstruction of ``x`` (same trailing dimension as ``x``)."""
        return self.decoder(self.encoder(x))

# Size the network to the number of feature columns and place it on the chosen device
input_dim = X_train.shape[1]
autoencoder = Autoencoder(input_dim=input_dim, latent_dim=CONFIG['latent_dim'])
autoencoder = autoencoder.to(device)

print(autoencoder)

## 6.2 Variational Autoencoder (VAE)

In [None]:
class VAE(nn.Module):
    """Variational autoencoder built from fully connected layers.

    The encoder maps ``input_dim -> 128 -> 64`` and then produces a mean and a
    log-variance vector of size ``latent_dim``. The decoder maps a latent
    sample through ``64 -> 128 -> input_dim``.
    """

    def __init__(self, input_dim, latent_dim=8):
        super().__init__()

        # Encoder trunk
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)

        # Two heads on the trunk: latent mean and latent log-variance
        self.fc_mu = nn.Linear(64, latent_dim)
        self.fc_logvar = nn.Linear(64, latent_dim)

        # Decoder
        self.fc3 = nn.Linear(latent_dim, 64)
        self.fc4 = nn.Linear(64, 128)
        self.fc5 = nn.Linear(128, input_dim)

    def encode(self, x):
        """Return ``(mu, logvar)`` of the latent distribution for ``x``."""
        hidden = torch.relu(self.fc2(torch.relu(self.fc1(x))))
        mu = self.fc_mu(hidden)
        logvar = self.fc_logvar(hidden)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        """Draw ``mu + eps * std`` with ``eps`` standard normal noise."""
        std = (0.5 * logvar).exp()
        noise = torch.randn_like(std)
        return mu + noise * std

    def decode(self, z):
        """Map a latent sample ``z`` back to input space (no final activation)."""
        hidden = torch.relu(self.fc4(torch.relu(self.fc3(z))))
        return self.fc5(hidden)

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for ``x``."""
        mu, logvar = self.encode(x)
        latent = self.reparameterize(mu, logvar)
        return self.decode(latent), mu, logvar

## 6.3 Training Autoencoder

In [None]:
# Convert to tensors
X_train_tensor = torch.FloatTensor(X_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test.values).to(device)

# Training setup
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(autoencoder.parameters(), lr=CONFIG['learning_rate'])

# Mini-batch size comes from the configuration. Without batching, each epoch
# is a single gradient step over the whole training set.
n_train_samples = X_train_tensor.shape[0]
batch_size = max(1, int(CONFIG['batch_size']))

# Per-epoch loss history (used for plotting)
train_losses = []
test_losses = []

for epoch in range(CONFIG['num_epochs']):
    # Training: one pass over the data in freshly shuffled mini-batches
    autoencoder.train()
    shuffled = torch.randperm(n_train_samples, device=device)
    running_loss = 0.0

    for start in range(0, n_train_samples, batch_size):
        batch = X_train_tensor[shuffled[start:start + batch_size]]

        optimizer.zero_grad()
        loss = criterion(autoencoder(batch), batch)
        loss.backward()
        optimizer.step()

        # Weight by batch length so a short final batch does not skew the mean
        running_loss += loss.item() * batch.shape[0]

    train_loss = running_loss / max(n_train_samples, 1)
    train_losses.append(train_loss)

    # Validation: reconstruction loss on the held-out split, dropout disabled
    autoencoder.eval()
    with torch.no_grad():
        test_outputs = autoencoder(X_test_tensor)
        test_loss = criterion(test_outputs, X_test_tensor)
        test_losses.append(test_loss.item())

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{CONFIG["num_epochs"]}], '
              f'Train Loss: {train_loss:.6f}, Test Loss: {test_loss.item():.6f}')

In [None]:
# Per-epoch loss curves for the training and held-out data
plt.figure(figsize=(12, 5))
for loss_history, curve_label in ((train_losses, 'Train Loss'), (test_losses, 'Test Loss')):
    plt.plot(loss_history, label=curve_label)

history_ax = plt.gca()
history_ax.set_xlabel('Epoch')
history_ax.set_ylabel('MSE Loss')
history_ax.set_title('Autoencoder Training History')
history_ax.legend()
history_ax.grid(True, alpha=0.3)
plt.show()

## 6.4 Compute Reconstruction Error

In [None]:
# Evaluation mode (dropout off) and no gradient tracking while scoring
autoencoder.eval()

with torch.no_grad():
    train_reconstructions = autoencoder(X_train_tensor)
    test_reconstructions = autoencoder(X_test_tensor)

    # Per-sample error: squared differences averaged over the feature axis,
    # then moved to the CPU as NumPy arrays
    train_mse = (X_train_tensor - train_reconstructions).pow(2).mean(dim=1).cpu().numpy()
    test_mse = (X_test_tensor - test_reconstructions).pow(2).mean(dim=1).cpu().numpy()

In [None]:
# Reconstruction error of both splits: overlaid histograms (log counts) and box plots
error_fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))

for errors, split_label in ((train_mse, 'Train'), (test_mse, 'Test')):
    hist_ax.hist(errors, bins=50, alpha=0.7, label=split_label, edgecolor='black')
hist_ax.set_xlabel('Reconstruction Error (MSE)')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution of Reconstruction Error')
hist_ax.legend()
hist_ax.set_yscale('log')

box_ax.boxplot([train_mse, test_mse], labels=['Train', 'Test'])
box_ax.set_ylabel('Reconstruction Error')
box_ax.set_title('Reconstruction Error Box Plot')

plt.tight_layout()
plt.show()

## 6.5 Set Threshold and Detect Anomalies

In [None]:
# The cut-off is the configured percentile of the training reconstruction errors
cutoff_percentile = CONFIG['threshold_percentile']
threshold = np.percentile(train_mse, cutoff_percentile)
print(f"Anomaly threshold (MSE): {threshold:.6f}")

# Anything reconstructed worse than the cut-off is flagged (1 = anomaly, 0 = normal)
ae_pred_train = np.greater(train_mse, threshold).astype(int)
ae_pred_test = np.greater(test_mse, threshold).astype(int)

print(f"Autoencoder - Training anomalies: {ae_pred_train.sum()}")
print(f"Autoencoder - Test anomalies: {ae_pred_test.sum()}")

# 7. MODEL EVALUATION (if labels available)

In [None]:
if supervised and y_test is not None:
    # One entry per detector: (report title, hard 0/1 predictions, continuous
    # anomaly score where larger means more anomalous).
    # ROC-AUC is a ranking metric, so it is computed from the continuous score;
    # thresholded 0/1 predictions reduce it to a single operating point.
    # scikit-learn's score_samples() is "higher = more normal", hence the sign
    # flip. Reconstruction error is already "higher = more anomalous".
    # Assumes y_test marks anomalies with 1, matching the prediction encoding.
    evaluation_inputs = [
        ('ISOLATION FOREST', iso_pred_test, -iso_forest.score_samples(X_test)),
        ('ONE-CLASS SVM', ocsvm_pred_test, -ocsvm.score_samples(X_test)),
        ('LOCAL OUTLIER FACTOR', lof_pred_test, -lof.score_samples(X_test)),
        ('AUTOENCODER', ae_pred_test, test_mse),
    ]

    for title, hard_preds, anomaly_scores in evaluation_inputs:
        print(f"\n=== {title} ===")
        print(classification_report(y_test, hard_preds))
        print(f"ROC-AUC: {roc_auc_score(y_test, anomaly_scores):.4f}")

## 7.1 Confusion Matrices

In [None]:
if supervised and y_test is not None:
    # Hard test-set predictions of each detector, keyed by display name
    models = {
        'Isolation Forest': iso_pred_test,
        'One-Class SVM': ocsvm_pred_test,
        'LOF': lof_pred_test,
        'Autoencoder': ae_pred_test
    }

    # One heatmap per detector on a 2x2 grid
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    axes = axes.ravel()

    for panel, (name, preds) in zip(axes, models.items()):
        cm = confusion_matrix(y_test, preds)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=panel)
        panel.set_title(f'{name} - Confusion Matrix')
        panel.set_ylabel('True Label')
        panel.set_xlabel('Predicted Label')

    plt.tight_layout()
    plt.show()

## 7.2 ROC Curves

In [None]:
if supervised and y_test is not None:
    plt.figure(figsize=(10, 8))

    # roc_curve / roc_auc_score expect larger scores for the positive class.
    # The predictions in this notebook encode anomalies as 1, so the scores
    # must be "higher = more anomalous" (assumes y_test uses the same encoding).
    # scikit-learn's score_samples() returns the opposite orientation (lower =
    # more abnormal), so it is negated. Reconstruction error already has the
    # right orientation. Without this the curves and AUC values come out inverted.
    iso_scores = -iso_forest.score_samples(X_test)
    ocsvm_scores = -ocsvm.score_samples(X_test)
    lof_scores = -lof.score_samples(X_test)
    ae_scores = test_mse  # higher MSE = more anomalous

    models_scores = {
        'Isolation Forest': iso_scores,
        'One-Class SVM': ocsvm_scores,
        'LOF': lof_scores,
        'Autoencoder': ae_scores
    }

    for name, scores in models_scores.items():
        fpr, tpr, _ = roc_curve(y_test, scores)
        auc = roc_auc_score(y_test, scores)
        plt.plot(fpr, tpr, label=f'{name} (AUC = {auc:.3f})', linewidth=2)

    # Diagonal = performance of a random ranking
    plt.plot([0, 1], [0, 1], 'k--', label='Random')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves Comparison')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

# 8. ENSEMBLE ANOMALY DETECTION

In [None]:
def ensemble_predictions(predictions_dict, method='majority'):
    """Combine 0/1 anomaly predictions from several models by voting.

    Args:
        predictions_dict: Mapping of model name to an array of 0/1 predictions
            (1 = anomaly). All arrays must have the same length.
        method: Voting rule.
            ``'majority'`` - anomaly when strictly more than half of the
            models flag the sample (a tie is not an anomaly);
            ``'any'`` - anomaly when at least one model flags it;
            ``'all'`` - anomaly only when every model flags it.

    Returns:
        Integer array with one 0/1 entry per sample.

    Raises:
        ValueError: If ``predictions_dict`` is empty or ``method`` is not one
            of the supported rules.
    """
    n_models = len(predictions_dict)
    if n_models == 0:
        raise ValueError("predictions_dict must contain at least one model")

    # Rows are models, columns are samples; summing over rows counts the votes
    votes = np.array(list(predictions_dict.values())).sum(axis=0)

    if method == 'majority':
        flagged = votes > n_models / 2
    elif method == 'any':
        flagged = votes > 0
    elif method == 'all':
        flagged = votes == n_models
    else:
        # Fail with a clear message instead of an UnboundLocalError
        raise ValueError(
            f"Unknown ensemble method {method!r}; expected 'majority', 'any' or 'all'"
        )

    return flagged.astype(int)

# Test-set votes of the four detectors, combined by majority rule
ensemble_dict = dict(
    iso_forest=iso_pred_test,
    ocsvm=ocsvm_pred_test,
    lof=lof_pred_test,
    autoencoder=ae_pred_test,
)

ensemble_pred = ensemble_predictions(ensemble_dict, method='majority')

print(f"Ensemble - Test anomalies: {ensemble_pred.sum()}")

# Score against the labels only when labels exist
labels_available = supervised and y_test is not None
if labels_available:
    print("\n=== ENSEMBLE MODEL ===")
    print(classification_report(y_test, ensemble_pred))
    print(f"ROC-AUC: {roc_auc_score(y_test, ensemble_pred):.4f}")

# 9. ANOMALY ANALYSIS

## 9.1 Analyze Detected Anomalies

In [None]:
# Positions (within the test split) that the autoencoder flagged
anomaly_indices = np.flatnonzero(ae_pred_test == 1)

if len(anomaly_indices):
    # Split the test rows into flagged and unflagged groups and describe each
    anomalies = X_test.iloc[anomaly_indices]
    normals = X_test.iloc[np.flatnonzero(ae_pred_test == 0)]

    print(f"\nDetected {len(anomaly_indices)} anomalies")
    for heading, group in (("\nAnomaly Statistics:", anomalies),
                           ("\nNormal Statistics:", normals)):
        print(heading)
        print(group.describe())

## 9.2 Feature Importance for Anomalies

In [None]:
if len(anomaly_indices):
    # Per feature: absolute gap between the flagged group's mean and the
    # unflagged group's mean, largest gap first
    mean_gap = anomalies.mean() - normals.mean()
    feature_diff = mean_gap.abs().sort_values(ascending=False)

    plt.figure(figsize=(12, 6))
    top_features = feature_diff.iloc[:15]
    top_features.plot(kind='bar')
    plt.title('Top 15 Features Contributing to Anomalies')
    plt.xlabel('Feature')
    plt.ylabel('Mean Absolute Difference')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

## 9.3 PCA Visualization

In [None]:
# Project the test features onto their first two principal components
pca = PCA(n_components=2)
X_test_pca = pca.fit_transform(X_test)

pca_fig, (pca_ax, error_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: test rows in PCA space, coloured by the autoencoder's verdict
normal_mask = ae_pred_test == 0
anomaly_mask = ae_pred_test == 1
pca_ax.scatter(X_test_pca[normal_mask, 0], X_test_pca[normal_mask, 1],
               c='blue', label='Normal', alpha=0.5, s=20)
pca_ax.scatter(X_test_pca[anomaly_mask, 0], X_test_pca[anomaly_mask, 1],
               c='red', label='Anomaly', alpha=0.8, s=50, marker='x')
pc1_share, pc2_share = pca.explained_variance_ratio_[0], pca.explained_variance_ratio_[1]
pca_ax.set_xlabel(f'PC1 ({pc1_share:.2%} variance)')
pca_ax.set_ylabel(f'PC2 ({pc2_share:.2%} variance)')
pca_ax.set_title('Anomaly Detection - PCA Visualization')
pca_ax.legend()
pca_ax.grid(True, alpha=0.3)

# Right panel: reconstruction error per test row with the decision threshold
error_ax.scatter(range(len(test_mse)), test_mse, c=ae_pred_test,
                 cmap='RdYlBu_r', alpha=0.6)
error_ax.axhline(y=threshold, color='r', linestyle='--', label='Threshold')
error_ax.set_xlabel('Sample Index')
error_ax.set_ylabel('Reconstruction Error')
error_ax.set_title('Reconstruction Error Distribution')
error_ax.legend()
error_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 10. REAL-TIME ANOMALY DETECTION

In [None]:
class AnomalyDetector:
    """Reconstruction-error anomaly detector wrapping a trained autoencoder.

    Args:
        model: Trained ``torch.nn.Module`` that maps a batch of feature rows
            to their reconstructions.
        scaler: Fitted transformer exposing ``transform(data)``.
        threshold: Per-sample MSE above which a row is flagged as anomalous.
    """

    def __init__(self, model, scaler, threshold):
        self.model = model
        self.scaler = scaler
        self.threshold = threshold
        self.model.eval()

    def _model_device(self):
        """Return the device holding the model's parameters (CPU if it has none)."""
        try:
            return next(self.model.parameters()).device
        except StopIteration:
            return torch.device('cpu')

    def detect(self, data):
        """Detect anomalies in new data.

        Args:
            data: Array-like or DataFrame accepted by ``scaler.transform``; it
                must have the same feature columns the scaler was fitted on.

        Returns:
            Tuple ``(is_anomaly, mse)``: an integer array with 1 for rows
            whose reconstruction error exceeds the threshold and 0 otherwise,
            and the per-row reconstruction MSE as a NumPy array.
        """
        # The caller may have switched the shared model back to training mode
        # since construction. Dropout must be off for stable scores.
        self.model.eval()

        # Preprocess. The tensor goes to wherever the model lives, so the class
        # does not depend on a module-level `device` variable.
        data_scaled = self.scaler.transform(data)
        data_tensor = torch.FloatTensor(data_scaled).to(self._model_device())

        # Get reconstruction error (mean squared error over the feature axis)
        with torch.no_grad():
            reconstruction = self.model(data_tensor)
            mse = torch.mean((data_tensor - reconstruction) ** 2, dim=1)
            mse = mse.cpu().numpy()

        # Detect anomalies
        is_anomaly = (mse > self.threshold).astype(int)

        return is_anomaly, mse

# Wrap the trained autoencoder, the fitted scaler and the chosen threshold
detector = AnomalyDetector(model=autoencoder, scaler=scaler, threshold=threshold)

# Example usage on new data:
# new_data = pd.DataFrame(...)  # Your new data
# anomalies, scores = detector.detect(new_data)

# 11. MODEL PERSISTENCE

In [None]:
# Autoencoder checkpoint: weights plus the threshold and settings used with them
checkpoint = {
    'model_state_dict': autoencoder.state_dict(),
    'threshold': threshold,
    'config': CONFIG,
}
torch.save(checkpoint, 'anomaly_detector.pth')

# The scaler and the scikit-learn detectors are written with joblib, one file each
import joblib

for fitted_object, output_file in (
    (scaler, 'scaler.pkl'),
    (iso_forest, 'isolation_forest.pkl'),
    (ocsvm, 'ocsvm.pkl'),
    (lof, 'lof.pkl'),
):
    joblib.dump(fitted_object, output_file)

# 12. CONCLUSIONS & NEXT STEPS

## Summary:
- Best Model: [Model Name]
- Precision: X.XX
- Recall: X.XX
- F1-Score: X.XX
- Detected Anomaly Rate: X.X%

## Next Steps:
- [ ] Implement streaming anomaly detection
- [ ] Add feedback loop for model improvement
- [ ] Implement LSTM Autoencoder for sequence data
- [ ] Deploy as microservice with API
- [ ] Set up alerting system
- [ ] Implement explainable AI for anomalies