In [6]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset

import os

data = pd.read_csv("/Users/xfu/BU Courses Repo/DS 593 Project/spark-analytics/data/1m_health_events_dataset.csv")

# convert the 'Timestamp' column to datetime
data['Timestamp'] = pd.to_datetime(data['Timestamp'])

# extract relevant features from the 'Timestamp' column
data['Hour'] = data['Timestamp'].dt.hour
data['Day'] = data['Timestamp'].dt.day
data['Month'] = data['Timestamp'].dt.month
data['Year'] = data['Timestamp'].dt.year

# one-hot encoding for categorical features
data = pd.get_dummies(data, columns=['EventType', 'Location', 'Severity'])

# separate features and target variable
X = data.drop(['Timestamp', 'Details', 'Is_Anomaly'], axis=1)
y = data['Is_Anomaly']

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# create PyTorch datasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# create data loaders
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [9]:
# autoencoder model
class Autoencoder(nn.Module):
    def __init__(self, input_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# set up the model, loss function, and optimizer
input_dim = X_train_tensor.shape[1]
model = Autoencoder(input_dim).to('cpu')
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 100
for epoch in range(num_epochs):
    for batch_features, _ in train_loader:
        # Forward pass
        outputs = model(batch_features)
        loss = criterion(outputs, batch_features)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Print the loss for every 10 epochs
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

# folder path where the model will be saved
folder_path = "/Users/xfu/BU Courses Repo/DS 593 Project/spark-analytics/models"

# save the trained model in our models folder
model_path = os.path.join(folder_path, "anomaly_detection_model.pth")
torch.save(model.state_dict(), model_path)

# anomaly detection on the test set
model.eval()
anomaly_scores = []
with torch.no_grad():
    for batch_features, _ in test_loader:
        outputs = model(batch_features)
        reconstruction_errors = torch.mean((outputs - batch_features)**2, dim=1)
        anomaly_scores.extend(reconstruction_errors.numpy())

# set a threshold for anomaly detection
threshold = np.percentile(anomaly_scores, 95)  # Adjust the threshold as needed

# make predictions based on the anomaly scores
y_pred = [1 if score > threshold else 0 for score in anomaly_scores]

accuracy = np.mean(y_pred == y_test)
precision = np.sum(y_pred * y_test) / np.sum(y_pred)
recall = np.sum(y_pred * y_test) / np.sum(y_test)
f1 = 2 * precision * recall / (precision + recall)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Epoch [10/100], Loss: 0.6115
Epoch [20/100], Loss: 0.5382
Epoch [30/100], Loss: 0.5177
Epoch [40/100], Loss: 0.5194
Epoch [50/100], Loss: 0.5217
Epoch [60/100], Loss: 0.5020
Epoch [70/100], Loss: 0.5131
Epoch [80/100], Loss: 0.5088
Epoch [90/100], Loss: 0.4934
Epoch [100/100], Loss: 0.4859
Accuracy: 0.949795
Precision: 0.0004
Recall: 0.08163265306122448
F1-score: 0.0007960991143397354
