In [8]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Step 1: Load the dataset
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
test_label_df = pd.read_csv("test_label.csv")


# Check for NaN values in the input data
#print("Train data NaN:", np.isnan(train_data).any())
#print("Test data NaN:", np.isnan(test_data).any())

# Print model architecture
#print(model)

#nan_indices = np.argwhere(np.isnan(train_data))
#print("Indices of NaN values in train data:", nan_indices)

# Step 2: Preprocess the data
scaler = StandardScaler()
train_data1 = scaler.fit_transform(train_df.drop(columns=['timestamp_(min)']).values)
test_data = scaler.transform(test_df.drop(columns=['timestamp_(min)']).values)

train_data = np.nan_to_num(train_data1, nan=np.nanmean(train_data1), copy=True)  #train_data_mean_imputed


# Step 3: Implement data augmentation using geometric distribution masks
def geometric_masking(data, p=0.1):
    mask = np.random.geometric(p, size=data.shape)
    masked_data = data * mask
    return masked_data

# Step 4: Build a Transformer-based Autoencoder
class Autoencoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Linear(input_size, hidden_size)
        self.decoder = nn.Linear(hidden_size, input_size)

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

# Step 5: Define contrastive loss
class ContrastiveLoss(nn.Module):
    def __init__(self, margin):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, target):
        distance = torch.dist(output1, output2, p=2)
        loss_contrastive = torch.mean((1 - target) * torch.pow(distance, 2) +
                                       (target) * torch.pow(torch.clamp(self.margin - distance, min=0.0), 2))
        return loss_contrastive

# Step 6: Train the model
def train_model(model, criterion, optimizer, train_loader, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for data in train_loader:
            inputs, _ = data
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, inputs, torch.ones(inputs.size(0)))
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader)}")

# Step 7: Evaluate the model on test data
def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    with torch.no_grad():
        for data in test_loader:
            inputs, _ = data
            outputs = model(inputs)
            preds = torch.norm(outputs - inputs, dim=1)
            all_preds.extend(preds.numpy())
    return all_preds

# Step 8: Detect anomalies
def detect_anomalies(predictions, threshold):
    anomalies = [1 if pred > threshold else 0 for pred in predictions]
    return anomalies

# Main function
if __name__ == "__main__":
    # Parameters
    input_size = train_data.shape[1]
    hidden_size = 128
    margin = 1.0
    num_epochs = 10
    batch_size = 25
    learning_rate = 0.001
    threshold = 1.0

    # Apply data augmentation
    augmented_train_data = geometric_masking(train_data)

    # Convert data to PyTorch tensors
    train_tensor = torch.tensor(augmented_train_data, dtype=torch.float32)
    test_tensor = torch.tensor(test_data, dtype=torch.float32)

    # Create datasets and dataloaders
    train_dataset = torch.utils.data.TensorDataset(train_tensor, train_tensor)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataset = torch.utils.data.TensorDataset(test_tensor, torch.zeros(len(test_tensor)))
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model, criterion, and optimizer
    model = Autoencoder(input_size, hidden_size)
    criterion = ContrastiveLoss(margin)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Train the model
    train_model(model, criterion, optimizer, train_loader, num_epochs)

    # Evaluate the model
    predictions = evaluate_model(model, test_loader)

    # Detect anomalies
    anomalies = detect_anomalies(predictions, threshold)

    # Compare with ground truth labels
    true_labels = test_label_df['label'].values
    accuracy = accuracy_score(true_labels, anomalies)
    precision = precision_score(true_labels, anomalies)
    recall = recall_score(true_labels, anomalies)
    f1 = f1_score(true_labels, anomalies)

    print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-score: {f1}")


Epoch 1/10, Loss: 0.0
Epoch 2/10, Loss: 0.0
Epoch 3/10, Loss: 0.0
Epoch 4/10, Loss: 0.0
Epoch 5/10, Loss: 0.0
Epoch 6/10, Loss: 0.0
Epoch 7/10, Loss: 0.0
Epoch 8/10, Loss: 0.0
Epoch 9/10, Loss: 0.0
Epoch 10/10, Loss: 0.0
Accuracy: 0.27755831559294636, Precision: 0.27755831559294636, Recall: 1.0, F1-score: 0.43451373171035984
