In [1]:
!pip install torch pandas numpy scikit-learn matplotlib
!pip install torch torchvision torchaudio

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from pathlib import Path

# Set directories
# These must be filesystem paths. The previous values were JupyterLab browser
# URLs ("http://localhost:8888/lab/tree/..."); pathlib treats such a string as
# a (non-existent) relative path, so glob() returned no CSV files at all.
# Relative paths are resolved against the notebook's working directory and can
# be overridden through the TRAIN_DIR / TEST_DIR environment variables.
train_dir = os.environ.get("TRAIN_DIR", "train")
test_dir = os.environ.get("TEST_DIR", "test1")

# Verify files in directories (sorted so the listing is deterministic)
train_files = sorted(Path(train_dir).glob("*.csv"))
test_files = sorted(Path(test_dir).glob("*.csv"))

print("Train files:", train_files)
print("Test files:", test_files)

# Load datasets
def dataframe_from_csv(target):
    """Read a single CSV into a DataFrame whose column names have no surrounding whitespace.

    Args:
        target: Anything accepted by ``pd.read_csv`` (path or file-like object).

    Returns:
        A new DataFrame with every column name passed through ``str.strip``.
    """
    frame = pd.read_csv(target)
    return frame.rename(columns=str.strip)

def dataframe_from_csvs(targets):
    """Load several CSV files and stack them row-wise into one DataFrame.

    Args:
        targets: Sequence of CSV paths; each one is read with ``dataframe_from_csv``.

    Returns:
        The row-wise concatenation of all loaded frames (original per-file
        indices are kept, so index values may repeat).

    Raises:
        ValueError: If ``targets`` is empty.
    """
    if targets:
        frames = [dataframe_from_csv(path) for path in targets]
        return pd.concat(frames)
    raise ValueError("No CSV files found in the directory.")

# Get dataset files (sorted so the files are always concatenated in the same order)
TEST_DATASET = sorted(Path(test_dir).glob("*.csv"))
TRAIN_DATASET = sorted(Path(train_dir).glob("*.csv"))

# Load dataframes
# Only the "no files found" situation falls back to empty frames. Checking the
# file lists explicitly (instead of catching ValueError around the load) keeps
# genuine read/parse errors visible rather than disguising them as "no data".
if TEST_DATASET and TRAIN_DATASET:
    TEST_DF_RAW = dataframe_from_csvs(TEST_DATASET)
    TRAIN_DF_RAW = dataframe_from_csvs(TRAIN_DATASET)
else:
    print("No CSV files found in the directory.")
    TRAIN_DF_RAW = pd.DataFrame()
    TEST_DF_RAW = pd.DataFrame()

# Check if dataframes are not empty
if TRAIN_DF_RAW.empty or TEST_DF_RAW.empty:
    # Fail fast with a clear message: the windowing and training code that
    # follows needs TRAIN_DF / ATTACK_DF, which cannot be built without data.
    raise FileNotFoundError("No training or testing data found. Please upload the files and try again.")

DROP_FIELD = ["time", "attack_P1", "attack_P2", "attack_P3", "attack"]
VALID_COLUMNS_IN_TRAIN_DATASET = TRAIN_DF_RAW.columns.drop(DROP_FIELD)

# Min-Max normalization (statistics come from the training data only)
TAG_MIN = TRAIN_DF_RAW[VALID_COLUMNS_IN_TRAIN_DATASET].min()
TAG_MAX = TRAIN_DF_RAW[VALID_COLUMNS_IN_TRAIN_DATASET].max()

def normalize(df, TAG_MIN, TAG_MAX):
    """Min-max scale every column of ``df`` using the supplied per-column bounds.

    Args:
        df: DataFrame to scale; every column must be present in TAG_MIN / TAG_MAX.
        TAG_MIN: Per-column minimum values, indexable by column name.
        TAG_MAX: Per-column maximum values, indexable by column name.

    Returns:
        A scaled copy of ``df``. Columns whose minimum equals their maximum are
        only shifted by the minimum, which avoids a division by zero.
    """
    ndf = df.copy()
    for c in df.columns:
        if TAG_MIN[c] == TAG_MAX[c]:
            ndf[c] = df[c] - TAG_MIN[c]
        else:
            ndf[c] = (df[c] - TAG_MIN[c]) / (TAG_MAX[c] - TAG_MIN[c])
    return ndf

# Apply normalization and exponential weighted moving average
TRAIN_DF = normalize(TRAIN_DF_RAW[VALID_COLUMNS_IN_TRAIN_DATASET], TAG_MIN, TAG_MAX).ewm(alpha=0.9).mean()

def boundary_check(df):
    """Return three flags: any value > 1.0, any value < 0, any NaN."""
    x = np.array(df, dtype=np.float32)
    return np.any(x > 1.0), np.any(x < 0), np.any(np.isnan(x))

print(boundary_check(TRAIN_DF))
print(TRAIN_DF.shape)
train = np.array(TRAIN_DF)
# Insert a length-1 middle axis: (rows, features) -> (rows, 1, features)
x_train = train.reshape(train.shape[0], 1, train.shape[1])

print(len(TEST_DF_RAW))
TEST_DF_RAW = TEST_DF_RAW.dropna()
print(len(TEST_DF_RAW))
# Take the labels only after dropna() so ATTACK_DF stays row-aligned with
# TEST_DF_RAW (previously it kept the rows that dropna() removed).
ATTACK_DF = TEST_DF_RAW['attack']

# Sliding-window configuration
window_size = 60        # rows per window; also reused below as the model's seq_len
label_size = 100_000    # number of leading rows handed to the windowing call

def sliding_window_unsupervised(df, window_size, feature_columns, answer_column):
    """Cut a frame into overlapping windows with next-row targets and window labels.

    For every start row ``i`` in ``range(len(df) - window_size)``:

    * ``features[i]`` holds rows ``i .. i + window_size - 1`` of ``feature_columns``,
    * ``targets[i]`` holds row ``i + window_size`` (the row right after the window),
    * ``answer_targets[i]`` is 1 if any label inside the window equals 1, else 0.

    Args:
        df: DataFrame containing at least ``feature_columns``.
        window_size: Number of consecutive rows per window.
        feature_columns: Column names to extract from ``df``.
        answer_column: Per-row labels (pandas Series or any array-like); row
            ``k`` is matched positionally with row ``k`` of ``df``. Rows
            without a label count as "not 1".

    Returns:
        Tuple ``(features, targets, answer_targets)`` with shapes
        ``(n, window_size, n_features)``, ``(n, n_features)`` and ``(n,)`` and
        dtypes float32, float32 and int, where
        ``n = max(len(df) - window_size, 0)``. Inputs that are too short for a
        single window yield empty arrays instead of raising.
    """
    data = df[feature_columns].to_numpy()
    answers = np.asarray(answer_column)
    n_features = len(feature_columns)

    # Clamp at zero so short inputs give empty outputs rather than a
    # "negative dimensions" error from NumPy.
    num_samples = max(len(df) - window_size, 0)
    if num_samples == 0:
        return (
            np.empty((0, window_size, n_features), dtype=np.float32),
            np.empty((0, n_features), dtype=np.float32),
            np.empty(0, dtype=int),
        )

    sliding_window_view = np.lib.stride_tricks.sliding_window_view

    # The view has shape (len(df) - window_size + 1, n_features, window_size).
    # Drop the last window (it has no following target row), move the time
    # axis to the middle, and copy into a C-contiguous float32 array.
    windows = sliding_window_view(data, window_size, axis=0)[:num_samples]
    features = np.array(windows.transpose(0, 2, 1), dtype=np.float32, order="C")

    # The target of window i is the row immediately after it.
    targets = np.array(data[window_size:window_size + num_samples], dtype=np.float32)

    # Positional "label == 1" flags, padded with False where labels are missing.
    flags = np.zeros(len(df), dtype=bool)
    usable = min(len(answers), len(df))
    flags[:usable] = answers[:usable] == 1
    answer_targets = (
        sliding_window_view(flags, window_size)[:num_samples].any(axis=1).astype(int)
    )

    return features, targets, answer_targets

feature_columns = ['P1_B2004', 'P1_B2016', 'P1_B3004', 'P1_B3005', 'P1_B4002', 'P1_B4005', 'P1_B400B',
                   'P1_B4022', 'P1_FCV01D', 'P1_FCV01Z', 'P1_FCV02D', 'P1_FCV02Z', 'P1_FCV03D',
                   'P1_FCV03Z', 'P1_FT01', 'P1_FT01Z', 'P1_FT02', 'P1_FT02Z', 'P1_FT03', 'P1_FT03Z',
                   'P1_LCV01D', 'P1_LIT01', 'P1_PCV01D', 'P1_PCV01Z', 'P1_PCV02D', 'P1_PCV02Z',
                   'P1_PIT01', 'P1_PIT02', 'P1_TIT01', 'P1_TIT02']

# Window the first `label_size` training rows. The labels come from the
# training frame's own 'attack' column so they describe the same rows as the
# features; the test-set labels used here before belong to different rows.
train_labels = TRAIN_DF_RAW['attack'][:label_size]
# Pass `window_size` rather than a repeated literal so the windows always match
# the `seq_len` derived from it.
features, targets, answers = sliding_window_unsupervised(TRAIN_DF[:label_size], window_size, feature_columns, train_labels)
print(features.shape)
print(targets.shape)
print(answers.shape)

# NOTE(review): consecutive windows overlap in all but one row, and
# train_test_split shuffles by default, so near-identical windows land in both
# splits. Consider shuffle=False for a chronological hold-out.
features_train, features_test, targets_train, targets_test, labels_train, labels_test = train_test_split(features, targets, answers, test_size=0.2, random_state=42)

# Define LSTM autoencoder
class LLaMA3Autoencoder(nn.Module):
    """Linear encoder -> single-layer LSTM -> linear decoder, applied along a sequence.

    Despite the class name, the module consists only of two ``nn.Linear``
    layers and one ``nn.LSTM``.

    Args:
        input_dim: Number of features per time step (size of the last input axis).
        hidden_dim: Width of the encoded representation and of the LSTM state.
        seq_len: Accepted for interface compatibility; not used by the module.
    """

    def __init__(self, input_dim, hidden_dim, seq_len):
        super().__init__()
        # Sub-modules are created in the same order as before so that parameter
        # initialisation draws from the RNG in the same sequence.
        self.encoder = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.decoder = nn.Linear(in_features=hidden_dim, out_features=input_dim)
        self.lstm = nn.LSTM(
            input_size=hidden_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            batch_first=True,
        )

    def forward(self, x):
        """Map a ``(batch, seq, input_dim)`` tensor to an output of the same shape."""
        # The three-way unpacking doubles as a rank check: anything that is not
        # 3-D raises ValueError here.
        _batch, _steps, _width = x.size()
        projected = self.encoder(x)
        recurrent, _state = self.lstm(projected)
        return self.decoder(recurrent)

# Set hyperparameters
input_dim = len(feature_columns)
hidden_dim = 64
seq_len = window_size
lr = 0.001
epochs = 50
batch_size = 64

# Seed PyTorch's RNG so the weight initialisation below is reproducible
torch.manual_seed(42)

# Initialize the model
model = LLaMA3Autoencoder(input_dim, hidden_dim, seq_len)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.MSELoss()

# Convert data to tensors
# as_tensor avoids an extra copy of the NumPy arrays and, unlike torch.tensor,
# accepts an existing tensor without a warning if this code is run again.
features_train = torch.as_tensor(features_train, dtype=torch.float32)
features_test = torch.as_tensor(features_test, dtype=torch.float32)

# Training loop
# Train on shuffled mini-batches of `batch_size` windows. The previous loop
# pushed the entire training tensor through the model in a single step per
# epoch and never used `batch_size`.
num_train = features_train.size(0)
for epoch in range(epochs):
    model.train()
    order = torch.randperm(num_train)  # new sample order every epoch
    running_loss = 0.0
    for start in range(0, num_train, batch_size):
        batch = features_train[order[start:start + batch_size]]
        optimizer.zero_grad()
        outputs = model(batch)
        # Reconstruction objective: the model output is compared with its own input
        loss = criterion(outputs, batch)
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch figure is a per-sample mean even
        # when the last batch is smaller.
        running_loss += loss.item() * batch.size(0)
    epoch_loss = running_loss / max(num_train, 1)
    print(f'Epoch {epoch+1}/{epochs}, Loss: {epoch_loss}')

# Evaluation
model.eval()  # put the module into evaluation mode
with torch.no_grad():
    # No gradients are tracked here, so the result can be converted to NumPy directly
    predicted = model(features_test)
predicted = predicted.numpy()

# Plot actual vs predicted values
# Number of samples to plot, capped so a small test set cannot cause an IndexError
num_plots = min(5, len(features_test))

if num_plots > 0:
    # squeeze=False keeps `axes` two-dimensional even when only one plot is drawn
    fig, axes = plt.subplots(num_plots, 1, figsize=(15, num_plots * 5), squeeze=False)
    for i, ax in enumerate(axes[:, 0]):
        # Each window is flattened row by row: all features of step 0, then step 1, ...
        ax.plot(features_test[i].numpy().flatten(), label='Actual')
        ax.plot(predicted[i].flatten(), label='Predicted')
        ax.set_xlabel('Flattened window index (time step x feature)')
        ax.set_ylabel('Value')
        ax.legend()
        ax.set_title(f'Sample {i+1}')
    fig.tight_layout()
    plt.show()

# Additional evaluation metrics
# Mean of the element-wise squared difference between model output and test input
mse = np.square(predicted - features_test.numpy()).mean()
print(f'Mean Squared Error: {mse}')


Train files: [PosixPath('/content/train/train1.csv'), PosixPath('/content/train/train2.csv')]
Test files: [PosixPath('/content/test1/test1.csv'), PosixPath('/content/test1/test2.csv')]
(False, False, False)
(317203, 59)
312283
312282
(99940, 60, 30)
(99940, 30)
(99940,)
