In [2]:
import polars as pl
import numpy as np
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [3]:
# Read the raw dataset from disk into a Polars DataFrame.
parquet_path = 'dataset.parquet'
df_polars = pl.read_parquet(parquet_path)

In [4]:
# Keep a 1% random subsample of the rows, drawn with a fixed seed.
# NOTE(review): df_polars is overwritten, so re-running this cell samples the
# already-sampled frame again; re-run from the loading cell instead.
sample_fraction = 0.01
sample_seed = 42
df_polars = df_polars.sample(fraction=sample_fraction, seed=sample_seed)

In [5]:
# Replace null entries with 0 in each of these three columns.
zero_filled_columns = ('duration', 'orig_bytes', 'resp_bytes')
df_polars = df_polars.with_columns(
    [pl.col(column_name).fill_null(0) for column_name in zero_filled_columns]
)

In [6]:
# Discard the columns that are not carried forward into the feature matrix.
columns_to_drop = [
    "ts",
    "uid",
    "id.orig_h",
    "id.resp_h",
    "local_orig",
    "local_resp",
    "missed_bytes",
    "tunnel_parents",
    "detailed-label",
    "__index_level_0__",
]
df_polars = df_polars.drop(columns_to_drop)

In [7]:
# Separate the target column from the remaining (feature) columns.
y = df_polars.get_column('label')
X = df_polars.select(pl.exclude('label'))

In [8]:
# Min-max scale every feature column (MinMaxScaler defaults to the [0, 1] range).
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test rows influence the scaling; consider fitting on the training
# split only.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [9]:
# Hold out 30% of the rows for testing; stratify on y so both splits keep the
# same label proportions, and fix the random state for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [10]:
# Labels are converted to NumPy first so that they can be handed to torch.
y_train_np = y_train.to_numpy()
y_test_np = y_test.to_numpy()

# Build float32 tensors for the features and labels of both splits.
X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor = (
    torch.tensor(values, dtype=torch.float32)
    for values in (X_train, X_test, y_train_np, y_test_np)
)

In [11]:
# Pair features with labels and serve them in mini-batches; only the training
# batches are shuffled.
batch_size = 5000
train_dataset, test_dataset = (
    TensorDataset(X_train_tensor, y_train_tensor),
    TensorDataset(X_test_tensor, y_test_tensor),
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Treinamento

In [12]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder: input -> hidden -> latent -> hidden -> input.

    The encoder applies two Linear+ReLU stages; the decoder mirrors it and ends
    with a plain Linear layer (no output activation).

    Args:
        input_dim: Number of features per sample; also the output width.
        hidden_dim: Width of the intermediate layers (default 32).
        latent_dim: Width of the bottleneck representation (default 8).
    """

    def __init__(self, input_dim, hidden_dim=32, latent_dim=8):
        super().__init__()
        # Layer creation order is kept (encoder first, then decoder) so that
        # parameter initialisation and state_dict keys stay the same.
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
        )

    def forward(self, x):
        """Return the reconstruction of ``x`` (same trailing dimension as the input)."""
        return self.decoder(self.encoder(x))

# Seed used for torch's global RNG (weight initialisation and batch shuffling).
TORCH_SEED = 42
# Fraction of the training samples (those with the largest reconstruction
# error) used to derive the anomaly threshold.
THRESHOLD_SAMPLE_PERCENTAGE = 1 / 100


def _as_device_tensor(values, device):
    """Return ``values`` as a float32 tensor on ``device``.

    Accepts either array-like data or an existing tensor, so the cell can be
    re-run after ``X_train`` / ``X_test`` have already been converted.
    """
    if isinstance(values, torch.Tensor):
        return values.to(device=device, dtype=torch.float32)
    return torch.tensor(values, dtype=torch.float32).to(device)


def _reconstruction_errors(model, loader, device):
    """Return the per-sample mean squared reconstruction error as a NumPy array.

    Only the first element of every batch (the features) is used. Errors are
    reduced batch by batch so that full copies of the inputs and outputs are
    never held in memory.
    """
    model.eval()
    batch_errors = []
    with torch.no_grad():
        for data in loader:
            inputs = data[0].to(device)
            outputs = model(inputs)
            batch_errors.append(torch.mean((outputs - inputs) ** 2, dim=1).cpu())
    return torch.cat(batch_errors).numpy()


def _top_fraction_median(errors, fraction):
    """Return the median of the largest ``fraction`` of the values in ``errors``.

    At least one value is always used, so a small sample cannot yield an empty
    selection (and therefore a NaN threshold).

    Raises:
        ValueError: If ``errors`` is empty.
    """
    n_samples = errors.shape[0]
    if n_samples == 0:
        raise ValueError('cannot derive a threshold from zero samples')
    top_k = min(max(1, int(round(n_samples * fraction))), n_samples)
    # Partitioning the negated errors puts the top_k largest errors first.
    largest = -np.partition(-errors, top_k - 1)[:top_k]
    return np.median(largest)


# Make weight initialisation and DataLoader shuffling repeatable across runs.
torch.manual_seed(TORCH_SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X_train = _as_device_tensor(X_train, device)
X_test = _as_device_tensor(X_test, device)

input_dim = X_train.shape[1]
model = Autoencoder(input_dim).to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# NOTE(review): the autoencoder is fitted on every training row regardless of
# its label, and the threshold below is derived from those same rows. If label 1
# marks the anomalous class (TODO confirm), consider fitting and thresholding on
# label-0 rows only.
results = []
epochs = 25
print(datetime.now())
for epoch in range(epochs):
    # --- Training pass -----------------------------------------------------
    model.train()
    epoch_loss = 0
    for data in train_loader:
        inputs = data[0].to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, inputs)
        loss.backward()
        optimizer.step()
        # Summed (sample-weighted) loss for the epoch; kept for inspection and
        # not part of `results`.
        epoch_loss += loss.item() * inputs.size(0)

    print(f'Epoch {epoch+1}/{epochs}, fim em: {datetime.now()}')

    # --- Threshold from the training reconstruction errors ------------------
    reconstruction_error_train = _reconstruction_errors(model, train_loader, device)
    threshold = _top_fraction_median(reconstruction_error_train, THRESHOLD_SAMPLE_PERCENTAGE)

    # --- Evaluation on the test split ---------------------------------------
    reconstruction_error_test = _reconstruction_errors(model, test_loader, device)

    # A sample is predicted as class 1 when its error exceeds the threshold.
    y_pred = (reconstruction_error_test > threshold).astype(int)
    print(f'Epoch {epoch+1}/{epochs}, avaliada em: {datetime.now()}')

    # Fixing the label order guarantees a 2x2 matrix even if one class is absent.
    confusion = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = confusion.ravel()
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    f1 = f1_score(y_test, y_pred, zero_division=0)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
    false_alarm_rate = fp / (fp + tn) if (fp + tn) > 0 else 0.0

    results.append([epoch+1, accuracy, balanced_accuracy, precision, recall, specificity, f1, false_alarm_rate, tn, fp, fn, tp])

2024-09-16 17:49:36.771548
Epoch 1/25, fim em: 2024-09-16 17:49:43.749126
Epoch 1/25, avaliada em: 2024-09-16 17:49:52.956208
Epoch 2/25, fim em: 2024-09-16 17:49:59.288640
Epoch 2/25, avaliada em: 2024-09-16 17:50:08.167649
Epoch 3/25, fim em: 2024-09-16 17:50:15.143226
Epoch 3/25, avaliada em: 2024-09-16 17:50:23.935214
Epoch 4/25, fim em: 2024-09-16 17:50:30.507701
Epoch 4/25, avaliada em: 2024-09-16 17:50:39.221672
Epoch 5/25, fim em: 2024-09-16 17:50:45.487089
Epoch 5/25, avaliada em: 2024-09-16 17:50:54.039023
Epoch 6/25, fim em: 2024-09-16 17:51:00.383458
Epoch 6/25, avaliada em: 2024-09-16 17:51:08.619321
Epoch 7/25, fim em: 2024-09-16 17:51:14.785715
Epoch 7/25, avaliada em: 2024-09-16 17:51:22.968566
Epoch 8/25, fim em: 2024-09-16 17:51:28.988928
Epoch 8/25, avaliada em: 2024-09-16 17:51:37.032843
Epoch 9/25, fim em: 2024-09-16 17:51:43.169623
Epoch 9/25, avaliada em: 2024-09-16 17:51:51.409486
Epoch 10/25, fim em: 2024-09-16 17:51:57.521870
Epoch 10/25, avaliada em: 2024-09-

In [13]:
# One row per epoch: classification metrics followed by the confusion-matrix
# counts. `results` is a list of rows, so the orientation is stated explicitly
# instead of being inferred (inference emits a polars orientation warning).
metrics_df = pl.DataFrame(
    results,
    schema=['epoch', 'Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'Specificity', 'F1-score', 'False Alarm Rate', 'tn', 'fp', 'fn', 'tp'],
    orient="row",
)
metrics_df

  return dispatch(args[0].__class__)(*args, **kw)


epoch,Accuracy,Balanced Accuracy,Precision,Recall,Specificity,F1-score,False Alarm Rate,tn,fp,fn,tp
i64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64
1,0.165078,0.502619,0.982927,0.005763,0.999476,0.011458,0.000524,26691,14,139059,806
2,0.163619,0.499902,0.834348,0.004898,0.994907,0.009738,0.005093,26569,136,139180,685
3,0.161932,0.496747,0.662621,0.003904,0.98959,0.007762,0.01041,26427,278,139319,546
4,0.16192,0.496603,0.658333,0.003954,0.989253,0.00786,0.010747,26418,287,139312,553
5,0.162388,0.497443,0.703791,0.004247,0.990638,0.008443,0.009362,26455,250,139271,594
…,…,…,…,…,…,…,…,…,…,…,…
21,0.159651,0.492647,0.430521,0.002481,0.982812,0.004933,0.017188,26246,459,139518,347
22,0.159717,0.492716,0.437884,0.002545,0.982887,0.005061,0.017113,26248,457,139509,356
23,0.159567,0.492475,0.42203,0.002438,0.982513,0.004848,0.017487,26238,467,139524,341
24,0.15947,0.492342,0.411471,0.002359,0.982325,0.004692,0.017675,26233,472,139535,330
