In [1]:
import time
from pathlib import Path

import numpy as np
import polars as pl
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Load the dataset from the Parquet file into a polars DataFrame.
df_polars = pl.read_parquet(source='dataset.parquet')

In [3]:
# Keep a 1% random sample of the rows; the fixed seed makes the sample repeatable.
df_polars = df_polars.sample(
    seed=42,
    fraction=0.01,
)

In [4]:
import ipaddress

def ip_to_int(ip: str) -> "int | None":
    """Convert a textual IP address into its integer representation.

    Args:
        ip: IPv4 or IPv6 address in text form.

    Returns:
        The address as an integer, or ``None`` when ``ipaddress.ip_address``
        rejects the value with ``ValueError``.
    """
    try:
        address = ipaddress.ip_address(ip)  # accepts both IPv4 and IPv6
    except ValueError:
        # Not a valid address: signal it with None instead of raising.
        return None
    return int(address)

In [5]:
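# NOTE(review): disabled polars-side IP conversion, kept for reference only.
# 'id.resp_h' is not in the list of kept columns further down, and an IP column
# is converted with ip_to_int after the features are exported to NumPy instead.
#df_polars = df_polars.with_columns([
#    pl.col('id.resp_h').map_elements(ip_to_int).alias('id.resp_h'),
#    pl.col('id.orig_h').map_elements(ip_to_int).alias('id.orig_h')
#])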

In [6]:
# Replace nulls with 0 in 'duration', 'orig_bytes' and 'resp_bytes'.
# NOTE(review): none of these three columns is in the keep-list used by the
# column-filter cell that follows, so they are dropped right after this step.
df_polars = df_polars.with_columns(
    [pl.col(nome).fill_null(0) for nome in ('duration', 'orig_bytes', 'resp_bytes')]
)

In [7]:
# Columns to keep (features plus the 'label' target); every other column is dropped.
colunas_para_spearman = [
    'id.resp_p', 'history', 'id.orig_h', 'conn_state', 'id.orig_p', 'orig_ip_bytes', 'label',
]
# Alternative list (commented out), using 'detailed-label' instead of 'label':
#['detailed-label', 'id.resp_p', 'history', 'id.orig_h', 'conn_state', 'id.orig_p', 'orig_ip_bytes']
lista_colunas = df_polars.columns
colunas_para_dropar = list(
    filter(lambda col: col not in colunas_para_spearman, lista_colunas)
)
df_polars = df_polars.drop(colunas_para_dropar)

In [8]:
# Discard every row that has a null in any of the remaining columns.
df_polars = df_polars.drop_nulls(subset=None)

In [9]:
# Separate the target series from the feature frame.
y = df_polars.get_column('label')
X = df_polars.drop('label')

In [10]:
# Find the source-IP column by name while X is still a polars DataFrame, so the
# conversion does not depend on the column order of the input dataset.
ip_col_idx = X.columns.index('id.orig_h')
X = X.to_numpy()
# Replace the textual addresses with their integer form (None for invalid ones).
X[:, ip_col_idx] = np.array([ip_to_int(ip) for ip in X[:, ip_col_idx]])

In [11]:
# Min-max scale every feature column.
# NOTE(review): the scaler is fitted on the whole dataset, before startTrain
# splits it into train and test, so test-set minima/maxima influence the
# preprocessing. Consider fitting on the training split only.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

# Treinamento

In [12]:
# Accumulator for one metrics row per training run (filled by startTrain).
results = list()

In [13]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder with a single hidden layer on each side.

    The encoder maps ``input_dim -> hidden_dim -> latent_dim`` and the decoder
    mirrors it back to ``input_dim``. ReLU follows every linear layer except
    the final decoder layer, whose output is left linear.

    Args:
        input_dim: Number of features per sample.
        hidden_dim: Width of the intermediate layers (default 32).
        latent_dim: Width of the bottleneck layer (default 8).
    """

    def __init__(self, input_dim, hidden_dim=32, latent_dim=8):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
        )

    def forward(self, x):
        """Encode ``x`` and return its reconstruction (same shape as ``x``)."""
        return self.decoder(self.encoder(x))



In [14]:
def _reconstruction_errors(model, loader, device):
    """Return the per-sample mean squared reconstruction error over ``loader``."""
    errors = []
    with torch.no_grad():
        for batch in loader:
            inputs = batch[0].to(device)
            outputs = model(inputs)
            errors.append(torch.mean((outputs - inputs) ** 2, dim=1).cpu().numpy())
    return np.concatenate(errors) if errors else np.empty(0, dtype=np.float32)


def startTrain(epochs=25, batch_size=5000, lr=0.001, threshold_percentile=95,
               test_size=0.3, random_state=None):
    """Train an autoencoder on label-0 samples and record anomaly-detection metrics.

    Reads the module-level ``X`` (feature matrix) and ``y`` (labels), builds an
    ``Autoencoder`` and appends one metrics row to the module-level ``results``
    list. A test sample is predicted as class 1 when its reconstruction error
    exceeds the chosen percentile of the training reconstruction errors.

    Args:
        epochs: Number of passes over the training samples.
        batch_size: Batch size of both data loaders.
        lr: Learning rate passed to Adam.
        threshold_percentile: Percentile of the training reconstruction errors
            used as the decision threshold.
        test_size: Fraction of the data held out for evaluation.
        random_state: Seed forwarded to ``train_test_split``; ``None`` keeps
            the split different on every call.

    Raises:
        ValueError: If the training split contains no label-0 samples.
    """
    # Stratified train/test split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    y_train = np.array(y_train)
    y_test = np.array(y_test)

    # The autoencoder is trained only on class 0 samples.
    X_train = X_train[y_train == 0]
    if X_train.shape[0] == 0:
        raise ValueError(
            "No label-0 samples in the training split; cannot train the autoencoder."
        )

    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

    train_loader = DataLoader(TensorDataset(X_train_tensor), batch_size=batch_size, shuffle=True)
    # The test loader is not shuffled so predictions stay aligned with y_test.
    test_loader = DataLoader(TensorDataset(X_test_tensor), batch_size=batch_size, shuffle=False)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    input_dim = X_train.shape[1]
    model = Autoencoder(input_dim).to(device)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training: the model learns to reconstruct its own input.
    start_training = time.time()
    for _ in range(epochs):
        model.train()
        for batch in train_loader:
            inputs = batch[0].to(device)
            optimizer.zero_grad()
            loss = criterion(model(inputs), inputs)
            loss.backward()
            optimizer.step()
    end_training = time.time()

    # Threshold: chosen percentile of the reconstruction errors on the training samples.
    model.eval()
    reconstruction_errors_train = _reconstruction_errors(model, train_loader, device)
    threshold = np.percentile(reconstruction_errors_train, threshold_percentile)

    # Test set: an error above the threshold is predicted as class 1.
    reconstruction_errors_test = _reconstruction_errors(model, test_loader, device)
    y_pred = (reconstruction_errors_test > threshold).astype(int)

    # Evaluation time covers threshold estimation plus test inference.
    evaluation_time = time.time()

    # labels=[0, 1] keeps the matrix 2x2 even if one class is missing from
    # y_test and y_pred, so the four-way unpacking below cannot fail.
    confusion = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = confusion.ravel()
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    f1 = f1_score(y_test, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
    false_alarm_rate = fp / (fp + tn) if (fp + tn) > 0 else 0

    training_duration = end_training - start_training
    evaluation_duration = evaluation_time - end_training

    # One row per run, in the column order used when building the metrics table.
    results.append([
        "AE", accuracy, balanced_accuracy, precision, recall, specificity, f1,
        false_alarm_rate, tn, fp, fn, tp, training_duration, evaluation_duration
    ])

In [15]:
# Repeat the train/evaluate cycle 30 times, printing the run number after each one.
for run in range(30):
    startTrain()
    print(run + 1)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30


In [16]:
# Build the metrics table: each entry of `results` is one row.
# orient="row" states this explicitly instead of letting polars infer the
# orientation of the nested list.
metrics_df = pl.DataFrame(
    results,
    schema=['Algorithm', 'Accuracy', 'Balanced Accuracy' , 'Precision', 'Recall', 'Specificity', 'F1-score', 'False Alarm Rate', 'tn', 'fp', 'fn', 'tp', 'training_duration', 'evaluation_duration'],
    orient="row",
)
metrics_df

  return dispatch(args[0].__class__)(*args, **kw)


Algorithm,Accuracy,Balanced Accuracy,Precision,Recall,Specificity,F1-score,False Alarm Rate,tn,fp,fn,tp,training_duration,evaluation_duration
str,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,f64,f64
"""AE""",0.980831,0.968453,0.990461,0.986673,0.950234,0.988564,0.049766,25376,1329,1864,138001,16.863118,3.077693
"""AE""",0.98301,0.969372,0.990311,0.989447,0.949298,0.989879,0.050702,25351,1354,1476,138389,15.318,3.089932
"""AE""",0.733632,0.820936,0.986252,0.692425,0.949448,0.813623,0.050552,25355,1350,43019,96846,15.180101,3.386056
"""AE""",0.979732,0.968011,0.990548,0.985264,0.950758,0.987899,0.049242,25390,1315,2061,137804,15.262564,2.784892
"""AE""",0.97993,0.967569,0.990289,0.985765,0.949373,0.988022,0.050627,25353,1352,1991,137874,14.001396,2.788628
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""AE""",0.984889,0.971552,0.990823,0.991184,0.951919,0.991004,0.048081,25421,1284,1233,138632,14.089062,2.972274
"""AE""",0.982902,0.969369,0.990338,0.98929,0.949448,0.989813,0.050552,25355,1350,1498,138367,14.833154,2.841163
"""AE""",0.982536,0.969423,0.99046,0.988725,0.950122,0.989592,0.049878,25373,1332,1577,138288,14.06091,2.862068
"""AE""",0.982044,0.968827,0.990314,0.988282,0.949373,0.989297,0.050627,25353,1352,1639,138226,14.509449,2.988646


In [17]:
# Write the metrics table as a ';'-separated CSV. The output directory is
# created first so the export cannot fail after all training runs have finished.
output_path = Path("metrics_results/unbalanced_AE_metrics_output.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
metrics_df.write_csv(output_path, separator=';')