In [1]:
import polars as pl
import numpy as np
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import KFold

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from imblearn.under_sampling import RandomUnderSampler

In [2]:
df_polars_raiz = pl.read_parquet('../dataset.parquet')

In [3]:
df_polars = df_polars_raiz.sample(fraction=0.01, seed=42)

In [4]:
import ipaddress

def ip_to_int(ip: str) -> int:
    try:
        return int(ipaddress.ip_address(ip))  # Funciona tanto para IPv4 quanto IPv6
    except ValueError:
        return None

In [5]:
#df_polars = df_polars.with_columns([
#    pl.col('id.resp_h').map_elements(ip_to_int).alias('id.resp_h'),
#    pl.col('id.orig_h').map_elements(ip_to_int).alias('id.orig_h')
#])

In [6]:
df_polars = df_polars.with_columns([
    pl.col('duration').fill_null(0),
    pl.col('orig_bytes').fill_null(0),
    pl.col('resp_bytes').fill_null(0)
])

In [7]:
lista_colunas = df_polars.columns
colunas_para_spearman = ['id.resp_p', 'history', 'conn_state', 'id.orig_p', 'orig_ip_bytes', 'label']             
#['detailed-label', 'id.resp_p', 'history', 'id.orig_h', 'conn_state', 'id.orig_p', 'orig_ip_bytes']
colunas_para_dropar = [col for col in lista_colunas if col not in colunas_para_spearman]
df_polars = df_polars.drop(colunas_para_dropar)

In [8]:
df_polars = df_polars.drop_nulls()

In [9]:
X = df_polars.drop('label')
y = df_polars['label']       

In [10]:
#X = X.to_numpy()
#X[:, 0] = np.array([ip_to_int(ip) for ip in X[:, 0]])

# Treinamento

In [11]:
results = []

In [12]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 8),
            nn.ReLU(), 
        )
        self.decoder = nn.Sequential(
            nn.Linear(8, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim) 
        )
        
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [13]:
def startTrain(X, y, hidden_dim=8, epochs=10, learning_rate=0.001, batch_size = 5000):
    kfold = KFold(n_splits=5, shuffle=True)
    results_fold = []

    for fold, (train_idx, test_idx) in enumerate(kfold.split(X)):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Aplicar MinMaxScaler APENAS aos dados de treino
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        

        rus = RandomUnderSampler()
        X_train_resampled, y_train_resampled = rus.fit_resample(X_train_scaled, y_train)
    
        # Filtrar apenas os dados benignos (classe 0) para treinar o Autoencoder
        X_train_resampled = X_train_resampled[y_train_resampled == 0]
        
        # Converter para tensores PyTorch
        X_train_tensor = torch.tensor(X_train_resampled, dtype=torch.float32)
        X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
        y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

        # Criar DataLoaders
        batch_size = batch_size
        train_dataset = TensorDataset(X_train_tensor)
        test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
        
        # Configuração do dispositivo e modelo
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        input_dim = X_train.shape[1]
        model = Autoencoder(input_dim=input_dim, hidden_dim=hidden_dim).to(device)

        # Definição da função de perda e otimizador
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        # Treinamento
        start_training = time.time()
        for epoch in range(epochs):
            model.train()
            epoch_loss = 0
            for data in train_loader:
                inputs = data[0].to(device)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, inputs)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item() * inputs.size(0)

        end_training = time.time()

        # Avaliação do limiar baseado no conjunto de treino
        model.eval()
        reconstruction_errors_train = []
        with torch.no_grad():
            for data in train_loader:
                inputs = data[0].to(device)
                outputs = model(inputs)
                reconstruction_error = torch.mean((outputs - inputs) ** 2, dim=1)
                reconstruction_errors_train.extend(reconstruction_error.cpu().numpy())

        reconstruction_errors_train = np.array(reconstruction_errors_train)
        threshold = np.percentile(reconstruction_errors_train, 95)  # Limiar baseado no percentil 95

        # Avaliação no conjunto de teste
        reconstruction_errors_test = []
        y_pred = []
        with torch.no_grad():
            for data in test_loader:
                inputs, targets = data[0].to(device), data[1].cpu().numpy()
                outputs = model(inputs)
                reconstruction_error = torch.mean((outputs - inputs) ** 2, dim=1).cpu().numpy()
                reconstruction_errors_test.extend(reconstruction_error)
                y_pred.extend((reconstruction_error > threshold).astype(int))

        # Conversão para numpy arrays
        reconstruction_errors_test = np.array(reconstruction_errors_test)
        y_pred = np.array(y_pred)

        # Cálculo de métricas de desempenho
        evaluation_time = time.time()
        confusion = confusion_matrix(y_test, y_pred)
        tn, fp, fn, tp = confusion.ravel()
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        f1 = f1_score(y_test, y_pred)
        balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
        false_alarm_rate = fp / (fp + tn) if (fp + tn) > 0 else 0

        training_duration = end_training - start_training
        evaluation_duration = time.time() - end_training
        avaliacao = [accuracy, balanced_accuracy, precision, recall, specificity, f1, false_alarm_rate, tn, fp, fn, tp, training_duration, evaluation_duration]
        #print(avaliacao)
        results_fold.append(avaliacao)

    results_fold_array = np.array(results_fold, dtype=np.float32)
    mean_results = np.mean(results_fold_array, axis=0)
    results.append(["AE"] + mean_results.tolist())

In [14]:
#Melhores Hiperparâmetros encontrados: {'hidden_dim': 16, 'epochs': 20, 'learning_rate': 0.001, 'batch_size': 512}

In [15]:
for i in range(1,11):
    startTrain(X, y, hidden_dim=16, epochs=20, learning_rate=0.001, batch_size=512)
    print(i)

1
2
3
4
5
6
7
8
9
10


In [16]:
metrics_df = pl.DataFrame(
    results,
    schema=['Algorithm', 'Accuracy', 'Balanced Accuracy' , 'Precision', 'Recall', 'Specificity', 'F1-score', 'False Alarm Rate', 'tn', 'fp', 'fn', 'tp', 'training_duration', 'evaluation_duration']
)
metrics_df

  return dispatch(args[0].__class__)(*args, **kw)


Algorithm,Accuracy,Balanced Accuracy,Precision,Recall,Specificity,F1-score,False Alarm Rate,tn,fp,fn,tp,training_duration,evaluation_duration
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""AE""",0.982494,0.969312,0.990418,0.988711,0.949913,0.989563,0.050087,16911.400391,891.799988,1052.199951,92191.203125,10.481748,1.326094
"""AE""",0.983421,0.969866,0.990429,0.98982,0.949913,0.990124,0.050087,16911.400391,891.799988,949.200012,92294.203125,10.195984,1.42185
"""AE""",0.972992,0.963733,0.990343,0.977378,0.950089,0.983682,0.049911,16914.599609,888.599976,2110.600098,91132.796875,10.615119,1.575993
"""AE""",0.983785,0.970034,0.990413,0.990275,0.949794,0.990344,0.050206,16909.400391,893.799988,906.799988,92336.601562,20.070271,2.266959
"""AE""",0.981773,0.968938,0.990436,0.987832,0.950044,0.989131,0.049956,16913.800781,889.400024,1134.599976,92108.796875,12.899594,1.613766
"""AE""",0.982906,0.969567,0.990428,0.989201,0.949933,0.989814,0.050067,16911.800781,891.400024,1006.799988,92236.601562,10.798182,1.380664
"""AE""",0.982146,0.969223,0.990472,0.988244,0.950201,0.989356,0.049799,16916.800781,886.400024,1096.199951,92147.203125,9.89394,1.347006
"""AE""",0.983126,0.96976,0.99046,0.989434,0.950087,0.989946,0.049913,16914.599609,888.599976,985.200012,92258.203125,9.879184,1.302922
"""AE""",0.983645,0.969939,0.990405,0.990114,0.949764,0.990259,0.050236,16908.800781,894.400024,921.799988,92321.601562,10.388388,1.374155
"""AE""",0.983373,0.969791,0.990408,0.989784,0.949799,0.990096,0.050201,16909.400391,893.799988,952.599976,92290.796875,10.50289,1.42697


In [None]:
metrics_df.write_csv(f"metrics_results/balanced_AE_metrics_output.csv", separator=';')