In [19]:
import polars as pl
import numpy as np
import time
import ipaddress
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [2]:
df_polars_raiz = pl.read_parquet('../dataset.parquet')

In [3]:
df_polars = df_polars_raiz.sample(fraction=0.01, seed=42)

In [4]:
def ip_to_int(ip: str) -> int:
    try:
        return int(ipaddress.ip_address(ip))  # Funciona tanto para IPv4 quanto IPv6
    except ValueError:
        return None

In [5]:
#df_polars = df_polars.with_columns([
#    pl.col('id.resp_h').map_elements(ip_to_int).alias('id.resp_h'),
#    pl.col('id.orig_h').map_elements(ip_to_int).alias('id.orig_h')
#])

In [6]:
df_polars = df_polars.with_columns([
    pl.col('duration').fill_null(0),
    pl.col('orig_bytes').fill_null(0),
    pl.col('resp_bytes').fill_null(0)
])

In [7]:
lista_colunas = df_polars.columns
colunas_para_spearman = ['id.resp_p', 'history', 'id.orig_h', 'conn_state', 'id.orig_p', 'orig_ip_bytes', 'label', 'ts']
colunas_para_dropar = [col for col in lista_colunas if col not in colunas_para_spearman]
df_polars = df_polars.drop(colunas_para_dropar)

In [8]:
df_polars = df_polars.drop_nulls()


In [9]:
df_polars

ts,id.orig_h,id.orig_p,id.resp_p,conn_state,history,orig_ip_bytes,label
f64,str,i32,i32,i64,i64,i64,i32
1.5322e9,"""192.168.100.108""",5526,37215,0,0,40,1
1.5326e9,"""192.168.100.111""",60403,23,2,7,40,1
1.5326e9,"""192.168.100.111""",13386,81,2,7,40,1
1.5455e9,"""192.168.1.198""",36097,37215,0,0,40,1
1.5454e9,"""192.168.1.198""",36097,37215,0,0,40,1
…,…,…,…,…,…,…,…
1.5514e9,"""192.168.1.193""",30535,8081,1,1,80,1
1.5454e9,"""192.168.1.198""",36097,37215,0,0,40,1
1.5514e9,"""192.168.1.200""",41258,23,1,1,120,1
1.5514e9,"""192.168.1.200""",36658,23,1,1,120,1


In [10]:
df_polars = df_polars.with_columns(pl.col("ts").cast(pl.Datetime))


In [11]:
df_polars = df_polars.with_columns(pl.col("ts").cast(pl.Int64))


In [12]:
df_polars = df_polars.sort(["id.orig_h", "ts"])  # Ordenando por IP de origem e tempo

window_size = 5  # Tamanho da janela temporal
step_size = 1  # Passo entre janelas

def create_sequences(df, window_size, step_size):
    sequences, labels = [], []
    for group in df.partition_by("id.orig_h"):  # Correção aqui
        group_np = group.drop("id.orig_h").to_numpy()
        for i in range(0, len(group_np) - window_size, step_size):
            seq = group_np[i:i + window_size, :-1]  # Features
            label = group_np[i + window_size - 1, -1]  # Última linha como rótulo
            sequences.append(seq)
            labels.append(label)
    return np.array(sequences), np.array(labels)

X, y = create_sequences(df_polars, window_size, step_size)


In [13]:
df_polars

ts,id.orig_h,id.orig_p,id.resp_p,conn_state,history,orig_ip_bytes,label
i64,str,i32,i32,i64,i64,i64,i32
1537537487,"""0.0.0.0""",68,67,0,0,3280,0
1525996281,"""101.100.216.176""",3,3,3,4,68,0
1526142029,"""103.254.211.56""",3,3,3,4,68,0
1526071458,"""104.227.148.64""",3,3,3,4,68,0
1526031395,"""104.251.224.11""",3,10,3,4,88,0
…,…,…,…,…,…,…,…
1525912775,"""99.167.37.225""",3,1,3,4,88,0
1525881896,"""99.246.132.175""",3,3,3,4,88,0
1537529902,"""::""",135,136,3,8,64,0
1562160913,"""fe80::5bcc:698e:39d5:cdf""",133,134,2,2,192,0


In [14]:
scaler = MinMaxScaler()
X = X.reshape(-1, X.shape[-1])  # Flatten antes de normalizar
X = scaler.fit_transform(X)
X = X.reshape(-1, window_size, X.shape[-1])  # Reformatando após normalização
results = []

# Treinamento

In [15]:
class GRUClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout_rate):
        super(GRUClassifier, self).__init__()
        self.gru = nn.GRU(input_dim, hidden_dim, batch_first=True, dropout=dropout_rate, bidirectional=False)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        _, hidden = self.gru(x)
        hidden = self.relu(hidden[-1])
        hidden = self.dropout(hidden)
        return self.fc(hidden)  # Sem ativação aqui!

In [25]:
results = []

In [28]:
def startTrain():
    # KFold cross-validation com 5 folds
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    
    # Lista para armazenar os resultados de cada fold

    # Inicia o tempo de treinamento
    start_training = time.time()

    # Loop sobre os folds
    for fold, (train_idx, test_idx) in enumerate(kfold.split(X, y), 1):
        print(f"Fold {fold} começando...")

        # Divide os dados de treino e teste com base nos índices
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Converte os dados para tensores
        X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
        X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
        y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

        # Coloca o modelo no dispositivo correto (GPU ou CPU)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Definir os parâmetros do modelo
        input_dim = X_train.shape[2]  # Número de features por timestep
        hidden_dim = 100  # Número de neurônios na GRU
        dropout_rate = 0.2  # Dropout para evitar overfitting
        output_dim = 1  # Classificação binária

        # Cria o modelo
        model = GRUClassifier(input_dim, hidden_dim, output_dim, dropout_rate).to(device)

        # Define a função de perda e o otimizador
        criterion = nn.BCEWithLogitsLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        # Configura o DataLoader para treino e teste
        batch_size = 512
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size)

        # Treinamento
        model.train()
        epoch_loss = 0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item() * inputs.size(0)

        # Avaliação no conjunto de teste
        model.eval()
        all_outputs, all_targets = [], []
        with torch.no_grad():
            for inputs, targets in test_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                all_outputs.append(outputs.cpu())
                all_targets.append(targets.cpu())

        all_outputs = torch.cat(all_outputs)
        all_targets = torch.cat(all_targets)

        # Transformando a saída para 0 ou 1
        y_pred = (all_outputs > 0.5).float().numpy()
        y_true = all_targets.numpy()

        # Cálculo das métricas
        confusion = confusion_matrix(y_true, y_pred)
        tn, fp, fn, tp = confusion.ravel()
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        specificity = tn / (tn + fp)
        f1 = f1_score(y_true, y_pred)
        balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
        false_alarm_rate = fp / (fp + tn) if (fp + tn) > 0 else 0

        # Salva os resultados do fold
        end_training = time.time()
        training_duration = end_training - start_training
        evaluation_duration = time.time() - end_training
        print(f"Fold {fold}: Acurácia = {accuracy:.4f}, Precisão = {precision:.4f}, Recall = {recall:.4f}, F1 = {f1:.4f}")


    results.append([f"Fold {fold}", accuracy, balanced_accuracy, precision, recall, specificity, f1, false_alarm_rate, tn, fp, fn, tp, training_duration, evaluation_duration])

In [29]:
for i in range(1,2):
    startTrain()
    print(i)

Fold 1 começando...
Fold 1: Acurácia = 0.9914, Precisão = 0.9965, Recall = 0.9933, F1 = 0.9949
Fold 2 começando...




Fold 2: Acurácia = 0.9877, Precisão = 0.9980, Recall = 0.9874, F1 = 0.9927
Fold 3 começando...




Fold 3: Acurácia = 0.9932, Precisão = 0.9940, Recall = 0.9980, F1 = 0.9960
Fold 4 começando...




Fold 4: Acurácia = 0.9873, Precisão = 0.9979, Recall = 0.9869, F1 = 0.9924
Fold 5 começando...




Fold 5: Acurácia = 0.9871, Precisão = 0.9985, Recall = 0.9862, F1 = 0.9923
1


In [30]:
metrics_df = pl.DataFrame(
    results,
    schema=['Algorithm', 'Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'Specificity', 'F1-score', 'False Alarm Rate', 'tn', 'fp', 'fn', 'tp', 'training_duration', 'evaluation_duration']
)
metrics_df

  return dispatch(args[0].__class__)(*args, **kw)


Algorithm,Accuracy,Balanced Accuracy,Precision,Recall,Specificity,F1-score,False Alarm Rate,tn,fp,fn,tp,training_duration,evaluation_duration
str,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,f64,f64
"""Fold 5""",0.987149,0.989196,0.998502,0.98619,0.992203,0.992308,0.007797,17560,138,1288,91976,107.570482,0.0


In [19]:
metrics_df.write_csv(f"metrics_results/unbalanced_GRU_metrics_output.csv", separator=';')

In [None]:
# Exibindo a média dos resultados
print("\nMédia dos resultados após K-Fold cross-validation:")
print(results_df.mean())
