In [1]:
import polars as pl
import numpy as np
import time
import ipaddress
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [2]:
df_polars_raiz = pl.read_parquet('dataset.parquet')

In [3]:
df_polars = df_polars_raiz.sample(fraction=0.01, seed=42)

In [4]:
def ip_to_int(ip: str) -> int:
    try:
        return int(ipaddress.ip_address(ip))  # Funciona tanto para IPv4 quanto IPv6
    except ValueError:
        return None

In [5]:
#df_polars = df_polars.with_columns([
#    pl.col('id.resp_h').map_elements(ip_to_int).alias('id.resp_h'),
#    pl.col('id.orig_h').map_elements(ip_to_int).alias('id.orig_h')
#])

In [6]:
df_polars = df_polars.with_columns([
    pl.col('duration').fill_null(0),
    pl.col('orig_bytes').fill_null(0),
    pl.col('resp_bytes').fill_null(0)
])

In [7]:
lista_colunas = df_polars.columns
colunas_para_spearman = ['id.resp_p', 'history', 'id.orig_h', 'conn_state', 'id.orig_p', 'orig_ip_bytes', 'label', 'ts']
colunas_para_dropar = [col for col in lista_colunas if col not in colunas_para_spearman]
df_polars = df_polars.drop(colunas_para_dropar)

In [8]:
df_polars = df_polars.drop_nulls()


In [10]:
df_polars

ts,id.orig_h,id.orig_p,id.resp_p,conn_state,history,orig_ip_bytes,label
f64,str,i32,i32,i64,i64,i64,i32
1.5322e9,"""192.168.100.108""",5526,37215,0,0,40,1
1.5326e9,"""192.168.100.111""",60403,23,2,7,40,1
1.5326e9,"""192.168.100.111""",13386,81,2,7,40,1
1.5455e9,"""192.168.1.198""",36097,37215,0,0,40,1
1.5454e9,"""192.168.1.198""",36097,37215,0,0,40,1
…,…,…,…,…,…,…,…
1.5514e9,"""192.168.1.193""",30535,8081,1,1,80,1
1.5454e9,"""192.168.1.198""",36097,37215,0,0,40,1
1.5514e9,"""192.168.1.200""",41258,23,1,1,120,1
1.5514e9,"""192.168.1.200""",36658,23,1,1,120,1


In [12]:
df_polars = df_polars.with_columns(pl.col("ts").cast(pl.Datetime))


In [13]:
df_polars = df_polars.with_columns(pl.col("ts").cast(pl.Int64))


In [None]:
#df_polars = df_polars.with_columns(pl.col("ts").str.to_datetime("%Y-%m-%d %H:%M:%S").cast(pl.Int64))

SchemaError: invalid series dtype: expected `String`, got `f64` for series with name `ts`

In [19]:
df_polars = df_polars.sort(["id.orig_h", "ts"])  # Ordenando por IP de origem e tempo

window_size = 5  # Tamanho da janela temporal
step_size = 1  # Passo entre janelas

def create_sequences(df, window_size, step_size):
    sequences, labels = [], []
    for group in df.partition_by("id.orig_h"):  # Correção aqui
        group_np = group.drop("id.orig_h").to_numpy()
        for i in range(0, len(group_np) - window_size, step_size):
            seq = group_np[i:i + window_size, :-1]  # Features
            label = group_np[i + window_size - 1, -1]  # Última linha como rótulo
            sequences.append(seq)
            labels.append(label)
    return np.array(sequences), np.array(labels)

X, y = create_sequences(df_polars, window_size, step_size)


In [31]:
scaler = MinMaxScaler()
X = X.reshape(-1, X.shape[-1])  # Flatten antes de normalizar
X = scaler.fit_transform(X)
X = X.reshape(-1, window_size, X.shape[-1])  # Reformatando após normalização
results = []

# Treinamento

In [25]:
class GRUClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout_rate):
        super(GRUClassifier, self).__init__()
        self.gru = nn.GRU(input_dim, hidden_dim, batch_first=True, dropout=dropout_rate, bidirectional=False)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        _, hidden = self.gru(x)
        hidden = self.relu(hidden[-1])
        hidden = self.dropout(hidden)
        return self.fc(hidden)  # Sem ativação aqui!

In [32]:
def startTrain():
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    input_dim = X_train.shape[2]
    hidden_dim = 100  
    dropout_rate = 0.2  
    output_dim = 1  

    model = GRUClassifier(input_dim, hidden_dim, output_dim, dropout_rate).to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    batch_size = 512
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    epochs = 2
    start_training = time.time()

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0

        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item() * inputs.size(0)

        # Avaliação no conjunto de teste
        model.eval()
        all_outputs, all_targets = [], []
        with torch.no_grad():
            for inputs, targets in test_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                all_outputs.append(outputs.cpu())
                all_targets.append(targets.cpu())

        all_outputs = torch.cat(all_outputs)
        all_targets = torch.cat(all_targets)

        y_pred = (all_outputs > 0.5).float().numpy()
        y_true = all_targets.numpy()

        end_training = time.time()

        # Cálculo das métricas
        confusion = confusion_matrix(y_true, y_pred)
        tn, fp, fn, tp = confusion.ravel()
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        specificity = tn / (tn + fp)
        f1 = f1_score(y_true, y_pred)
        balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
        false_alarm_rate = fp / (fp + tn) if (fp + tn) > 0 else 0

        training_duration = end_training - start_training
        evaluation_duration = time.time() - end_training
        print(f'Epoch {epoch+1}: Acurácia = {accuracy:.4f}, Precisão = {precision:.4f}, Recall = {recall:.4f}, F1 = {f1:.4f}')

    results.append(["GRU", accuracy, balanced_accuracy, precision, recall, specificity, f1, false_alarm_rate, tn, fp, fn, tp, training_duration, evaluation_duration])


In [33]:
for i in range(1,2):
    startTrain()
    print(i)



Epoch 1: Acurácia = 0.9912, Precisão = 0.9966, Recall = 0.9930, F1 = 0.9948
Epoch 2: Acurácia = 0.9952, Precisão = 0.9955, Recall = 0.9988, F1 = 0.9971
1


In [34]:
metrics_df = pl.DataFrame(
    results,
    schema=['Algorithm', 'Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'Specificity', 'F1-score', 'False Alarm Rate', 'tn', 'fp', 'fn', 'tp', 'training_duration', 'evaluation_duration']
)
metrics_df

  return dispatch(args[0].__class__)(*args, **kw)


Algorithm,Accuracy,Balanced Accuracy,Precision,Recall,Specificity,F1-score,False Alarm Rate,tn,fp,fn,tp,training_duration,evaluation_duration
str,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,f64,f64
"""GRU""",0.995158,0.987462,0.995482,0.99877,0.976155,0.997123,0.023845,25954,634,172,139683,36.010492,0.36785


In [18]:
metrics_df.write_csv(f"metrics_results/unbalanced_GRU_metrics_output.csv", separator=';')