In [1]:
import polars as pl
import numpy as np
import time
import ipaddress
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [2]:
df_polars_raiz = pl.read_parquet('../dataset.parquet')

In [3]:
df_polars = df_polars_raiz.sample(fraction=0.01, seed=42)

In [4]:
def ip_to_int(ip: str) -> int:
    try:
        return int(ipaddress.ip_address(ip))  # Funciona tanto para IPv4 quanto IPv6
    except ValueError:
        return None

In [5]:
#df_polars = df_polars.with_columns([
#    pl.col('id.resp_h').map_elements(ip_to_int).alias('id.resp_h'),
#    pl.col('id.orig_h').map_elements(ip_to_int).alias('id.orig_h')
#])

In [6]:
df_polars = df_polars.with_columns([
    pl.col('duration').fill_null(0),
    pl.col('orig_bytes').fill_null(0),
    pl.col('resp_bytes').fill_null(0)
])

In [7]:
lista_colunas = df_polars.columns
colunas_para_manter = ['ts', 'id.resp_p', 'history', 'conn_state', 'id.orig_p', 'orig_ip_bytes', 'label']  
colunas_para_dropar = [col for col in lista_colunas if col not in colunas_para_manter]
df_polars = df_polars.drop(colunas_para_dropar)

In [8]:
df_polars = df_polars.drop_nulls()


In [9]:
df_polars

ts,id.orig_p,id.resp_p,conn_state,history,orig_ip_bytes,label
f64,i32,i32,i64,i64,i64,i32
1.5322e9,5526,37215,0,0,40,1
1.5326e9,60403,23,2,7,40,1
1.5326e9,13386,81,2,7,40,1
1.5455e9,36097,37215,0,0,40,1
1.5454e9,36097,37215,0,0,40,1
…,…,…,…,…,…,…
1.5514e9,30535,8081,1,1,80,1
1.5454e9,36097,37215,0,0,40,1
1.5514e9,41258,23,1,1,120,1
1.5514e9,36658,23,1,1,120,1


In [10]:
df_polars = df_polars.with_columns(pl.col("ts").cast(pl.Int64))

In [11]:
df_polars

ts,id.orig_p,id.resp_p,conn_state,history,orig_ip_bytes,label
i64,i32,i32,i64,i64,i64,i32
1532150893,5526,37215,0,0,40,1
1532570324,60403,23,2,7,40,1
1532564882,13386,81,2,7,40,1
1545465243,36097,37215,0,0,40,1
1545398682,36097,37215,0,0,40,1
…,…,…,…,…,…,…
1551404209,30535,8081,1,1,80,1
1545414126,36097,37215,0,0,40,1
1551402739,41258,23,1,1,120,1
1551405886,36658,23,1,1,120,1


In [12]:
# Ordenar apenas pelo campo de tempo "ts"
df_polars = df_polars.sort("ts")

window_size = 5  # Tamanho da janela temporal
step_size = 1  # Passo entre janelas

def create_sequences(df, window_size, step_size):
    sequences, labels = [], []
    
    # Remover a coluna "ts" antes de converter para numpy
    group_np = df.drop("ts").to_numpy()
    
    for i in range(0, len(group_np) - window_size, step_size):
        seq = group_np[i:i + window_size, :-1]  # Características
        label = group_np[i + window_size - 1, -1]  # Última linha como rótulo
        sequences.append(seq)
        labels.append(label)
    
    return np.array(sequences), np.array(labels)


# Gerando X e y
X, y = create_sequences(df_polars, window_size, step_size)

In [13]:
print("Shape de X:", X.shape)
print("Shape de y:", y.shape)

Shape de X: (555228, 5, 5)
Shape de y: (555228,)


In [14]:
df_polars

ts,id.orig_p,id.resp_p,conn_state,history,orig_ip_bytes,label
i64,i32,i32,i64,i64,i64,i32
1525879873,37334,23,0,0,180,1
1525879921,40983,56742,0,0,180,0
1525879944,46566,8080,0,0,60,1
1525879960,36497,8080,0,0,60,1
1525880010,40761,2323,0,0,60,1
…,…,…,…,…,…,…
1569018535,56399,62336,4,7,0,1
1569018535,44790,62336,4,7,0,1
1569018535,9799,62336,4,7,0,1
1569018535,16739,62336,4,7,0,1


In [15]:
scaler = MinMaxScaler()
X = X.reshape(-1, X.shape[-1])
X = scaler.fit_transform(X)
X = X.reshape(-1, window_size, X.shape[-1])
results = []

# Treinamento

In [16]:
class GRUClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout_rate):
        super(GRUClassifier, self).__init__()
        self.gru = nn.GRU(input_dim, hidden_dim, batch_first=True, dropout=dropout_rate, bidirectional=False)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        _, hidden = self.gru(x)
        hidden = self.relu(hidden[-1])
        hidden = self.dropout(hidden)
        return self.fc(hidden)

In [17]:
results = []

In [18]:
def startTrain(hidden_dim, dropout_rate, num_epochs):
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    start_training = time.time()
    results_fold = []
    #num_epochs = 10  # Adicionado número de épocas para melhor treinamento

    for fold, (train_idx, test_idx) in enumerate(kfold.split(X, y), 1):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Normalização correta para evitar vazamento de dados
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train.reshape(-1, X_train.shape[-1])).reshape(-1, window_size, X_train.shape[-1])
        X_test = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(-1, window_size, X_test.shape[-1])

        X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
        X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
        y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        input_dim = X_train.shape[2]
        #hidden_dim = 100
        #dropout_rate = 0.2
        output_dim = 1

        model = GRUClassifier(input_dim, hidden_dim, output_dim, dropout_rate).to(device)

        criterion = nn.BCEWithLogitsLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        batch_size = 512
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size)

        for epoch in range(num_epochs):
            model.train()
            epoch_loss = 0
            for inputs, targets in train_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs.squeeze(), targets)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item() * inputs.size(0)
            #print(f"Fold {fold}, Época {epoch+1}/{num_epochs}, Loss: {epoch_loss / len(train_loader.dataset):.4f}")
        
        end_training = time.time()

        model.eval()
        all_outputs, all_targets = [], []
        with torch.no_grad():
            for inputs, targets in test_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                all_outputs.append(outputs.cpu())
                all_targets.append(targets.cpu())

        all_outputs = torch.cat(all_outputs)
        all_targets = torch.cat(all_targets)
        
        y_pred = (all_outputs > 0.5).float().numpy()
        y_true = all_targets.numpy()

        confusion = confusion_matrix(y_true, y_pred)
        tn, fp, fn, tp = confusion.ravel()
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        f1 = f1_score(y_true, y_pred)
        balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
        false_alarm_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
        
        training_duration = end_training - start_training
        evaluation_duration = time.time() - end_training
        avaliacao = [accuracy, balanced_accuracy, precision, recall, specificity, f1, false_alarm_rate, tn, fp, fn, tp, training_duration, evaluation_duration]
        #print(avaliacao)
        results_fold.append(avaliacao)

    results_fold_array = np.array(results_fold, dtype=np.float32)
    mean_results = np.mean(results_fold_array, axis=0)
    results.append(["GRU"] + mean_results.tolist())

In [19]:
#Melhores Hiperparâmetros encontrados: {'hidden_dim': 100, 'dropout_rate': 0.2, 'num_epochs': 10}

In [20]:
for i in range(1,11):
    startTrain(hidden_dim=100,dropout_rate=0.2, num_epochs=10)
    print(i)



1




2




3




4




5




6




7




8




9




10


In [21]:
metrics_df = pl.DataFrame(
    results,
    schema=['Algorithm', 'Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'Specificity', 'F1-score', 'False Alarm Rate', 'tn', 'fp', 'fn', 'tp', 'training_duration', 'evaluation_duration']
)
metrics_df

  return dispatch(args[0].__class__)(*args, **kw)


Algorithm,Accuracy,Balanced Accuracy,Precision,Recall,Specificity,F1-score,False Alarm Rate,tn,fp,fn,tp,training_duration,evaluation_duration
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""GRU""",0.993741,0.989042,0.996585,0.995959,0.982126,0.996272,0.017874,17484.800781,318.200012,376.799988,92865.796875,585.624695,2.811311
"""GRU""",0.993414,0.989229,0.996763,0.995389,0.98307,0.996075,0.01693,17501.599609,301.399994,430.0,92812.601562,583.140991,2.760882
"""GRU""",0.993359,0.988966,0.996652,0.995437,0.982496,0.996043,0.017504,17491.199219,311.799988,425.600006,92817.0,589.285156,2.794614
"""GRU""",0.993669,0.988993,0.996581,0.995878,0.982108,0.996229,0.017892,17484.400391,318.600006,384.399994,92858.203125,595.14856,2.751181
"""GRU""",0.993386,0.989265,0.996786,0.995333,0.983196,0.996059,0.016804,17503.800781,299.200012,435.200012,92807.398438,561.066711,2.673665
"""GRU""",0.993642,0.989261,0.996715,0.99571,0.982812,0.996212,0.017188,17497.0,306.0,400.0,92842.601562,567.010437,2.724485
"""GRU""",0.993491,0.988892,0.996582,0.995663,0.98212,0.996122,0.01788,17484.599609,318.399994,404.399994,92838.203125,593.248047,2.768949
"""GRU""",0.993403,0.989378,0.996836,0.995303,0.983454,0.996068,0.016546,17508.400391,294.600006,438.0,92804.601562,559.894287,2.682645
"""GRU""",0.993482,0.98923,0.996744,0.995489,0.982971,0.996116,0.017029,17499.800781,303.200012,420.600006,92822.0,515.8396,2.598066
"""GRU""",0.993432,0.989042,0.996669,0.995504,0.98258,0.996086,0.01742,17492.800781,310.200012,419.200012,92823.398438,572.918091,2.699635


In [22]:
metrics_df.write_csv(f"metrics_results/unbalanced_GRU_metrics_output.csv", separator=';')