In [1]:
import polars as pl
import numpy as np
import time
import ipaddress
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import GridSearchCV

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [2]:
df_polars_raiz = pl.read_parquet('../dataset.parquet')

In [3]:
df_polars = df_polars_raiz.sample(fraction=0.01, seed=42)

In [4]:
def ip_to_int(ip: str) -> int:
    try:
        return int(ipaddress.ip_address(ip))  # Funciona tanto para IPv4 quanto IPv6
    except ValueError:
        return None

In [5]:
#df_polars = df_polars.with_columns([
#    pl.col('id.resp_h').map_elements(ip_to_int).alias('id.resp_h'),
#    pl.col('id.orig_h').map_elements(ip_to_int).alias('id.orig_h')
#])

In [6]:
df_polars = df_polars.with_columns([
    pl.col('duration').fill_null(0),
    pl.col('orig_bytes').fill_null(0),
    pl.col('resp_bytes').fill_null(0)
])

In [7]:
lista_colunas = df_polars.columns
colunas_para_manter = ['ts', 'id.resp_p', 'history', 'conn_state', 'id.orig_p', 'orig_ip_bytes', 'label']  
colunas_para_dropar = [col for col in lista_colunas if col not in colunas_para_manter]
df_polars = df_polars.drop(colunas_para_dropar)

In [8]:
df_polars = df_polars.drop_nulls()

In [9]:
df_polars

ts,id.orig_p,id.resp_p,conn_state,history,orig_ip_bytes,label
f64,i32,i32,i64,i64,i64,i32
1.5322e9,5526,37215,0,0,40,1
1.5326e9,60403,23,2,7,40,1
1.5326e9,13386,81,2,7,40,1
1.5455e9,36097,37215,0,0,40,1
1.5454e9,36097,37215,0,0,40,1
…,…,…,…,…,…,…
1.5514e9,30535,8081,1,1,80,1
1.5454e9,36097,37215,0,0,40,1
1.5514e9,41258,23,1,1,120,1
1.5514e9,36658,23,1,1,120,1


In [10]:
df_polars = df_polars.with_columns(pl.col("ts").cast(pl.Datetime))
df_polars = df_polars.with_columns(pl.col("ts").cast(pl.Int64))

In [11]:
# Ordenar apenas pelo campo de tempo "ts"
df_polars = df_polars.sort("ts")

window_size = 5  # Tamanho da janela temporal
step_size = 1  # Passo entre janelas

def create_sequences(df, window_size, step_size):
    sequences, labels = [], []
    
    # Particiona os dados por "ts"
    for group in df.partition_by("ts"):
        # Remover a coluna "ts" antes de converter para numpy
        group_np = group.drop("ts").to_numpy()
        
        for i in range(0, len(group_np) - window_size, step_size):
            seq = group_np[i:i + window_size, :-1]  # Características (removendo o último valor que é o rótulo)
            label = group_np[i + window_size - 1, -1]  # Última linha como rótulo (a coluna 'label')
            sequences.append(seq)
            labels.append(label)
    
    return np.array(sequences), np.array(labels)

# Gerando X e y
X, y = create_sequences(df_polars, window_size, step_size)

In [12]:
df_polars

ts,id.orig_p,id.resp_p,conn_state,history,orig_ip_bytes,label
i64,i32,i32,i64,i64,i64,i32
1525879873,37334,23,0,0,180,1
1525879921,40983,56742,0,0,180,0
1525879944,46566,8080,0,0,60,1
1525879960,36497,8080,0,0,60,1
1525880010,40761,2323,0,0,60,1
…,…,…,…,…,…,…
1569018535,56399,62336,4,7,0,1
1569018535,44790,62336,4,7,0,1
1569018535,9799,62336,4,7,0,1
1569018535,16739,62336,4,7,0,1


# Seleção de Hiperparâmetros

In [13]:
class GRUClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout_rate):
        super(GRUClassifier, self).__init__()
        self.gru = nn.GRU(input_dim, hidden_dim, batch_first=True, dropout=dropout_rate, bidirectional=False)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        _, hidden = self.gru(x)
        hidden = self.relu(hidden[-1])
        hidden = self.dropout(hidden)
        return self.fc(hidden)

In [14]:
results = []

In [15]:

def startTrain(X_train, y_train, X_test, y_test, input_dim, hidden_dim, dropout_rate, num_epochs):
    num_epochs = num_epochs
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
    # Criar o modelo
    model = GRUClassifier(input_dim, hidden_dim, output_dim=1, dropout_rate=dropout_rate).to(device)

    # Configurações de perda e otimizador
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Converteção para tensores
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32).to(device)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32).to(device)

    # DataLoader
    batch_size = 512
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Teinamento do modelo
    for epoch in range(num_epochs):
            model.train()
            epoch_loss = 0
            for inputs, targets in train_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs.squeeze(), targets)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item() * inputs.size(0)

    # Avaliação do modelo
    model.eval()
    all_outputs, all_targets = [], []
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            all_outputs.append(outputs.cpu())
            all_targets.append(targets.cpu())

    all_outputs = torch.cat(all_outputs)
    all_targets = torch.cat(all_targets)

    y_pred = (all_outputs > 0.5).float().numpy()
    y_true = all_targets.numpy()

    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

In [16]:

def grid_search(X, y):
    kfold = KFold(n_splits=5, shuffle=True)
    
    # Definindo os hiperparâmetros para o GridSearch
    param_grid = {
        'hidden_dim': [50, 100, 200],  # Número de unidades na camada GRU
        'dropout_rate': [0.2, 0.3, 0.4],  # Taxa de dropout
        'num_epochs': [5, 10]  # Número de épocas
    }

    best_accuracy = 0
    best_params = {}

    # Iterando sobre as combinações de hiperparâmetros
    for hidden_dim in param_grid['hidden_dim']:
        for dropout_rate in param_grid['dropout_rate']:
            for num_epochs in param_grid['num_epochs']:
                fold_accuracies = []

                # K-Fold cross-validation
                for train_idx, test_idx in kfold.split(X, y):
                    X_train, X_test = X[train_idx], X[test_idx]
                    y_train, y_test = y[train_idx], y[test_idx]
                    
                    scaler = MinMaxScaler()
                    X_train = X_train.reshape(-1, X_train.shape[-1])  
                    X_train = scaler.fit_transform(X_train)
                    X_train = X_train.reshape(-1, window_size, X_train.shape[-1])  # Reformatação após normalização
                    
                    X_test = X_test.reshape(-1, X_test.shape[-1])  
                    X_test = scaler.transform(X_test)  # Normalização
                    X_test = X_test.reshape(-1, window_size, X_test.shape[-1]) 

                    # Treinar e avaliar o modelo com a combinação atual de hiperparâmetros
                    accuracy = startTrain(X_train, y_train, X_test, y_test, 
                                          input_dim=X_train.shape[2], 
                                          hidden_dim=hidden_dim, 
                                          dropout_rate=dropout_rate, 
                                          num_epochs=num_epochs)
                    fold_accuracies.append(accuracy)

                mean_accuracy = np.mean(fold_accuracies)
                print(f"hidden_dim={hidden_dim}, dropout_rate={dropout_rate}, num_epochs={num_epochs} -> Mean Accuracy: {mean_accuracy}")

                # Atualizar os melhores parâmetros
                if mean_accuracy > best_accuracy:
                    best_accuracy = mean_accuracy
                    best_params = {'hidden_dim': hidden_dim, 'dropout_rate': dropout_rate, 'num_epochs': num_epochs}

    print("\nMelhores Hiperparâmetros encontrados:", best_params)
    print(f"Melhor Acurácia Média: {best_accuracy}")

    return best_params

In [17]:
inicio = time.time()
best_params = grid_search(X,y)
fim = time.time()



hidden_dim=50, dropout_rate=0.2, num_epochs=5 -> Mean Accuracy: 0.9886623109381178




hidden_dim=50, dropout_rate=0.2, num_epochs=10 -> Mean Accuracy: 0.9967874991603732




hidden_dim=50, dropout_rate=0.3, num_epochs=5 -> Mean Accuracy: 0.9900928231045137




hidden_dim=50, dropout_rate=0.3, num_epochs=10 -> Mean Accuracy: 0.9967171488747566




hidden_dim=50, dropout_rate=0.4, num_epochs=5 -> Mean Accuracy: 0.9833395441479915




hidden_dim=50, dropout_rate=0.4, num_epochs=10 -> Mean Accuracy: 0.9968226760214394




hidden_dim=100, dropout_rate=0.2, num_epochs=5 -> Mean Accuracy: 0.9960723135257453




hidden_dim=100, dropout_rate=0.2, num_epochs=10 -> Mean Accuracy: 0.9968578583809308




hidden_dim=100, dropout_rate=0.3, num_epochs=5 -> Mean Accuracy: 0.9962012763342682




hidden_dim=100, dropout_rate=0.3, num_epochs=10 -> Mean Accuracy: 0.9967171578096974




hidden_dim=100, dropout_rate=0.4, num_epochs=5 -> Mean Accuracy: 0.9961778454835499




hidden_dim=100, dropout_rate=0.4, num_epochs=10 -> Mean Accuracy: 0.9966819830105405




hidden_dim=200, dropout_rate=0.2, num_epochs=5 -> Mean Accuracy: 0.99657646960992




hidden_dim=200, dropout_rate=0.2, num_epochs=10 -> Mean Accuracy: 0.9963419769055017




hidden_dim=200, dropout_rate=0.3, num_epochs=5 -> Mean Accuracy: 0.9965178612108325




hidden_dim=200, dropout_rate=0.3, num_epochs=10 -> Mean Accuracy: 0.9968226794579552




hidden_dim=200, dropout_rate=0.4, num_epochs=5 -> Mean Accuracy: 0.9965647469678782




hidden_dim=200, dropout_rate=0.4, num_epochs=10 -> Mean Accuracy: 0.9965647497170906

Melhores Hiperparâmetros encontrados: {'hidden_dim': 100, 'dropout_rate': 0.2, 'num_epochs': 10}
Melhor Acurácia Média: 0.9968578583809308


In [18]:
duracao = fim - inicio
print(duracao)

1926.483473777771


In [19]:
print("\nMelhores Hiperparâmetros encontrados:", best_params)


Melhores Hiperparâmetros encontrados: {'hidden_dim': 100, 'dropout_rate': 0.2, 'num_epochs': 10}
