In [1]:
import polars as pl
import numpy as np
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [2]:
df_polars = pl.read_parquet('dataset.parquet')

In [3]:
df_polars = df_polars.sample(fraction=0.01, seed=42)

In [4]:
import ipaddress

def ip_to_int(ip: str) -> int:
    try:
        return int(ipaddress.ip_address(ip))  # Funciona tanto para IPv4 quanto IPv6
    except ValueError:
        return None


In [5]:
#df_polars = df_polars.with_columns([
#    pl.col('id.resp_h').map_elements(ip_to_int).alias('id.resp_h'),
#    pl.col('id.orig_h').map_elements(ip_to_int).alias('id.orig_h')
#])

In [6]:
df_polars = df_polars.with_columns([
    pl.col('duration').fill_null(0),
    pl.col('orig_bytes').fill_null(0),
    pl.col('resp_bytes').fill_null(0)
])

In [7]:
lista_colunas = df_polars.columns
colunas_para_spearman = ['id.resp_p', 'history', 'id.orig_h', 'conn_state', 'id.orig_p', 'orig_ip_bytes', 'label']
#['detailed-label', 'id.resp_p', 'history', 'id.orig_h', 'conn_state', 'id.orig_p', 'orig_ip_bytes']
colunas_para_dropar = [col for col in lista_colunas if col not in colunas_para_spearman]
df_polars = df_polars.drop(colunas_para_dropar)

In [8]:
df_polars = df_polars.drop_nulls()

In [9]:
X = df_polars.drop('label')
y = df_polars['label']       

In [10]:
X = X.to_numpy()
X[:, 0] = np.array([ip_to_int(ip) for ip in X[:, 0]])

In [11]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Treinamento

In [12]:
results = []

In [13]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout_rate):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, dropout=dropout_rate, bidirectional=False)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        lstm_out, (hidden, _) = self.lstm(x)
        hidden = self.relu(hidden[-1])
        hidden = self.dropout(hidden)
        output = self.fc(hidden)
        return output


In [14]:
def startTrain():

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

    y_train_np = y_train.to_numpy()
    y_test_np = y_test.to_numpy()

    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train_np, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test_np, dtype=torch.float32)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    input_dim = X_train.shape[1]  
    hidden_dim = 100  
    dropout_rate = 0.2  
    output_dim = 1  

    model = LSTMClassifier(input_dim, hidden_dim, output_dim, dropout_rate).to(device)

    batch_size = 5000
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    criterion = nn.BCEWithLogitsLoss() 
    optimizer = optim.Adam(model.parameters(), lr=0.001)


    epochs = 10
    #print(datetime.now)
    start_training = time.time()
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        for data in train_loader:
            inputs, targets = data
            inputs, targets = inputs.float().to(device), targets.float().to(device)
            inputs = inputs.unsqueeze(1) 
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item() * inputs.size(0)
        end_training = time.time()
        #print(f'Epoch {epoch+1}/{epochs}, fim em: {datetime.now()}')
    
        model.eval()
        with torch.no_grad():
            all_outputs = []
            all_targets = []
            for data in test_loader:
                inputs, targets = data
                inputs, targets = inputs.float().to(device), targets.float().to(device)
                inputs = inputs.unsqueeze(1)  
                outputs = model(inputs)
                all_outputs.append(outputs.cpu())
                all_targets.append(targets.cpu())
            
        all_outputs = torch.cat(all_outputs)
        all_targets = torch.cat(all_targets)

        y_pred = (all_outputs > 0.5).float().numpy()
        y_true = all_targets.numpy()
        evaluation_time = time.time()
        #print(f'Epoch {epoch+1}/{epochs}, avaliada em: {datetime.now()}')

    training_duration = end_training - start_training
    evaluation_duration = evaluation_time - end_training
    confusion = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = confusion.ravel()
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    specificity = tn / (tn + fp)
    f1 = f1_score(y_true, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
    false_alarm_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
        
    #results.append([epoch+1, accuracy, balanced_accuracy, precision, recall, specificity, f1, false_alarm_rate, tn, fp, fn, tp])
    results.append(["LSTM", accuracy, balanced_accuracy, precision, recall, specificity, f1, false_alarm_rate, tn, fp, fn, tp, training_duration, evaluation_duration])

In [15]:
for i in range(1,31):
    startTrain()
    print(i)



1




2




3




4




5




6




7




8




9




10




11




12




13




14




15




16




17




18




19




20




21




22




23




24




25




26




27




28




29




30


In [16]:
metrics_df = pl.DataFrame(
    results,
    schema=['Algorithm', 'Accuracy', 'Balanced Accuracy' , 'Precision', 'Recall', 'Specificity', 'F1-score', 'False Alarm Rate', 'tn', 'fp', 'fn', 'tp', 'training_duration', 'evaluation_duration']
)
metrics_df

  return dispatch(args[0].__class__)(*args, **kw)


Algorithm,Accuracy,Balanced Accuracy,Precision,Recall,Specificity,F1-score,False Alarm Rate,tn,fp,fn,tp,training_duration,evaluation_duration
str,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,f64,f64
"""LSTM""",0.955592,0.951546,0.989267,0.957502,0.945591,0.973125,0.054409,25252,1453,5944,133921,87.819078,2.644053
"""LSTM""",0.956265,0.950492,0.988583,0.958989,0.941996,0.973561,0.058004,25156,1549,5736,134129,92.902134,2.84773
"""LSTM""",0.957399,0.953425,0.989673,0.959275,0.947575,0.974237,0.052425,25305,1400,5696,134169,96.294487,2.118462
"""LSTM""",0.958624,0.953124,0.989199,0.96122,0.945029,0.975009,0.054971,25237,1468,5424,134441,88.887273,2.442634
"""LSTM""",0.959573,0.95431,0.989506,0.962056,0.946564,0.975588,0.053436,25278,1427,5307,134558,94.43725,2.685967
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""LSTM""",0.95776,0.952367,0.989072,0.960305,0.94443,0.974476,0.05557,25221,1484,5552,134313,75.999855,2.068467
"""LSTM""",0.957177,0.951839,0.988978,0.959697,0.943981,0.974117,0.056019,25209,1496,5637,134228,82.167194,2.382336
"""LSTM""",0.958966,0.953404,0.989239,0.961592,0.945216,0.975219,0.054784,25242,1463,5372,134493,98.986039,2.834619
"""LSTM""",0.955784,0.950403,0.98867,0.958324,0.942483,0.973261,0.057517,25169,1536,5829,134036,83.377934,2.162591


In [17]:
metrics_df.write_csv(f"metrics_results/unbalanced_LSTM_metrics_output.csv", separator=';')