In [1]:
import polars as pl
import numpy as np
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score

import math
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

In [2]:
df_polars = pl.read_parquet('dataset.parquet')

In [3]:
df_polars = df_polars.sample(fraction=0.01, seed=42)

In [4]:
df_polars = df_polars.with_columns([
    pl.col('duration').fill_null(0),
    pl.col('orig_bytes').fill_null(0),
    pl.col('resp_bytes').fill_null(0)
])

In [5]:
df_polars = df_polars.drop(["ts", "uid", "id.orig_h", "id.resp_h", "local_orig", "local_resp", "missed_bytes" , "tunnel_parents", "detailed-label", "__index_level_0__"])

In [6]:
X = df_polars.drop('label')
y = df_polars['label']       

In [7]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [8]:
# calcular número desejado de características
def calculate_desired_num_features(current_num_features):
    root = math.sqrt(current_num_features)
    desired_num_features = math.ceil(root) ** 2
    
    return desired_num_features

In [9]:
desired_num_features = 16  # 4x4
current_num_features = X.shape[1]
desired_num_features = calculate_desired_num_features(current_num_features)

if current_num_features < desired_num_features:
    padding = desired_num_features - current_num_features
    X = np.pad(X, ((0, 0), (0, padding)), 'constant')
elif current_num_features > desired_num_features:
    raise ValueError("Número de características é maior que o desejado.")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [10]:
# Redimensione os dados
SAMPLE_2D_SIZE = 4  # 4x4
X_train = X_train.reshape(-1, 1, SAMPLE_2D_SIZE, SAMPLE_2D_SIZE)
X_test = X_test.reshape(-1, 1, SAMPLE_2D_SIZE, SAMPLE_2D_SIZE)

In [11]:
y_train_np = y_train.to_numpy()
y_test_np = y_test.to_numpy()

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_np, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_np, dtype=torch.float32)

In [12]:
batch_size = 10000
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Treinamento

In [13]:
class CNNModel(nn.Module):
    def __init__(self, dropout_rate=0.2):
        super(CNNModel, self).__init__()
        
        # Camadas Convolucionais
        self.conv1 = nn.Conv2d(1, 64, kernel_size=2, stride=1, padding=0)
        self.relu1 = nn.ReLU()
        
        self.conv2 = nn.Conv2d(64, 64, kernel_size=2, stride=1, padding=0)
        self.relu2 = nn.ReLU()
        
        # Camada de Pooling
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        
        # Calcula o tamanho da saída após as camadas convolucionais e de pooling
        def conv2d_out_size(size, kernel_size=2, stride=1, padding=0):
            return (size - kernel_size + 2 * padding) // stride + 1
        
        size_after_conv1 = conv2d_out_size(SAMPLE_2D_SIZE, kernel_size=2)
        size_after_conv2 = conv2d_out_size(size_after_conv1, kernel_size=2)
        size_after_pool = conv2d_out_size(size_after_conv2, kernel_size=2, stride=2)
        
        # Camada Fully Connected
        self.fc1 = nn.Linear(64 * size_after_pool * size_after_pool, 64)
        self.relu3 = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(64, 1)
        
    def forward(self, x):
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.conv2(x)
        x = self.relu2(x)
        x = self.pool1(x)
        
        x = x.view(x.size(0), -1)
        
        x = self.fc1(x)
        x = self.relu3(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x.squeeze()


model = CNNModel(dropout_rate=0.2)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 6
results = []
print(datetime.now)
for epoch in range(epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
    
    print(f'Epoch {epoch+1}/{epochs}, fim em: {datetime.now()}')
    
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            outputs = model(X_batch)
            preds = torch.sigmoid(outputs)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y_batch.cpu().numpy())
    
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)
    y_pred = (all_preds >= 0.5).astype(int)
    print(f'Epoch {epoch+1}/{epochs}, avaliada em: {datetime.now()}')

    confusion = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = confusion.ravel()
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    specificity = tn / (tn + fp)
    f1 = f1_score(y_test, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
    false_alarm_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
    
    results.append([epoch+1, accuracy, balanced_accuracy, precision, recall, specificity, f1, false_alarm_rate, tn, fp, fn, tp])


<built-in method now of type object at 0x00007FFA9A814FB0>
Epoch 1/6, fim em: 2024-09-16 10:24:22.321157
Epoch 1/6, avaliada em: 2024-09-16 10:24:24.918744
Epoch 2/6, fim em: 2024-09-16 10:24:32.787524
Epoch 2/6, avaliada em: 2024-09-16 10:24:35.246079
Epoch 3/6, fim em: 2024-09-16 10:24:43.323907
Epoch 3/6, avaliada em: 2024-09-16 10:24:45.788464
Epoch 4/6, fim em: 2024-09-16 10:24:53.678249
Epoch 4/6, avaliada em: 2024-09-16 10:24:56.189817
Epoch 5/6, fim em: 2024-09-16 10:25:04.144617
Epoch 5/6, avaliada em: 2024-09-16 10:25:06.887238
Epoch 6/6, fim em: 2024-09-16 10:25:17.079569
Epoch 6/6, avaliada em: 2024-09-16 10:25:20.202598


In [14]:
metrics_df = pl.DataFrame(
    results,
    schema=['epoch', 'Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'Specificity', 'F1-score', 'False Alarm Rate', 'tn', 'fp', 'fn', 'tp']
)
metrics_df

  return dispatch(args[0].__class__)(*args, **kw)


epoch,Accuracy,Balanced Accuracy,Precision,Recall,Specificity,F1-score,False Alarm Rate,tn,fp,fn,tp
i64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64
1,0.839677,0.5,0.839677,1.0,0.0,0.912853,1.0,0,26705,0,139865
2,0.839677,0.5,0.839677,1.0,0.0,0.912853,1.0,0,26705,0,139865
3,0.839677,0.5,0.839677,1.0,0.0,0.912853,1.0,0,26705,0,139865
4,0.980657,0.961472,0.987283,0.989712,0.933233,0.988496,0.066767,24922,1783,1439,138426
5,0.977241,0.959681,0.987343,0.985529,0.933833,0.986435,0.066167,24938,1767,2024,137841
6,0.976869,0.959414,0.987317,0.985107,0.93372,0.986211,0.06628,24935,1770,2083,137782
