In [1]:
import polars as pl
import numpy as np
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import KFold
import torch.optim as optim

import math
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

In [2]:
df_polars_raiz = pl.read_parquet('../dataset.parquet')

In [3]:
df_polars = df_polars_raiz.sample(fraction=0.01, seed=42)

In [4]:
import ipaddress

def ip_to_int(ip: str) -> int:
    try:
        return int(ipaddress.ip_address(ip))  # Funciona tanto para IPv4 quanto IPv6
    except ValueError:
        return None

In [5]:
#df_polars = df_polars.with_columns([
#    pl.col('id.resp_h').map_elements(ip_to_int).alias('id.resp_h'),
#    pl.col('id.orig_h').map_elements(ip_to_int).alias('id.orig_h')
#])

In [6]:
df_polars

ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,resp_bytes,conn_state,local_orig,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,label,detailed-label
f64,str,str,i32,str,i32,i64,i64,f64,f64,f64,i64,str,str,i32,i64,i64,i64,i64,i64,str,i32,str
1.5322e9,"""C63mNb4FhdpSoHAj9g""","""192.168.100.108""",5526,"""47.138.157.173""",37215,0,0,0.0,0.0,0.0,0,"""-""","""-""",0,0,1,40,0,0,"""(empty)""",1,"""Okiru"""
1.5326e9,"""CppkaidEFlUgsYbOh""","""192.168.100.111""",60403,"""147.32.6.210""",23,1,0,0.0,0.0,0.0,2,"""-""","""-""",0,7,1,40,0,0,"""(empty)""",1,"""PartOfAHorizontalPortScan"""
1.5326e9,"""CNXYoV2KOSjeK0XsCi""","""192.168.100.111""",13386,"""50.229.102.190""",81,1,0,0.0,0.0,0.0,2,"""-""","""-""",0,7,1,40,0,0,"""(empty)""",1,"""PartOfAHorizontalPortScan"""
1.5455e9,"""Cv91cy38cVUjBeny4k""","""192.168.1.198""",36097,"""78.87.208.14""",37215,0,0,,,,0,"""-""","""-""",0,0,1,40,0,0,"""-""",1,"""Okiru"""
1.5454e9,"""CKvVZA3voOeDO627Ca""","""192.168.1.198""",36097,"""202.99.70.13""",37215,0,0,,,,0,"""-""","""-""",0,0,1,40,0,0,"""-""",1,"""Okiru"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1.5514e9,"""CwwQzn3658DIsMZ9uj""","""192.168.1.193""",30535,"""217.130.2.16""",8081,0,1,0.000005,0.0,0.0,1,"""-""","""-""",0,1,2,80,0,0,"""-""",1,"""PartOfAHorizontalPortScan"""
1.5454e9,"""CnykMt2RDdLvT42t8l""","""192.168.1.198""",36097,"""112.160.204.32""",37215,0,0,,,,0,"""-""","""-""",0,0,1,40,0,0,"""-""",1,"""Okiru"""
1.5514e9,"""CDjM7C2ZIehXyoB3f""","""192.168.1.200""",41258,"""134.130.151.164""",23,0,1,0.000002,0.0,0.0,1,"""-""","""-""",0,1,2,120,0,0,"""-""",1,"""PartOfAHorizontalPortScan"""
1.5514e9,"""CttXNg3OrZSHSsUTH5""","""192.168.1.200""",36658,"""171.251.198.162""",23,0,1,0.000214,0.0,0.0,1,"""-""","""-""",0,1,2,120,0,0,"""-""",1,"""PartOfAHorizontalPortScan"""


In [7]:
df_polars = df_polars.with_columns([
    pl.col('duration').fill_null(0),
    pl.col('orig_bytes').fill_null(0),
    pl.col('resp_bytes').fill_null(0)
])

In [8]:
lista_colunas = df_polars.columns
colunas_para_spearman = ['id.resp_p', 'history', 'conn_state', 'id.orig_p', 'orig_ip_bytes', 'label']             
#['detailed-label', 'id.resp_p', 'history', 'id.orig_h', 'conn_state', 'id.orig_p', 'orig_ip_bytes']
colunas_para_dropar = [col for col in lista_colunas if col not in colunas_para_spearman]
df_polars = df_polars.drop(colunas_para_dropar)

In [9]:
df_polars = df_polars.drop_nulls()

In [10]:
X = df_polars.drop('label')
y = df_polars['label']       

In [12]:
#X = X.to_numpy()
#X[:, 0] = np.array([ip_to_int(ip) for ip in X[:, 0]])

In [13]:
# calcular número desejado de características
def calculate_desired_num_features(current_num_features):
    root = math.sqrt(current_num_features)
    desired_num_features = math.ceil(root) ** 2
    
    return desired_num_features

In [14]:
print(X.shape)

(555233, 5)


# Treinamento

In [16]:
results = []
SAMPLE_2D_SIZE = 4

In [17]:
class CNNModel(nn.Module):
    def __init__(self, conv1_out_channels=64, conv2_out_channels=64, dropout_rate=0.2):
        super(CNNModel, self).__init__()
        
        self.conv1 = nn.Conv2d(1, conv1_out_channels, kernel_size=2, stride=1, padding=0)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(conv1_out_channels, conv2_out_channels, kernel_size=2, stride=1, padding=0)
        self.relu2 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        
        def conv2d_out_size(size, kernel_size=2, stride=1, padding=0):
            return (size - kernel_size + 2 * padding) // stride + 1
        
        size_after_conv1 = conv2d_out_size(SAMPLE_2D_SIZE, kernel_size=2)
        size_after_conv2 = conv2d_out_size(size_after_conv1, kernel_size=2)
        size_after_pool = conv2d_out_size(size_after_conv2, kernel_size=2, stride=2)
        
        self.fc1 = nn.Linear(conv2_out_channels * size_after_pool * size_after_pool, 64)
        self.relu3 = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(64, 1)
        
    def forward(self, x):
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.conv2(x)
        x = self.relu2(x)
        x = self.pool1(x)
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        x = self.relu3(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x.squeeze()


In [21]:
def startTrain(num_epochs, dropout_rate, learning_rate, conv1_out_channels, conv2_out_channels):
    kfold = KFold(n_splits=5, shuffle=True)
    results_fold = []
    for train_idx, test_idx in kfold.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
    
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
    
        desired_num_features = 16
        current_num_features = X_train_scaled.shape[1]
    
        if current_num_features < desired_num_features:
            padding = desired_num_features - current_num_features
            X_train_scaled = np.pad(X_train_scaled, ((0, 0), (0, padding)), 'constant')
            X_test_scaled = np.pad(X_test_scaled, ((0, 0), (0, padding)), 'constant')
        elif current_num_features > desired_num_features:
            raise ValueError("Número de características maior que o desejado.")
    
        X_train_scaled = X_train_scaled.reshape(-1, 1, SAMPLE_2D_SIZE, SAMPLE_2D_SIZE)
        X_test_scaled = X_test_scaled.reshape(-1, 1, SAMPLE_2D_SIZE, SAMPLE_2D_SIZE)
    
        X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
        X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
        y_test_tensor = torch.tensor(y_test, dtype=torch.float32)
    
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = CNNModel(conv1_out_channels=conv1_out_channels, conv2_out_channels=conv2_out_channels, dropout_rate=dropout_rate)
        criterion = nn.BCEWithLogitsLoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        # DataLoader
        batch_size = 512
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size)

        start_training = time.time()
        
        for epoch in range(num_epochs):
            model.train()
            for X_batch, y_batch in train_loader:
                optimizer.zero_grad()
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                loss.backward()
                optimizer.step()
        
        end_training = time.time()

        # Avaliação
        model.eval()
        all_preds = []
        all_labels = []
        
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                outputs = model(X_batch)
                preds = torch.sigmoid(outputs)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(y_batch.cpu().numpy())

        all_preds = np.array(all_preds)
        all_labels = np.array(all_labels)
        y_pred = (all_preds >= 0.5).astype(int)
        y_true = all_labels

        # Cálculo das métricas
        confusion = confusion_matrix(y_true, y_pred)
        tn, fp, fn, tp = confusion.ravel()
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        f1 = f1_score(y_true, y_pred)
        balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
        false_alarm_rate = fp / (fp + tn) if (fp + tn) > 0 else 0

        # Cálculo de tempos
        training_duration = end_training - start_training
        evaluation_duration = time.time() - end_training

        # Registro de resultados
        avaliacao = [
            accuracy, balanced_accuracy, precision, recall, specificity, f1, 
            false_alarm_rate, tn, fp, fn, tp, training_duration, evaluation_duration
        ]
        
        #print(avaliacao)
        results_fold.append(avaliacao)
        results_fold_array = np.array(results_fold, dtype=np.float32)
    mean_results = np.mean(results_fold_array, axis=0)
    results.append(["2DCNN"] + mean_results.tolist())

In [22]:
#{'conv1_out_channels': 64, 'conv2_out_channels': 64, 'dropout_rate': 0.3, 'learning_rate': 0.001, 'num_epochs': 10}

In [23]:
for i in range(1,11):
    startTrain(conv1_out_channels= 64, conv2_out_channels=64, dropout_rate= 0.3, learning_rate= 0.001, num_epochs=10)
    print(i)

1
2
3
4
5
6
7
8
9
10


In [24]:
metrics_df = pl.DataFrame(
    results,
    schema=['Algorithm', 'Accuracy', 'Balanced Accuracy' , 'Precision', 'Recall', 'Specificity', 'F1-score', 'False Alarm Rate', 'tn', 'fp', 'fn', 'tp', 'training_duration', 'evaluation_duration']
)
metrics_df

  return dispatch(args[0].__class__)(*args, **kw)


Algorithm,Accuracy,Balanced Accuracy,Precision,Recall,Specificity,F1-score,False Alarm Rate,tn,fp,fn,tp,training_duration,evaluation_duration
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""2DCNN""",0.990786,0.991981,0.998795,0.990221,0.99374,0.99449,0.00626,17691.800781,111.400002,911.799988,92331.601562,101.754005,2.834567
"""2DCNN""",0.990883,0.992117,0.998837,0.990297,0.993937,0.994548,0.006063,17695.599609,107.599998,904.799988,92338.601562,92.115456,1.862941
"""2DCNN""",0.990865,0.991828,0.998703,0.990408,0.993248,0.994538,0.006752,17683.199219,120.0,894.400024,92349.0,97.012299,1.986271
"""2DCNN""",0.990323,0.990966,0.998446,0.990018,0.991915,0.994214,0.008085,17659.400391,143.800003,930.799988,92312.601562,94.151932,1.876042
"""2DCNN""",0.990512,0.990654,0.998243,0.990444,0.990864,0.994328,0.009136,17640.599609,162.600006,891.0,92352.398438,96.819839,1.884552
"""2DCNN""",0.990301,0.991434,0.998672,0.989766,0.993103,0.994199,0.006897,17680.400391,122.800003,954.200012,92289.203125,98.801804,2.03023
"""2DCNN""",0.991099,0.992477,0.998942,0.990449,0.994506,0.994677,0.005494,17705.400391,97.800003,890.599976,92352.796875,101.986427,2.005214
"""2DCNN""",0.990771,0.991712,0.99867,0.990329,0.993094,0.994482,0.006906,17680.199219,123.0,901.799988,92341.601562,116.270119,2.219014
"""2DCNN""",0.990712,0.99144,0.998558,0.990369,0.992511,0.994447,0.007489,17669.800781,133.399994,898.0,92345.398438,105.205467,2.014874
"""2DCNN""",0.990946,0.991942,0.998733,0.990474,0.993409,0.994586,0.006591,17686.0,117.199997,888.200012,92355.203125,103.821091,2.090528


In [25]:
metrics_df.write_csv(f"metrics_results/unbalanced_2DCNN_metrics_output.csv", separator=';')