In [1]:
import polars as pl
import numpy as np
import time
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import KFold
from imblearn.under_sampling import RandomUnderSampler

In [2]:
df_polars_raiz = pl.read_parquet('../dataset.parquet')

In [3]:
df_polars = df_polars_raiz.sample(fraction=0.01, seed=42)

In [4]:
import ipaddress

def ip_to_int(ip: str) -> int:
    try:
        return int(ipaddress.ip_address(ip))  # Funciona tanto para IPv4 quanto IPv6
    except ValueError:
        return None


In [5]:
#df_polars = df_polars.with_columns([
#    pl.col('id.resp_h').map_elements(ip_to_int).alias('id.resp_h'),
#    pl.col('id.orig_h').map_elements(ip_to_int).alias('id.orig_h')
#])

In [6]:
df_polars = df_polars.with_columns([
    pl.col('duration').fill_null(0),
    pl.col('orig_bytes').fill_null(0),
    pl.col('resp_bytes').fill_null(0)
])

In [7]:
lista_colunas = df_polars.columns
colunas_para_spearman = ['id.resp_p', 'history', 'conn_state', 'id.orig_p', 'orig_ip_bytes', 'label']             
#['detailed-label', 'id.resp_p', 'history', 'id.orig_h', 'conn_state', 'id.orig_p', 'orig_ip_bytes']
colunas_para_dropar = [col for col in lista_colunas if col not in colunas_para_spearman]
df_polars = df_polars.drop(colunas_para_dropar)

In [8]:
df_polars = df_polars.drop_nulls()

In [9]:
X = df_polars.drop('label')
y = df_polars['label']       

In [10]:
#X = X.to_numpy()
#X[:, 0] = np.array([ip_to_int(ip) for ip in X[:, 0]])

In [11]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Treinamento

In [12]:
results = []

In [13]:
def startTrain(X, y, hidden_layer_sizes, activation, alpha, max_iter):
    kfold = KFold(n_splits=5, shuffle=True)
    results_fold = []
    
    for train_idx, test_idx in kfold.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        #rus = RandomUnderSampler()
        #X_train_resampled, y_train_resampled = rus.fit_resample(X_train_scaled, y_train)
        
        mlp = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver='adam', alpha=alpha, max_iter=max_iter)
    
        start_training = time.time()
        mlp.fit(X_train_scaled, y_train)
        end_training = time.time()

        y_pred = mlp.predict(X_test_scaled)
        training_duration = end_training - start_training
        evaluation_duration = time.time() - end_training

        confusion = confusion_matrix(y_test, y_pred)
        tn, fp, fn, tp = confusion.ravel()
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        specificity = tn / (tn + fp)
        f1 = f1_score(y_test, y_pred)
        balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
        false_alarm_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
        

        avaliacao = [accuracy, balanced_accuracy, precision, recall, specificity, f1, false_alarm_rate, tn, fp, fn, tp, training_duration, evaluation_duration]
        #print(avaliacao)
        results_fold.append(avaliacao)

    results_fold_array = np.array(results_fold, dtype=np.float32)
    mean_results = np.mean(results_fold_array, axis=0)
    results.append(["MLP"] + mean_results.tolist())

In [14]:
#Melhores Hiperparâmetros encontrados: {'hidden_layer_sizes': (128, 64, 32), 'activation': 'relu', 'alpha': 0.0001, 'max_iter': 300}

In [15]:
for i in range(1,11):
    startTrain(X=X, y=y, hidden_layer_sizes=(128,64,32), activation='relu', alpha=0.0001, max_iter=300)
    print(i)

1
2
3
4
5
6
7
8
9
10


In [16]:
metrics_df = pl.DataFrame(
    results,
    schema=['Algorithm', 'Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'Specificity', 'F1-score', 'False Alarm Rate', 'tn', 'fp', 'fn', 'tp', 'training_duration', 'evaluation_duration']
)
metrics_df

  return dispatch(args[0].__class__)(*args, **kw)


Algorithm,Accuracy,Balanced Accuracy,Precision,Recall,Specificity,F1-score,False Alarm Rate,tn,fp,fn,tp,training_duration,evaluation_duration
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""MLP""",0.99216,0.994435,0.999574,0.991086,0.997785,0.995312,0.002215,17763.800781,39.400002,831.200012,92412.203125,138.915176,0.287304
"""MLP""",0.99207,0.993993,0.999388,0.991163,0.996823,0.995258,0.003177,17746.599609,56.599998,824.0,92419.398438,93.999474,0.294739
"""MLP""",0.992092,0.994214,0.999487,0.99109,0.997339,0.995271,0.002661,17755.800781,47.400002,830.799988,92412.601562,112.110008,0.302916
"""MLP""",0.992081,0.99438,0.99957,0.990996,0.997765,0.995264,0.002235,17763.400391,39.799999,839.599976,92403.796875,123.359055,0.295878
"""MLP""",0.992072,0.994192,0.999483,0.99107,0.997313,0.995259,0.002686,17755.400391,47.799999,832.599976,92410.796875,126.502357,0.289908
"""MLP""",0.992122,0.994259,0.9995,0.991114,0.997405,0.995289,0.002595,17757.0,46.200001,828.599976,92414.796875,108.114037,0.209203
"""MLP""",0.99211,0.994202,0.999477,0.991122,0.997282,0.995282,0.002718,17754.800781,48.400002,827.799988,92415.601562,84.11763,0.161317
"""MLP""",0.992072,0.99418,0.999477,0.991077,0.997282,0.995259,0.002718,17754.800781,48.400002,832.0,92411.398438,65.27034,0.162234
"""MLP""",0.992088,0.994185,0.999474,0.991098,0.997272,0.995269,0.002728,17754.599609,48.599998,830.0,92413.398438,68.261528,0.144467
"""MLP""",0.992086,0.994095,0.999431,0.99114,0.99705,0.995268,0.00295,17750.599609,52.599998,826.200012,92417.203125,76.341675,0.13663


In [17]:
metrics_df.write_csv("metrics_results/unbalanced_mlp_metrics_output.csv", separator=';')