In [1]:
import polars as pl
import numpy as np
import time
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import KFold
from imblearn.under_sampling import RandomUnderSampler

In [2]:
df_polars_raiz = pl.read_parquet('../dataset.parquet')

In [3]:
df_polars = df_polars_raiz.sample(fraction=0.01, seed=42)

In [4]:
import ipaddress

def ip_to_int(ip: str) -> int:
    try:
        return int(ipaddress.ip_address(ip))  # Funciona tanto para IPv4 quanto IPv6
    except ValueError:
        return None


In [5]:
#df_polars = df_polars.with_columns([
#    pl.col('id.resp_h').map_elements(ip_to_int).alias('id.resp_h'),
#    pl.col('id.orig_h').map_elements(ip_to_int).alias('id.orig_h')
#])

In [6]:
df_polars = df_polars.with_columns([
    pl.col('duration').fill_null(0),
    pl.col('orig_bytes').fill_null(0),
    pl.col('resp_bytes').fill_null(0)
])

In [7]:
lista_colunas = df_polars.columns
colunas_para_spearman = ['id.resp_p', 'history', 'conn_state', 'id.orig_p', 'orig_ip_bytes', 'label']             
#['detailed-label', 'id.resp_p', 'history', 'id.orig_h', 'conn_state', 'id.orig_p', 'orig_ip_bytes']
colunas_para_dropar = [col for col in lista_colunas if col not in colunas_para_spearman]
df_polars = df_polars.drop(colunas_para_dropar)

In [8]:
df_polars = df_polars.drop_nulls()

In [9]:
X = df_polars.drop('label')
y = df_polars['label']       

In [10]:
#X = X.to_numpy()
#X[:, 0] = np.array([ip_to_int(ip) for ip in X[:, 0]])

In [11]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Treinamento

In [12]:
results = []

In [13]:
def startTrain(X, y, hidden_layer_sizes, activation, alpha, max_iter):
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    results_fold = []
    
    for train_idx, test_idx in kfold.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        rus = RandomUnderSampler(random_state=42)
        X_train_resampled, y_train_resampled = rus.fit_resample(X_train_scaled, y_train)
        
        mlp = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver='adam', alpha=alpha, max_iter=max_iter)
    
        start_training = time.time()
        mlp.fit(X_train_resampled, y_train_resampled)
        end_training = time.time()

        y_pred = mlp.predict(X_test_scaled)
        evaluation_time = time.time()

        accuracy = accuracy_score(y_test, y_pred)
        results_fold.append(accuracy)

    return np.mean(results_fold)

In [14]:
def grid_search():
    param_grid = {
        'hidden_layer_sizes': [(64, 32, 16), (128, 64, 32)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001],
        'max_iter': [10, 50, 100, 200, 300] 
    }
    
    best_accuracy = 0
    best_params = {}
    
    for hidden_layer_sizes in param_grid['hidden_layer_sizes']:
        for activation in param_grid['activation']:
            for alpha in param_grid['alpha']:
                for max_iter in param_grid['max_iter']:  
                    accuracy = startTrain(X, y, hidden_layer_sizes=hidden_layer_sizes, activation=activation, alpha=alpha, max_iter=max_iter)
                    print(f"hidden_layer_sizes={hidden_layer_sizes}, activation={activation}, alpha={alpha}, max_iter={max_iter} -> Accuracy: {accuracy}")
                    
                    if accuracy > best_accuracy:
                        best_accuracy = accuracy
                        best_params = {'hidden_layer_sizes': hidden_layer_sizes, 'activation': activation, 'alpha': alpha, 'max_iter': max_iter}
    
    print("\nMelhores Hiperparâmetros encontrados:", best_params)
    print(f"Melhor Acurácia Média: {best_accuracy}")
    return best_params

In [15]:
inicio = time.time()
best_params = grid_search()
fim = time.time()



hidden_layer_sizes=(64, 32, 16), activation=relu, alpha=0.0001, max_iter=10 -> Accuracy: 0.9900492216863983




hidden_layer_sizes=(64, 32, 16), activation=relu, alpha=0.0001, max_iter=50 -> Accuracy: 0.9911748749813721
hidden_layer_sizes=(64, 32, 16), activation=relu, alpha=0.0001, max_iter=100 -> Accuracy: 0.9913117552256141
hidden_layer_sizes=(64, 32, 16), activation=relu, alpha=0.0001, max_iter=200 -> Accuracy: 0.9915332840202986
hidden_layer_sizes=(64, 32, 16), activation=relu, alpha=0.0001, max_iter=300 -> Accuracy: 0.9914198185669415




hidden_layer_sizes=(64, 32, 16), activation=relu, alpha=0.001, max_iter=10 -> Accuracy: 0.9894566785733095




hidden_layer_sizes=(64, 32, 16), activation=relu, alpha=0.001, max_iter=50 -> Accuracy: 0.9908308773376258
hidden_layer_sizes=(64, 32, 16), activation=relu, alpha=0.001, max_iter=100 -> Accuracy: 0.9913964035327739
hidden_layer_sizes=(64, 32, 16), activation=relu, alpha=0.001, max_iter=200 -> Accuracy: 0.9913603844031048
hidden_layer_sizes=(64, 32, 16), activation=relu, alpha=0.001, max_iter=300 -> Accuracy: 0.9913964047005315




hidden_layer_sizes=(64, 32, 16), activation=tanh, alpha=0.0001, max_iter=10 -> Accuracy: 0.987951005471994




hidden_layer_sizes=(64, 32, 16), activation=tanh, alpha=0.0001, max_iter=50 -> Accuracy: 0.9912343105400305




hidden_layer_sizes=(64, 32, 16), activation=tanh, alpha=0.0001, max_iter=100 -> Accuracy: 0.9915080693093152
hidden_layer_sizes=(64, 32, 16), activation=tanh, alpha=0.0001, max_iter=200 -> Accuracy: 0.9914288227087138
hidden_layer_sizes=(64, 32, 16), activation=tanh, alpha=0.0001, max_iter=300 -> Accuracy: 0.9914954618240728




hidden_layer_sizes=(64, 32, 16), activation=tanh, alpha=0.001, max_iter=10 -> Accuracy: 0.9877240718405119




hidden_layer_sizes=(64, 32, 16), activation=tanh, alpha=0.001, max_iter=50 -> Accuracy: 0.991254120981876




hidden_layer_sizes=(64, 32, 16), activation=tanh, alpha=0.001, max_iter=100 -> Accuracy: 0.9912775342481881
hidden_layer_sizes=(64, 32, 16), activation=tanh, alpha=0.001, max_iter=200 -> Accuracy: 0.9914990625238692
hidden_layer_sizes=(64, 32, 16), activation=tanh, alpha=0.001, max_iter=300 -> Accuracy: 0.9912919435997921




hidden_layer_sizes=(128, 64, 32), activation=relu, alpha=0.0001, max_iter=10 -> Accuracy: 0.9906903967341835




hidden_layer_sizes=(128, 64, 32), activation=relu, alpha=0.0001, max_iter=50 -> Accuracy: 0.9915224782392288
hidden_layer_sizes=(128, 64, 32), activation=relu, alpha=0.0001, max_iter=100 -> Accuracy: 0.9917097860402686
hidden_layer_sizes=(128, 64, 32), activation=relu, alpha=0.0001, max_iter=200 -> Accuracy: 0.9917530126353828
hidden_layer_sizes=(128, 64, 32), activation=relu, alpha=0.0001, max_iter=300 -> Accuracy: 0.9918394615924896




hidden_layer_sizes=(128, 64, 32), activation=relu, alpha=0.001, max_iter=10 -> Accuracy: 0.9905751257490033




hidden_layer_sizes=(128, 64, 32), activation=relu, alpha=0.001, max_iter=50 -> Accuracy: 0.9913910001718922
hidden_layer_sizes=(128, 64, 32), activation=relu, alpha=0.001, max_iter=100 -> Accuracy: 0.9914342258425315
hidden_layer_sizes=(128, 64, 32), activation=relu, alpha=0.001, max_iter=200 -> Accuracy: 0.9915152730768613
hidden_layer_sizes=(128, 64, 32), activation=relu, alpha=0.001, max_iter=300 -> Accuracy: 0.99153508517303




hidden_layer_sizes=(128, 64, 32), activation=tanh, alpha=0.0001, max_iter=10 -> Accuracy: 0.9888191079718768




hidden_layer_sizes=(128, 64, 32), activation=tanh, alpha=0.0001, max_iter=50 -> Accuracy: 0.9915548963609432
hidden_layer_sizes=(128, 64, 32), activation=tanh, alpha=0.0001, max_iter=100 -> Accuracy: 0.9914504355034864
hidden_layer_sizes=(128, 64, 32), activation=tanh, alpha=0.0001, max_iter=200 -> Accuracy: 0.9914810538024152
hidden_layer_sizes=(128, 64, 32), activation=tanh, alpha=0.0001, max_iter=300 -> Accuracy: 0.9915765098369077




hidden_layer_sizes=(128, 64, 32), activation=tanh, alpha=0.001, max_iter=10 -> Accuracy: 0.9877294731253798




hidden_layer_sizes=(128, 64, 32), activation=tanh, alpha=0.001, max_iter=50 -> Accuracy: 0.9912559228320184
hidden_layer_sizes=(128, 64, 32), activation=tanh, alpha=0.001, max_iter=100 -> Accuracy: 0.9914414304696768
hidden_layer_sizes=(128, 64, 32), activation=tanh, alpha=0.001, max_iter=200 -> Accuracy: 0.991472048184727
hidden_layer_sizes=(128, 64, 32), activation=tanh, alpha=0.001, max_iter=300 -> Accuracy: 0.991410812722189

Melhores Hiperparâmetros encontrados: {'hidden_layer_sizes': (128, 64, 32), 'activation': 'relu', 'alpha': 0.0001, 'max_iter': 300}
Melhor Acurácia Média: 0.9918394615924896


In [16]:
duracao = fim - inicio
print(duracao)

7640.104868173599


In [17]:
print("\nMelhores Hiperparâmetros encontrados:", best_params)


Melhores Hiperparâmetros encontrados: {'hidden_layer_sizes': (128, 64, 32), 'activation': 'relu', 'alpha': 0.0001, 'max_iter': 300}
