In [1]:
import polars as pl
import numpy as np
import time
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import KFold


In [2]:
df_polars_raiz = pl.read_parquet('../dataset.parquet')

In [3]:
df_polars = df_polars_raiz.sample(fraction=0.01, seed=42)

In [4]:
import ipaddress

def ip_to_int(ip: str) -> int:
    try:
        return int(ipaddress.ip_address(ip))  # Funciona tanto para IPv4 quanto IPv6
    except ValueError:
        return None

In [5]:
#df_polars = df_polars.with_columns([
#    pl.col('id.resp_h').map_elements(ip_to_int).alias('id.resp_h'),
#    pl.col('id.orig_h').map_elements(ip_to_int).alias('id.orig_h')
#])

In [6]:
df_polars = df_polars.with_columns([
    pl.col('duration').fill_null(0),
    pl.col('orig_bytes').fill_null(0),
    pl.col('resp_bytes').fill_null(0),
])

In [7]:
lista_colunas = df_polars.columns
colunas_para_spearman = ['id.resp_p', 'history', 'conn_state', 'id.orig_p', 'orig_ip_bytes', 'label']             
#['detailed-label', 'id.resp_p', 'history', 'id.orig_h', 'conn_state', 'id.orig_p', 'orig_ip_bytes']
colunas_para_dropar = [col for col in lista_colunas if col not in colunas_para_spearman]
df_polars = df_polars.drop(colunas_para_dropar)

In [8]:
df_polars

id.orig_p,id.resp_p,conn_state,history,orig_ip_bytes,label
i32,i32,i64,i64,i64,i32
5526,37215,0,0,40,1
60403,23,2,7,40,1
13386,81,2,7,40,1
36097,37215,0,0,40,1
36097,37215,0,0,40,1
…,…,…,…,…,…
30535,8081,1,1,80,1
36097,37215,0,0,40,1
41258,23,1,1,120,1
36658,23,1,1,120,1


In [9]:
df_polars = df_polars.drop_nulls()

In [10]:
X = df_polars.drop('label')
y = df_polars['label']       

In [11]:
#X = X.to_numpy()
#X[:, 0] = np.array([ip_to_int(ip) for ip in X[:, 0]])

# Treinamento

In [12]:
results = []

In [13]:
def startTrain(X, y, algorithm, params, kfold):
    results_fold = []

    # K-Fold Cross Validation
    for train_idx, test_idx in kfold.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Normalizar os dados
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Definir o modelo com os parâmetros fornecidos
        model = algorithm.set_params(**params)
        model.fit(X_train, y_train)

        # Fazer previsões
        y_pred = model.predict(X_test)

        # Avaliar o modelo
        accuracy = accuracy_score(y_test, y_pred)
        results_fold.append(accuracy)

    # Acurácia média após K-Fold
    return np.mean(results_fold)

In [14]:
def grid_search(X, y):
    algorithms = [
        GaussianNB(),
        MultinomialNB(),
        BernoulliNB(),
        ComplementNB()
    ]
    algorithm_names = ['GaussianNB', 'MultinomialNB', 'BernoulliNB', 'ComplementNB']
    
    param_grid = {
        'GaussianNB': {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]},
        'MultinomialNB': {'alpha': [0.1, 0.5, 1.0, 2.0]},
        'BernoulliNB': {'alpha': [0.1, 0.5, 1.0, 2.0], 'binarize': [0.0, 0.1, 0.5]},
        'ComplementNB': {'alpha': [0.1, 0.5, 1.0, 2.0]}
    }

    best_accuracy = 0
    best_params = {}

    kfold = KFold(n_splits=5, shuffle=True)

    # Loop por cada modelo
    for algorithm, algorithm_name in zip(algorithms, algorithm_names):
        print(f"\nExecutando Grid Search para {algorithm_name}...")
        
        best_model_params = None
        best_model_score = -np.inf

        # Realizar Grid Search manualmente
        for params in [dict(zip(param_grid[algorithm_name].keys(), v)) for v in np.array(np.meshgrid(*param_grid[algorithm_name].values())).T.reshape(-1, len(param_grid[algorithm_name]))]:
            accuracy = startTrain(X, y, algorithm, params, kfold)

            print(f"Parâmetros: {params} -> Acurácia: {accuracy}")

            if accuracy > best_model_score:
                best_model_score = accuracy
                best_model_params = params

        # Atualizar melhores parâmetros
        if best_model_score > best_accuracy:
            best_accuracy = best_model_score
            best_params = {
                'model': algorithm_name,
                'params': best_model_params
            }

    # Exibir os melhores resultados
    print("\nMelhores Hiperparâmetros encontrados:")
    print(f"Modelo: {best_params['model']} - Parâmetros: {best_params['params']}")
    print(f"Melhor Acurácia Média: {best_accuracy}")

    return best_params


In [15]:
inicio = time.time()
best_params = grid_search(X, y)
fim = time.time()


Executando Grid Search para GaussianNB...
Parâmetros: {'var_smoothing': 1e-09} -> Acurácia: 0.885543784803491
Parâmetros: {'var_smoothing': 1e-08} -> Acurácia: 0.9366302663554725
Parâmetros: {'var_smoothing': 1e-07} -> Acurácia: 0.9476994754961598
Parâmetros: {'var_smoothing': 1e-06} -> Acurácia: 0.9796229667210199

Executando Grid Search para MultinomialNB...
Parâmetros: {'alpha': 0.1} -> Acurácia: 0.8396763147976749
Parâmetros: {'alpha': 0.5} -> Acurácia: 0.8396763147976749
Parâmetros: {'alpha': 1.0} -> Acurácia: 0.8396763147976749
Parâmetros: {'alpha': 2.0} -> Acurácia: 0.8396763147976749

Executando Grid Search para BernoulliNB...
Parâmetros: {'alpha': 0.1, 'binarize': 0.0} -> Acurácia: 0.8398492151933736
Parâmetros: {'alpha': 0.1, 'binarize': 0.1} -> Acurácia: 0.9818076370176897
Parâmetros: {'alpha': 0.1, 'binarize': 0.5} -> Acurácia: 0.8396709116800759
Parâmetros: {'alpha': 0.5, 'binarize': 0.0} -> Acurácia: 0.8396781158530933
Parâmetros: {'alpha': 0.5, 'binarize': 0.1} -> Acurá

In [16]:
metrics_df = pl.DataFrame(
    results,
    schema=['Algorithm', 'Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'Specificity', 'F1-score', 'False Alarm Rate', 'tn', 'fp', 'fn', 'tp', 'training_duration', 'evaluation_duration']
)
metrics_df

Algorithm,Accuracy,Balanced Accuracy,Precision,Recall,Specificity,F1-score,False Alarm Rate,tn,fp,fn,tp,training_duration,evaluation_duration
null,null,null,null,null,null,null,null,null,null,null,null,null,null


In [17]:
metrics_df.write_csv("metrics_results/unbalanced_bayes_metrics_output.csv", separator=';')

FileNotFoundError: O sistema não pode encontrar o caminho especificado. (os error 3): metrics_results/unbalanced_bayes_metrics_output.csv