In [1]:
import time
from pathlib import Path

import numpy as np
import polars as pl
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.preprocessing import MinMaxScaler


In [2]:
# Load the full dataset from a Parquet file in the parent directory
# (the path is relative to the current working directory).
df_polars_raiz = pl.read_parquet('../dataset.parquet')

In [3]:
# Work on a 1% random sample of the full frame; the fixed seed keeps the
# sample identical between runs.
df_polars = df_polars_raiz.sample(fraction=0.01, seed=42)

In [4]:
import ipaddress

def ip_to_int(ip: str) -> "int | None":
    """Convert a textual IP address to its integer representation.

    Args:
        ip: Address to convert. Both IPv4 and IPv6 notations are accepted.

    Returns:
        The address as an integer, or ``None`` when ``ip`` cannot be parsed
        as an IP address.
    """
    # The return annotation is quoted so it is never evaluated at definition
    # time and therefore also works on interpreters older than Python 3.10.
    try:
        return int(ipaddress.ip_address(ip))  # Works for both IPv4 and IPv6
    except ValueError:
        # Unparsable input is mapped to a missing value instead of raising.
        return None

In [5]:
#df_polars = df_polars.with_columns([
#    pl.col('id.resp_h').map_elements(ip_to_int).alias('id.resp_h'),
#    pl.col('id.orig_h').map_elements(ip_to_int).alias('id.orig_h')
#])

In [6]:
# Replace missing values with 0 in the duration / byte-count columns.
colunas_com_nulos = ['duration', 'orig_bytes', 'resp_bytes']
df_polars = df_polars.with_columns(
    pl.col(colunas_com_nulos).fill_null(0)
)

In [7]:
# Columns that are kept (the target `label` included); every other column
# of the frame is dropped below.
colunas_para_spearman = [
    'id.resp_p',
    'history',
    'conn_state',
    'id.orig_p',
    'orig_ip_bytes',
    'label',
]
# Alternative column list left commented out by the author:
#['detailed-label', 'id.resp_p', 'history', 'id.orig_h', 'conn_state', 'id.orig_p', 'orig_ip_bytes']
lista_colunas = df_polars.columns
colunas_para_dropar = [nome for nome in lista_colunas if nome not in colunas_para_spearman]
df_polars = df_polars.drop(colunas_para_dropar)

In [8]:
# Display the frame after the column selection.
df_polars

id.orig_p,id.resp_p,conn_state,history,orig_ip_bytes,label
i32,i32,i64,i64,i64,i32
5526,37215,0,0,40,1
60403,23,2,7,40,1
13386,81,2,7,40,1
36097,37215,0,0,40,1
36097,37215,0,0,40,1
…,…,…,…,…,…
30535,8081,1,1,80,1
36097,37215,0,0,40,1
41258,23,1,1,120,1
36658,23,1,1,120,1


In [9]:
# Discard every row that still has a null in any of the remaining columns.
df_polars = df_polars.drop_nulls()

In [10]:
# Separate the target column from the feature columns.
y = df_polars.get_column('label')
X = df_polars.drop('label')

In [11]:
#X = X.to_numpy()
#X[:, 0] = np.array([ip_to_int(ip) for ip in X[:, 0]])

# Treinamento

In [12]:
# Module-level accumulator: startTrain appends one metrics row per
# cross-validation fold to this list.
results = []

In [16]:
def startTrain(X, y, model, random_state=None):
    """Evaluate ``model`` with shuffled 5-fold cross-validation.

    For every fold the features are min-max scaled (the scaler is fitted on
    the training split only), the model is fitted on the training split and
    used to predict the test split. One row of metrics per fold is appended
    to the module-level ``results`` list in this order::

        [algorithm, accuracy, balanced_accuracy, precision, recall,
         specificity, f1, false_alarm_rate, tn, fp, fn, tp,
         training_duration, evaluation_duration]

    Args:
        X: Feature table. Must support row selection with an integer index
            array (``X[idx]``).
        y: Target values aligned with ``X``, indexable the same way.
            NOTE(review): the tn/fp/fn/tp unpacking assumes a binary target
            whose larger label is the positive class - confirm.
        model: Estimator exposing ``fit`` and ``predict``; it is re-fitted on
            every fold.
        random_state: Optional seed for the fold shuffling. ``None`` (the
            default) yields a different shuffle on every call.

    Returns:
        None. The metrics are appended to ``results`` as a side effect.
    """
    # Label the rows with the estimator that was actually evaluated instead
    # of a hard-coded name.
    algorithm_name = type(model).__name__

    # Fix the label set once so the confusion matrix keeps the same shape
    # even if a fold happens to miss one of the classes.
    class_labels = np.unique(np.asarray(y))

    kfold = KFold(n_splits=5, shuffle=True, random_state=random_state)
    for train_idx, test_idx in kfold.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Normalise the data; the scaler only sees the training split.
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # perf_counter is monotonic, so the durations cannot be distorted by
        # system clock adjustments.
        start_training = time.perf_counter()
        model.fit(X_train, y_train)
        end_training = time.perf_counter()

        y_pred = model.predict(X_test)
        end_evaluation = time.perf_counter()

        training_duration = end_training - start_training
        evaluation_duration = end_evaluation - end_training

        confusion = confusion_matrix(y_test, y_pred, labels=class_labels)
        tn, fp, fn, tp = confusion.ravel()
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

        # Both rates share the same denominator; guard it once so a fold
        # without negative samples yields 0 for both of them.
        negatives = tn + fp
        specificity = tn / negatives if negatives > 0 else 0
        false_alarm_rate = fp / negatives if negatives > 0 else 0

        results.append([
            algorithm_name, accuracy, balanced_accuracy, precision, recall,
            specificity, f1, false_alarm_rate, tn, fp, fn, tp,
            training_duration, evaluation_duration,
        ])

In [14]:
#Modelo: BernoulliNB - Parâmetros: {'alpha': 0.1, 'binarize': 0.1}  

In [17]:
# Repeat the cross-validated evaluation 10 times. A fresh BernoulliNB is
# built for every repetition and startTrain appends its per-fold metrics to
# the module-level `results` list.
for i in range(1,11):
    model = BernoulliNB(alpha=0.1, binarize=0.1)
    startTrain(X,y, model=model)
    # Progress indicator: number of the repetition that just finished.
    print(i)

1
2
3
4
5
6
7
8
9
10


In [17]:
# Build the metrics table, one row per evaluated fold. `results` is a list of
# rows, so the orientation is stated explicitly instead of being inferred by
# polars (inference emits an orientation warning).
metrics_df = pl.DataFrame(
    results,
    schema=['Algorithm', 'Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'Specificity', 'F1-score', 'False Alarm Rate', 'tn', 'fp', 'fn', 'tp', 'training_duration', 'evaluation_duration'],
    orient="row",
)
metrics_df

  return dispatch(args[0].__class__)(*args, **kw)


Algorithm,Accuracy,Balanced Accuracy,Precision,Recall,Specificity,F1-score,False Alarm Rate,tn,fp,fn,tp,training_duration,evaluation_duration
str,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,f64,f64
"""GaussianNB""",0.979096,0.964557,0.989112,0.985958,0.943157,0.987532,0.056843,25187,1518,1964,137901,0.06544,0.031098
"""GaussianNB""",0.977187,0.960148,0.987573,0.985229,0.935068,0.986399,0.064932,24971,1734,2066,137799,0.059975,0.027201
"""GaussianNB""",0.97885,0.964002,0.988919,0.985858,0.942146,0.987386,0.057854,25160,1545,1978,137887,0.064905,0.034008
"""GaussianNB""",0.977097,0.960292,0.987662,0.985028,0.935555,0.986344,0.064445,24984,1721,2094,137771,0.069125,0.028981
"""GaussianNB""",0.977391,0.960391,0.987632,0.985415,0.935368,0.986522,0.064632,24979,1726,2040,137825,0.06211,0.030894
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""GaussianNB""",0.978466,0.963561,0.988816,0.9855,0.941621,0.987155,0.058379,25146,1559,2028,137837,0.061552,0.028136
"""GaussianNB""",0.977571,0.961635,0.988159,0.985093,0.938176,0.986624,0.061824,25054,1651,2085,137780,0.057944,0.031239
"""GaussianNB""",0.839191,0.499726,0.839603,0.999414,0.000037,0.912565,0.999963,1,26704,82,139783,0.06308,0.03183
"""GaussianNB""",0.978267,0.963261,0.988729,0.98535,0.941172,0.987037,0.058828,25134,1571,2049,137816,0.058682,0.028952


In [18]:
# Persist the metrics as a ';'-separated CSV. The output folder is created
# first so the export does not fail when the directory is missing.
metrics_path = Path("metrics_results/unbalanced_bayes_metrics_output.csv")
metrics_path.parent.mkdir(parents=True, exist_ok=True)
metrics_df.write_csv(metrics_path, separator=';')