In [1]:
import polars as pl
import numpy as np
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import KFold
from imblearn.under_sampling import RandomUnderSampler

In [2]:
df_polars_raiz = pl.read_parquet('../dataset.parquet')

In [3]:
df_polars = df_polars_raiz.sample(fraction=0.01, seed=42)

In [4]:
import ipaddress

def ip_to_int(ip: str) -> int:
    try:
        return int(ipaddress.ip_address(ip))  # Funciona tanto para IPv4 quanto IPv6
    except ValueError:
        return None

In [5]:
#df_polars = df_polars.with_columns([
#    pl.col('id.resp_h').map_elements(ip_to_int).alias('id.resp_h'),
#    pl.col('id.orig_h').map_elements(ip_to_int).alias('id.orig_h')
#])

In [6]:
df_polars = df_polars.with_columns([
    pl.col('duration').fill_null(0),
    pl.col('orig_bytes').fill_null(0),
    pl.col('resp_bytes').fill_null(0)
])

In [7]:
lista_colunas = df_polars.columns
colunas_para_spearman = ['id.resp_p', 'history', 'conn_state', 'id.orig_p', 'orig_ip_bytes', 'label']             
#['detailed-label', 'id.resp_p', 'history', 'id.orig_h', 'conn_state', 'id.orig_p', 'orig_ip_bytes']
colunas_para_dropar = [col for col in lista_colunas if col not in colunas_para_spearman]
df_polars = df_polars.drop(colunas_para_dropar)

In [8]:
df_polars = df_polars.drop_nulls()

In [9]:
X = df_polars.drop('label')
y = df_polars['label']       

In [10]:
#X = X.to_numpy()
#X[:, 0] = np.array([ip_to_int(ip) for ip in X[:, 0]])

# Treinamento

In [11]:
results = []

In [12]:
def startTrain(X, y, criterion, bootstrap, max_depth, min_samples_split, n_estimators):
    kfold = KFold(n_splits=5, shuffle=True)
    results_fold = []

    for train_idx, test_idx in kfold.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        rus = RandomUnderSampler()
        X_train_resampled, y_train_resampled = rus.fit_resample(X_train_scaled, y_train)

        rf = RandomForestClassifier(
            criterion=criterion, bootstrap=bootstrap, max_depth=max_depth,
            min_samples_split=min_samples_split, n_estimators=n_estimators
        )

        start_training = time.time()
        rf.fit(X_train_resampled, y_train_resampled)
        end_training = time.time()

        y_pred = rf.predict(X_test_scaled)
        training_duration = end_training - start_training
        evaluation_duration = time.time() - end_training

        confusion = confusion_matrix(y_test, y_pred)
        tn, fp, fn, tp = confusion.ravel()
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        specificity = tn / (tn + fp)
        f1 = f1_score(y_test, y_pred)
        balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
        false_alarm_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
        

        avaliacao = [accuracy, balanced_accuracy, precision, recall, specificity, f1, false_alarm_rate, tn, fp, fn, tp, training_duration, evaluation_duration]
        #print(avaliacao)
        results_fold.append(avaliacao)

    results_fold_array = np.array(results_fold, dtype=np.float32)
    mean_results = np.mean(results_fold_array, axis=0)
    results.append(["rf"] + mean_results.tolist())

In [13]:
#Melhores Hiperparâmetros encontrados: {'criterion': 'entropy', 'bootstrap': False, 'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}

In [14]:
for i in range(1,11):
    startTrain(X, y, criterion='entropy',bootstrap=False,max_depth=10, min_samples_split=2, n_estimators=200)
    print(i)

1
2
3
4
5
6
7
8
9
10


In [15]:
metrics_df = pl.DataFrame(
    results,
    schema=['Algorithm', 'Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'Specificity', 'F1-score', 'False Alarm Rate', 'tn', 'fp', 'fn', 'tp', 'training_duration', 'evaluation_duration']
)
metrics_df

  return dispatch(args[0].__class__)(*args, **kw)


Algorithm,Accuracy,Balanced Accuracy,Precision,Recall,Specificity,F1-score,False Alarm Rate,tn,fp,fn,tp,training_duration,evaluation_duration
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""rf""",0.994981,0.996811,0.999905,0.994116,0.999506,0.997002,0.000494,17794.400391,8.8,548.599976,92694.796875,15.203085,0.653919
"""rf""",0.994963,0.996796,0.999903,0.994097,0.999495,0.996991,0.000505,17794.199219,9.0,550.400024,92693.0,14.741933,0.636688
"""rf""",0.994984,0.996795,0.999896,0.994129,0.999461,0.997004,0.000539,17793.599609,9.6,547.400024,92696.0,14.651149,0.634556
"""rf""",0.994975,0.996763,0.999883,0.994132,0.999394,0.996999,0.000606,17792.400391,10.8,547.200012,92696.203125,15.131909,0.664929
"""rf""",0.994939,0.996773,0.999899,0.994074,0.999472,0.996978,0.000528,17793.800781,9.4,552.599976,92690.796875,15.043367,0.643924
"""rf""",0.994993,0.996846,0.999918,0.994119,0.999573,0.99701,0.000427,17795.599609,7.6,548.400024,92695.0,15.080119,0.650719
"""rf""",0.994971,0.996819,0.999911,0.994099,0.999539,0.996997,0.000461,17795.0,8.2,550.200012,92693.203125,15.364093,0.644312
"""rf""",0.994998,0.996844,0.999916,0.994127,0.999562,0.997013,0.000438,17795.400391,7.8,547.599976,92695.796875,14.956586,0.640092
"""rf""",0.994984,0.996818,0.999907,0.994119,0.999517,0.997005,0.000483,17794.599609,8.6,548.400024,92695.0,15.044266,0.651638
"""rf""",0.994964,0.996811,0.999909,0.994093,0.999529,0.996992,0.000471,17794.800781,8.4,550.799988,92692.601562,17.935316,0.702553


In [16]:
metrics_df.write_csv("metrics_results/balanced_rf_metrics_output.csv", separator=';')