In [1]:
import polars as pl
import numpy as np
import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import KFold


In [2]:
df_polars_raiz = pl.read_parquet('../dataset.parquet')

In [3]:
df_polars = df_polars_raiz.sample(fraction=0.01, seed=42)

In [4]:
import ipaddress

def ip_to_int(ip: str) -> int:
    try:
        return int(ipaddress.ip_address(ip))  # Funciona tanto para IPv4 quanto IPv6
    except ValueError:
        return None

In [5]:
#df_polars = df_polars.with_columns([
#    pl.col('id.resp_h').map_elements(ip_to_int).alias('id.resp_h'),
#    pl.col('id.orig_h').map_elements(ip_to_int).alias('id.orig_h')
#])

In [6]:
df_polars = df_polars.with_columns([
    pl.col('duration').fill_null(0),
    pl.col('orig_bytes').fill_null(0),
    pl.col('resp_bytes').fill_null(0)
])

In [7]:
lista_colunas = df_polars.columns
colunas_para_spearman = ['id.resp_p', 'history', 'conn_state', 'id.orig_p', 'orig_ip_bytes', 'label']             
#['detailed-label', 'id.resp_p', 'history', 'id.orig_h', 'conn_state', 'id.orig_p', 'orig_ip_bytes']
colunas_para_dropar = [col for col in lista_colunas if col not in colunas_para_spearman]
df_polars = df_polars.drop(colunas_para_dropar)

In [8]:
df_polars = df_polars.drop_nulls()

In [9]:
X = df_polars.drop('label')
y = df_polars['label']       

In [10]:
#X = X.to_numpy()
#X[:, 0] = np.array([ip_to_int(ip) for ip in X[:, 0]])

# Treinamento

In [11]:
results = []

In [12]:
def startTrain(X, y, criterion, splitter, max_depth, min_samples_split, min_samples_leaf):
    kfold = KFold(n_splits=5, shuffle=True)

    for train_idx, test_idx in kfold.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        start_training = time.time()
        
        dt = DecisionTreeClassifier(
            criterion=criterion, splitter=splitter, max_depth=max_depth, 
            min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
        
        dt.fit(X_train, y_train)
        end_training = time.time()

        y_pred = dt.predict(X_test)
        evaluation_time = time.time()


        confusion = confusion_matrix(y_test, y_pred)
        tn, fp, fn, tp = confusion.ravel()
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        specificity = tn / (tn + fp)
        f1 = f1_score(y_test, y_pred)
        balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
        false_alarm_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
        training_duration = end_training - start_training
        evaluation_duration = time.time() - end_training

    results.append(['DT', accuracy, balanced_accuracy, precision, recall, specificity, f1, false_alarm_rate, tn, fp, fn, tp, training_duration, evaluation_duration])

In [13]:
#Melhores Hiperparâmetros encontrados: {'criterion': 'entropy', 'splitter': 'best', 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2}


In [14]:
for i in range(1,11):
    startTrain(X, y, criterion = 'entropy', splitter= 'best', max_depth = 10, min_samples_split = 2, min_samples_leaf = 2)
    print(i)

1
2
3
4
5
6
7
8
9
10


In [15]:
metrics_df = pl.DataFrame(
    results,
    schema=['Algorithm', 'Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'Specificity', 'F1-score', 'False Alarm Rate', 'tn', 'fp', 'fn', 'tp', 'training_duration', 'evaluation_duration']
)
metrics_df

  return dispatch(args[0].__class__)(*args, **kw)


Algorithm,Accuracy,Balanced Accuracy,Precision,Recall,Specificity,F1-score,False Alarm Rate,tn,fp,fn,tp,training_duration,evaluation_duration
str,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,f64,f64
"""DT""",0.994822,0.996715,0.999903,0.993939,0.99949,0.996912,0.00051,17651,9,566,92820,0.457974,0.087582
"""DT""",0.994858,0.996779,0.999924,0.99395,0.999607,0.996928,0.000393,17816,7,564,92659,0.446966,0.086669
"""DT""",0.995137,0.996989,0.999946,0.994257,0.99972,0.997094,0.00028,17879,5,535,92627,0.487329,0.089254
"""DT""",0.995335,0.997086,0.999935,0.994509,0.999663,0.997215,0.000337,17799,6,512,92729,0.507766,0.089171
"""DT""",0.994813,0.996706,0.999903,0.993918,0.999495,0.996901,0.000505,17813,9,567,92657,0.524621,0.097403
"""DT""",0.995272,0.996868,0.999849,0.994531,0.999205,0.997183,0.000795,17601,14,511,92920,0.856321,0.096952
"""DT""",0.994903,0.996714,0.999881,0.994043,0.999385,0.996954,0.000615,17863,11,555,92617,0.514591,0.090169
"""DT""",0.995092,0.996781,0.99986,0.994289,0.999274,0.997066,0.000726,17887,13,532,92614,0.49883,0.083034
"""DT""",0.994867,0.996833,0.999946,0.99395,0.999717,0.996939,0.000283,17660,5,565,92816,0.486189,0.085214
"""DT""",0.994903,0.996738,0.999892,0.994039,0.999437,0.996957,0.000563,17766,10,556,92714,0.454981,0.079743


In [16]:
metrics_df.write_csv("metrics_results/unbalanced_dt_metrics_output.csv", separator=';')