In [1]:
import polars as pl
import numpy as np
from datetime import datetime
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score

In [2]:
# Load the full dataset from the parquet file in the working directory.
df_polars = pl.read_parquet(source='dataset.parquet')

In [3]:
# Keep a 1% random sample of the rows; the fixed seed makes the draw repeatable.
# NOTE: the result is assigned back to `df_polars`, so re-running this cell on
# its own samples 1% of the already-sampled frame.
df_polars = df_polars.sample(
    fraction=0.01,
    seed=42,
)

In [4]:
# Replace nulls with 0 in the `duration`, `orig_bytes` and `resp_bytes` columns.
# A single multi-column expression applies the same fill to each named column.
df_polars = df_polars.with_columns(
    pl.col('duration', 'orig_bytes', 'resp_bytes').fill_null(0)
)

In [5]:
# Remove the columns that are not used as model features.
# NOTE(review): these look like connection identifiers/metadata plus a stored
# index column -- confirm against the dataset schema.
df_polars = df_polars.drop([
    "ts",
    "uid",
    "id.orig_h",
    "id.resp_h",
    "local_orig",
    "local_resp",
    "missed_bytes",
    "tunnel_parents",
    "detailed-label",
    "__index_level_0__",
])

In [6]:
# Target: the `label` column as a Series. Features: every remaining column.
y = df_polars.get_column('label')
X = df_polars.drop('label')

In [7]:
# Learn per-feature min/max on the feature frame, then overwrite X with the
# scaled values.
# NOTE(review): the scaler is fit on all rows before the train/test split, so
# test-set statistics influence the scaling -- consider fitting on the training
# split only.
scaler = MinMaxScaler().fit(X)
X = scaler.transform(X)

In [8]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Treinamento

In [9]:
# Train an entropy-based decision tree (depth capped at 10) and evaluate it on
# the held-out split.
# random_state is fixed so that repeated runs build the same tree: scikit-learn
# permutes features at each split even with splitter='best', so ties between
# equally good splits can otherwise be broken differently from run to run.
dt = DecisionTreeClassifier(criterion='entropy', max_depth=10,
                            min_samples_split=2, splitter='best',
                            random_state=42)

# The timestamps printed before and after fit() bracket the training time.
print(datetime.now())
dt.fit(X_train, y_train)
print("Treino Concluído")
print(datetime.now())

y_pred = dt.predict(X_test)
print("Predição Concluída")

# Unpacking into four counts only works for a 2x2 (binary) confusion matrix.
confusion = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = confusion.ravel()

accuracy = accuracy_score(y_test, y_pred)
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Both rates share the denominator tn + fp (all actual negatives); guard both
# the same way so an empty negative class yields 0 instead of a division by zero.
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
false_alarm_rate = fp / (fp + tn) if (fp + tn) > 0 else 0

# One row of results; the order must match the schema used to build the
# metrics table.
results = [[accuracy, balanced_accuracy, precision, recall, specificity, f1, false_alarm_rate, tn, fp, fn, tp]]

2024-09-16 10:33:11.253981
Treino Concluído
2024-09-16 10:33:11.880123
Predição Concluída


In [10]:
# Build a one-row table of the evaluation metrics.
# `results` is a list of rows, so the orientation is stated explicitly rather
# than left for polars to infer (polars warns when it has to infer row
# orientation from nested lists).
metrics_df = pl.DataFrame(
    results,
    schema=['Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'Specificity', 'F1-score', 'False Alarm Rate', 'tn', 'fp', 'fn', 'tp'],
    orient="row",
)
metrics_df

  return dispatch(args[0].__class__)(*args, **kw)


Accuracy,Balanced Accuracy,Precision,Recall,Specificity,F1-score,False Alarm Rate,tn,fp,fn,tp
f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64
0.996506,0.997662,0.999878,0.99596,0.999363,0.997915,0.000637,26688,17,565,139300
