In [1]:
import time
from pathlib import Path
from typing import Optional

import numpy as np
import polars as pl
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the dataset from a Parquet file given by a path relative to the working directory.
df_polars = pl.read_parquet('dataset.parquet')

In [3]:
# Keep a 1% random sample of the rows; the fixed seed makes the draw repeatable.
# NOTE(review): this rebinds df_polars, so re-running the cell alone samples the
# already-sampled frame again. Re-run from the load cell to get the same subset.
df_polars = df_polars.sample(fraction=0.01, seed=42)

In [4]:
import ipaddress

def ip_to_int(ip: str) -> Optional[int]:
    """Convert a textual IP address to its integer value.

    Args:
        ip: IPv4 or IPv6 address in string form.

    Returns:
        The address as an integer, or ``None`` when ``ip`` cannot be parsed
        as an IP address.
    """
    try:
        # ip_address() accepts both IPv4 and IPv6 notation.
        parsed = ipaddress.ip_address(ip)
    except ValueError:
        # Unparseable input is reported as a missing value instead of raising.
        return None
    return int(parsed)

In [5]:
#df_polars = df_polars.with_columns([
#    pl.col('id.resp_h').map_elements(ip_to_int).alias('id.resp_h'),
#    pl.col('id.orig_h').map_elements(ip_to_int).alias('id.orig_h')
#])

In [6]:
# Replace missing values with 0 in the three numeric traffic columns.
# NOTE(review): none of these columns is listed in colunas_para_spearman, so the
# column-selection cell further down drops them again - confirm this step is still needed.
df_polars = df_polars.with_columns(
    [
        pl.col(nome_coluna).fill_null(0)
        for nome_coluna in ('duration', 'orig_bytes', 'resp_bytes')
    ]
)

In [7]:
# Columns to keep: the model features plus the target column 'label'.
colunas_para_spearman = [
    'id.resp_p',
    'history',
    'conn_state',
    'id.orig_p',
    'orig_ip_bytes',
    'label',
]
# Alternative column list, currently disabled:
# ['detailed-label', 'id.resp_p', 'history', 'id.orig_h', 'conn_state', 'id.orig_p', 'orig_ip_bytes']

# Every column that is not in the keep-list is dropped; drop() leaves the
# surviving columns in their existing order.
lista_colunas = df_polars.columns
colunas_para_dropar = list(
    filter(lambda nome: nome not in colunas_para_spearman, lista_colunas)
)
df_polars = df_polars.drop(colunas_para_dropar)

In [8]:
# Inspect the frame after the column selection (rendered as the cell output).
df_polars

id.orig_h,id.orig_p,id.resp_p,conn_state,history,orig_ip_bytes,label
str,i32,i32,i64,i64,i64,i32
"""192.168.100.108""",5526,37215,0,0,40,1
"""192.168.100.111""",60403,23,2,7,40,1
"""192.168.100.111""",13386,81,2,7,40,1
"""192.168.1.198""",36097,37215,0,0,40,1
"""192.168.1.198""",36097,37215,0,0,40,1
…,…,…,…,…,…,…
"""192.168.1.193""",30535,8081,1,1,80,1
"""192.168.1.198""",36097,37215,0,0,40,1
"""192.168.1.200""",41258,23,1,1,120,1
"""192.168.1.200""",36658,23,1,1,120,1


In [9]:
# Discard every row that still contains a null in any remaining column.
df_polars = df_polars.drop_nulls()

In [10]:
# Target vector: the 'label' column as a Series.
y = df_polars.get_column('label')
# Feature matrix: every column except the target.
X = df_polars.drop('label')

In [11]:
#X = X.to_numpy()
#X[:, 0] = np.array([ip_to_int(ip) for ip in X[:, 0]])

# Treinamento

In [13]:
# Accumulator for the per-run metric rows; startTrain() appends one list per call.
# Re-run this cell before repeating the experiment, otherwise rows from earlier
# runs remain in the list.
results = []

In [14]:
# Number of neighbours (k) used by the classifier.
i = 5
# Euclidean distance with an unweighted (uniform) vote among the k nearest points.
knn = KNeighborsClassifier(n_neighbors=i, weights='uniform', metric='euclidean')

In [15]:
def startTrain(random_state=None):
    """Run one train/evaluate cycle of the module-level ``knn`` and record its metrics.

    Draws a stratified 70/30 split of the module-level ``X`` and ``y``, fits
    ``knn`` on the training part, predicts the test part, and appends one row
    to the module-level ``results`` list.

    Args:
        random_state: Optional seed forwarded to ``train_test_split``. The
            default ``None`` keeps the previous behaviour of drawing a fresh
            random split on every call; pass an int for a reproducible split.

    Returns:
        None. The appended row contains, in order: algorithm name, accuracy,
        balanced accuracy, precision, recall, specificity, F1, false alarm
        rate, tn, fp, fn, tp, training duration and evaluation duration
        (both durations in seconds).
    """
    # NOTE(review): the features are used unscaled although MinMaxScaler is
    # imported in this notebook - confirm that is intended for a distance-based model.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=random_state
    )

    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # intervals cannot be distorted by system clock adjustments.
    start_training = time.perf_counter()
    knn.fit(X_train, y_train)
    end_training = time.perf_counter()
    y_pred = knn.predict(X_test)
    end_evaluation = time.perf_counter()

    training_duration = end_training - start_training
    evaluation_duration = end_evaluation - end_training

    # Unpacking into four names requires a 2x2 matrix, i.e. exactly two classes.
    confusion = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = confusion.ravel()

    accuracy = accuracy_score(y_test, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Specificity and false alarm rate share the denominator tn + fp, so both
    # use the same guard against a test set without negative samples.
    negatives = tn + fp
    specificity = tn / negatives if negatives > 0 else 0
    false_alarm_rate = fp / negatives if negatives > 0 else 0

    results.append(["KNN", accuracy, balanced_accuracy, precision, recall, specificity, f1, false_alarm_rate, tn, fp, fn, tp, training_duration, evaluation_duration])



In [16]:
# Repeat the train/evaluate cycle 30 times; every call to startTrain() draws a
# new split and appends one row to `results`.
# The loop variable is deliberately not called `i`: `i` holds the k that was
# used to build `knn`, and reusing it here would leave it at 30 after the loop
# even though the classifier keeps its original setting.
for run in range(1, 31):
    startTrain()
    print(run)  # progress indicator

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30


In [17]:
# Build one row per run from `results`. orient="row" states explicitly that each
# inner list is a row, so polars does not have to infer the orientation.
# The captured output shows a warning raised during construction, presumably
# the orientation-inference warning - TODO confirm it is gone after this change.
metrics_df = pl.DataFrame(
    results,
    schema=['Algorithm', 'Accuracy', 'Balanced Accuracy' , 'Precision', 'Recall', 'Specificity', 'F1-score', 'False Alarm Rate', 'tn', 'fp', 'fn', 'tp', 'training_duration', 'evaluation_duration'],
    orient="row",
)
metrics_df


  return dispatch(args[0].__class__)(*args, **kw)


Algorithm,Accuracy,Balanced Accuracy,Precision,Recall,Specificity,F1-score,False Alarm Rate,tn,fp,fn,tp,training_duration,evaluation_duration
str,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,f64,f64
"""KNN""",0.992292,0.994304,0.999474,0.991342,0.997266,0.995391,0.002734,26632,73,1211,138654,2.61611,419.741776
"""KNN""",0.991835,0.993987,0.999452,0.99082,0.997154,0.995117,0.002846,26629,76,1284,138581,1.979808,360.431304
"""KNN""",0.992057,0.994195,0.999488,0.991049,0.997341,0.99525,0.002659,26634,71,1252,138613,1.740984,388.153681
"""KNN""",0.99264,0.994587,0.99951,0.991721,0.997454,0.9956,0.002546,26637,68,1158,138707,1.705734,335.069819
"""KNN""",0.992015,0.99414,0.999474,0.991013,0.997266,0.995225,0.002734,26632,73,1257,138608,2.443218,314.853468
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""KNN""",0.991979,0.993952,0.999394,0.991049,0.996855,0.995204,0.003145,26621,84,1252,138613,1.188691,351.071561
"""KNN""",0.992111,0.994091,0.999423,0.991177,0.997004,0.995283,0.002996,26625,80,1234,138631,1.99171,359.256407
"""KNN""",0.991967,0.994369,0.999596,0.990834,0.997903,0.995196,0.002097,26649,56,1282,138583,1.489332,346.593317
"""KNN""",0.992418,0.994228,0.999402,0.991563,0.996892,0.995467,0.003108,26622,83,1180,138685,1.630551,361.444154


In [18]:
# Persist the per-run metrics as a semicolon-separated CSV file.
output_path = Path("metrics_results/unbalanced_knn_metrics_output.csv")
# Create the target directory first so the export also works on a fresh checkout
# where metrics_results/ does not exist yet.
output_path.parent.mkdir(parents=True, exist_ok=True)
metrics_df.write_csv(output_path, separator=';')