In [1]:
import polars as pl
import numpy as np
import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import KFold
from imblearn.under_sampling import RandomUnderSampler

In [2]:
df_polars_raiz = pl.read_parquet('../dataset.parquet')

In [3]:
df_polars = df_polars_raiz.sample(fraction=0.01, seed=42)

In [4]:
import ipaddress

def ip_to_int(ip: str) -> int:
    try:
        return int(ipaddress.ip_address(ip))  # Funciona tanto para IPv4 quanto IPv6
    except ValueError:
        return None

In [5]:
#df_polars = df_polars.with_columns([
#    pl.col('id.resp_h').map_elements(ip_to_int).alias('id.resp_h'),
#    pl.col('id.orig_h').map_elements(ip_to_int).alias('id.orig_h')
#])

In [6]:
df_polars = df_polars.with_columns([
    pl.col('duration').fill_null(0),
    pl.col('orig_bytes').fill_null(0),
    pl.col('resp_bytes').fill_null(0)
])

In [7]:
lista_colunas = df_polars.columns
colunas_para_spearman = ['id.resp_p', 'history', 'conn_state', 'id.orig_p', 'orig_ip_bytes', 'label']             
#['detailed-label', 'id.resp_p', 'history', 'id.orig_h', 'conn_state', 'id.orig_p', 'orig_ip_bytes']
colunas_para_dropar = [col for col in lista_colunas if col not in colunas_para_spearman]
df_polars = df_polars.drop(colunas_para_dropar)

In [8]:
df_polars

id.orig_p,id.resp_p,conn_state,history,orig_ip_bytes,label
i32,i32,i64,i64,i64,i32
5526,37215,0,0,40,1
60403,23,2,7,40,1
13386,81,2,7,40,1
36097,37215,0,0,40,1
36097,37215,0,0,40,1
…,…,…,…,…,…
30535,8081,1,1,80,1
36097,37215,0,0,40,1
41258,23,1,1,120,1
36658,23,1,1,120,1


In [9]:
df_polars = df_polars.drop_nulls()

In [10]:
X = df_polars.drop('label')
y = df_polars['label']       

In [11]:
#X = X.to_numpy()
#X[:, 0] = np.array([ip_to_int(ip) for ip in X[:, 0]])

# Treinamento

In [12]:
results = []

In [13]:
i=5
knn = KNeighborsClassifier(
n_neighbors=i,
metric='euclidean', 
weights='uniform'  
)

In [14]:
def startTrain(X, y, n_neighbors, metric, weights):
    kfold = KFold(n_splits=5, shuffle=True)
    results_fold = []

    for train_idx, test_idx in kfold.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Aplicando a normalização APENAS nos dados de treino
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Aplicando o undersampling no conjunto de treino
        rus = RandomUnderSampler()
        X_train_resampled, y_train_resampled = rus.fit_resample(X_train_scaled, y_train)

        # Criando e treinando o modelo KNN
        knn = KNeighborsClassifier(n_neighbors=n_neighbors, metric=metric, weights=weights)
        
        start_training = time.time() 
        knn.fit(X_train_resampled, y_train_resampled)
        end_training = time.time()

        # Predição e avaliação
        y_pred = knn.predict(X_test_scaled)
        evaluation_duration = time.time() - end_training
        training_duration = end_training - start_training

        confusion = confusion_matrix(y_test, y_pred)
        tn, fp, fn, tp = confusion.ravel()
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        specificity = tn / (tn + fp)
        f1 = f1_score(y_test, y_pred)
        balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
        false_alarm_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
        
        training_duration = end_training - start_training
        evaluation_duration = time.time() - end_training
        avaliacao = [accuracy, balanced_accuracy, precision, recall, specificity, f1, false_alarm_rate, tn, fp, fn, tp, training_duration, evaluation_duration]
        #print(avaliacao)
        results_fold.append(avaliacao)

    results_fold_array = np.array(results_fold, dtype=np.float32)
    mean_results = np.mean(results_fold_array, axis=0)
    results.append(["KNN"] + mean_results.tolist())

In [15]:
#Melhores Hiperparâmetros encontrados: {'n_neighbors': 3, 'metric': 'manhattan', 'weights': 'distance'}

In [16]:
for i in range(1,11):
    startTrain(X=X, y=y, n_neighbors=3, metric='manhattan', weights='distance')
    print(i)

1
2
3
4
5
6
7
8
9
10


In [17]:
metrics_df = pl.DataFrame(
    results,
    schema=['Algorithm', 'Accuracy', 'Balanced Accuracy' , 'Precision', 'Recall', 'Specificity', 'F1-score', 'False Alarm Rate', 'tn', 'fp', 'fn', 'tp', 'training_duration', 'evaluation_duration']
)
metrics_df


  return dispatch(args[0].__class__)(*args, **kw)


Algorithm,Accuracy,Balanced Accuracy,Precision,Recall,Specificity,F1-score,False Alarm Rate,tn,fp,fn,tp,training_duration,evaluation_duration
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""KNN""",0.992139,0.994415,0.99957,0.991064,0.997765,0.995299,0.002235,17763.400391,39.799999,833.200012,92410.203125,0.216747,9.834257
"""KNN""",0.992174,0.994499,0.9996,0.991077,0.997921,0.99532,0.002079,17766.199219,37.0,832.0,92411.398438,0.208641,9.635743
"""KNN""",0.99214,0.99442,0.999572,0.991064,0.997776,0.9953,0.002224,17763.599609,39.599998,833.200012,92410.203125,0.207726,9.326642
"""KNN""",0.992219,0.99449,0.999582,0.991148,0.997832,0.995347,0.002168,17764.599609,38.599998,825.400024,92418.0,0.211271,9.507453
"""KNN""",0.99209,0.994299,0.999528,0.991047,0.997551,0.99527,0.002449,17759.599609,43.599998,834.799988,92408.601562,0.214097,9.362147
"""KNN""",0.992146,0.994382,0.999552,0.99109,0.997674,0.995303,0.002326,17761.800781,41.400002,830.799988,92412.601562,0.216285,9.525965
"""KNN""",0.992158,0.994391,0.999552,0.991105,0.997676,0.995311,0.002324,17761.800781,41.400002,829.400024,92414.0,0.216354,9.342478
"""KNN""",0.992167,0.994522,0.999613,0.991056,0.997988,0.995316,0.002012,17767.400391,35.799999,834.0,92409.398438,0.204675,9.185834
"""KNN""",0.992108,0.994363,0.999554,0.991042,0.997684,0.99528,0.002316,17762.0,41.200001,835.200012,92408.203125,0.204985,9.137884
"""KNN""",0.992153,0.994437,0.999576,0.991075,0.997799,0.995307,0.002201,17764.0,39.200001,832.200012,92411.203125,0.202131,9.123503


In [18]:
metrics_df.write_csv(f"metrics_results/unbalanced_knn_metrics_output.csv", separator=';')