In [1]:
import polars as pl
import numpy as np
import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import KFold


In [2]:
df_polars_raiz = pl.read_parquet('../dataset.parquet')

In [3]:
df_polars = df_polars_raiz.sample(fraction=0.01, seed=42)

In [4]:
import ipaddress

def ip_to_int(ip: str) -> int:
    try:
        return int(ipaddress.ip_address(ip))  # Funciona tanto para IPv4 quanto IPv6
    except ValueError:
        return None

In [5]:
#df_polars = df_polars.with_columns([
#    pl.col('id.resp_h').map_elements(ip_to_int).alias('id.resp_h'),
#    pl.col('id.orig_h').map_elements(ip_to_int).alias('id.orig_h')
#])

In [6]:
df_polars = df_polars.with_columns([
    pl.col('duration').fill_null(0),
    pl.col('orig_bytes').fill_null(0),
    pl.col('resp_bytes').fill_null(0)
])

In [7]:
lista_colunas = df_polars.columns
colunas_para_spearman = ['id.resp_p', 'history', 'conn_state', 'id.orig_p', 'orig_ip_bytes', 'label']             
#['detailed-label', 'id.resp_p', 'history', 'id.orig_h', 'conn_state', 'id.orig_p', 'orig_ip_bytes']
colunas_para_dropar = [col for col in lista_colunas if col not in colunas_para_spearman]
df_polars = df_polars.drop(colunas_para_dropar)

In [8]:
df_polars

id.orig_p,id.resp_p,conn_state,history,orig_ip_bytes,label
i32,i32,i64,i64,i64,i32
5526,37215,0,0,40,1
60403,23,2,7,40,1
13386,81,2,7,40,1
36097,37215,0,0,40,1
36097,37215,0,0,40,1
…,…,…,…,…,…
30535,8081,1,1,80,1
36097,37215,0,0,40,1
41258,23,1,1,120,1
36658,23,1,1,120,1


In [9]:
df_polars = df_polars.drop_nulls()

In [10]:
X = df_polars.drop('label')
y = df_polars['label']       

In [11]:
#X = X.to_numpy()
#X[:, 0] = np.array([ip_to_int(ip) for ip in X[:, 0]])

# Treinamento

In [12]:
results = []

In [None]:
def startTrain(X, y, n_neighbors, metric, weights):
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    results_fold = []

    for train_idx, test_idx in kfold.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        knn = KNeighborsClassifier(n_neighbors=n_neighbors, metric=metric, weights=weights)

        start_training = time.time() 
        knn.fit(X_train_scaled, y_train)
        end_training = time.time()

        y_pred = knn.predict(X_test_scaled)
        evaluation_time = time.time()

        accuracy = accuracy_score(y_test, y_pred)
        results_fold.append(accuracy)

    return np.mean(results_fold)

In [None]:
def grid_search():
    param_grid = {
        'n_neighbors': [3, 5, 7, 9, 11],
        'metric': ['euclidean', 'manhattan', 'minkowski'],
        'weights': ['uniform', 'distance']
    }

    best_accuracy = 0
    best_params = {}

    for n_neighbors in param_grid['n_neighbors']:
        for metric in param_grid['metric']:
            for weights in param_grid['weights']:
                
                accuracy = startTrain(X, y, n_neighbors=n_neighbors, metric=metric, weights=weights)
                
                print(f"n_neighbors={n_neighbors}, metric={metric}, weights={weights} -> Accuracy: {accuracy}")

                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params = {
                        'n_neighbors': n_neighbors,
                        'metric': metric,
                        'weights': weights
                    }

    print("\nMelhores Hiperparâmetros encontrados:", best_params)
    print(f"Melhor Acurácia Média: {best_accuracy}")
    return best_params

In [15]:
inicio = time.time()
best_params = grid_search()
fim = time.time()

n_neighbors=3, metric=euclidean, weights=uniform -> Accuracy: 0.9918754819385729
n_neighbors=3, metric=euclidean, weights=distance -> Accuracy: 0.9919403195443854
n_neighbors=3, metric=manhattan, weights=uniform -> Accuracy: 0.9920447803694044
n_neighbors=3, metric=manhattan, weights=distance -> Accuracy: 0.9920015551691117
n_neighbors=3, metric=minkowski, weights=uniform -> Accuracy: 0.9918754819385729
n_neighbors=3, metric=minkowski, weights=distance -> Accuracy: 0.9919403195443854
n_neighbors=5, metric=euclidean, weights=uniform -> Accuracy: 0.9921672517323892
n_neighbors=5, metric=euclidean, weights=distance -> Accuracy: 0.9919223093307966
n_neighbors=5, metric=manhattan, weights=uniform -> Accuracy: 0.9922104767218366
n_neighbors=5, metric=manhattan, weights=distance -> Accuracy: 0.9919601312513018
n_neighbors=5, metric=minkowski, weights=uniform -> Accuracy: 0.9921672517323892
n_neighbors=5, metric=minkowski, weights=distance -> Accuracy: 0.9919223093307966
n_neighbors=7, metric=

In [16]:
duracao = fim - inicio
print(duracao)

20498.069157600403


In [17]:
print("\nMelhores Hiperparâmetros encontrados:", best_params)


Melhores Hiperparâmetros encontrados: {'n_neighbors': 5, 'metric': 'manhattan', 'weights': 'uniform'}
