In [1]:
import polars as pl
import numpy as np
import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import KFold
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Load the complete dataset from a Parquet file (the path is relative to the working directory).
df_polars_raiz = pl.read_parquet('../dataset.parquet')

In [3]:
# Keep a 1% random sample of the rows, drawn with a fixed seed so the sample is repeatable.
# NOTE(review): presumably done to keep the grid search fast — confirm before reporting final numbers.
df_polars = df_polars_raiz.sample(fraction=0.01, seed=42)

In [4]:
import ipaddress
from typing import Optional

def ip_to_int(ip: str) -> Optional[int]:
    """Convert a textual IP address to its integer value.

    Parsing is delegated to ``ipaddress.ip_address``, so both IPv4 and IPv6
    notations are accepted.

    Args:
        ip: Address in textual form, e.g. ``"192.168.0.1"`` or ``"::1"``.

    Returns:
        The address as an integer, or ``None`` when ``ip`` cannot be parsed
        as an IP address.
    """
    try:
        return int(ipaddress.ip_address(ip))
    except ValueError:
        # Unparseable input is reported as None instead of propagating the error.
        return None

In [5]:
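# Disabled: would convert the 'id.resp_h' and 'id.orig_h' columns to integers with ip_to_int.
# Neither column is in the list of columns kept further below, so the conversion is not needed.
#df_polars = df_polars.with_columns([
#    pl.col('id.resp_h').map_elements(ip_to_int).alias('id.resp_h'),
#    pl.col('id.orig_h').map_elements(ip_to_int).alias('id.orig_h')
#])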

In [6]:
# Replace nulls with 0 in the duration and byte-counter columns.
# NOTE(review): none of these three columns is in the list of columns kept by the
# selection cell below, so this fill does not currently reach the model input.
df_polars = df_polars.with_columns(
    [
        pl.col(nome).fill_null(0)
        for nome in ('duration', 'orig_bytes', 'resp_bytes')
    ]
)

In [7]:
# Columns that survive the selection; 'label' is later split off as the target.
# NOTE(review): the name suggests the list came from a Spearman correlation analysis — confirm.
colunas_para_spearman = [
    'id.resp_p',
    'history',
    'conn_state',
    'id.orig_p',
    'orig_ip_bytes',
    'label',
]
# Presumably an earlier candidate selection, kept for reference:
#['detailed-label', 'id.resp_p', 'history', 'id.orig_h', 'conn_state', 'id.orig_p', 'orig_ip_bytes']

# Everything that is not in the keep-list is dropped from the frame.
lista_colunas = df_polars.columns
colunas_para_dropar = list(
    filter(lambda nome: nome not in colunas_para_spearman, lista_colunas)
)
df_polars = df_polars.drop(colunas_para_dropar)

In [8]:
# Remove every row that still contains a null in any of the remaining columns.
df_polars = df_polars.drop_nulls()

In [9]:
# Separate the target column from the feature columns.
y = df_polars.get_column('label')
X = df_polars.drop('label')

In [10]:
# Disabled: NumPy-based variant that converted the first feature column with ip_to_int.
#X = X.to_numpy()
#X[:, 0] = np.array([ip_to_int(ip) for ip in X[:, 0]])

# Treinamento

In [11]:
# Empty accumulator. It is not referenced by any cell visible here —
# presumably meant to collect evaluation results later (TODO confirm or remove).
results = []

In [12]:
def startTrain(X, y, criterion='gini', splitter='best', max_depth=None, min_samples_split=2,
               min_samples_leaf=1, random_state=42):
    """Estimate the accuracy of a decision tree with shuffled 5-fold cross-validation.

    For every fold:
      1. a MinMaxScaler is fitted on the training part only and applied to both parts;
      2. the scaled training part is balanced with RandomUnderSampler;
      3. a DecisionTreeClassifier is fitted on the balanced data;
      4. accuracy is measured on the (not resampled) test part.

    Args:
        X: Feature table. Must support row selection with an integer index
            array (``X[idx]``), as produced by ``KFold.split``.
        y: Labels aligned with ``X`` and indexable the same way.
        criterion: Forwarded to ``DecisionTreeClassifier``.
        splitter: Forwarded to ``DecisionTreeClassifier``.
        max_depth: Forwarded to ``DecisionTreeClassifier``.
        min_samples_split: Forwarded to ``DecisionTreeClassifier``.
        min_samples_leaf: Forwarded to ``DecisionTreeClassifier``.
        random_state: Seed used for the fold shuffling, the under-sampler and
            the tree itself. The default (42) is the seed that was previously
            hard-coded for the folds and the under-sampler.

    Returns:
        Mean accuracy over the five folds.
    """
    # NOTE(review): plain accuracy is used although the training data is re-balanced;
    # confirm that it is the intended selection metric for this label distribution.
    kfold = KFold(n_splits=5, shuffle=True, random_state=random_state)
    fold_accuracies = []

    for train_idx, test_idx in kfold.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Fit the scaler on the training part only so no test statistics leak into training.
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Only the training part is under-sampled; the test part keeps its original distribution.
        rus = RandomUnderSampler(random_state=random_state)
        X_train_resampled, y_train_resampled = rus.fit_resample(X_train_scaled, y_train)

        # Seeding the tree makes the score reproducible; without it the feature
        # permutation (and the random splitter) change from run to run.
        dt = DecisionTreeClassifier(
            criterion=criterion, splitter=splitter, max_depth=max_depth,
            min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
            random_state=random_state)

        dt.fit(X_train_resampled, y_train_resampled)
        y_pred = dt.predict(X_test_scaled)

        fold_accuracies.append(accuracy_score(y_test, y_pred))

    return np.mean(fold_accuracies)

In [13]:
from itertools import product

def grid_search(X, y, param_grid=None):
    """Exhaustively evaluate hyperparameter combinations with ``startTrain``.

    Every combination of the values in ``param_grid`` is scored by calling
    ``startTrain(X, y, **combination)``; each score is printed and the
    combination with the highest score is returned.

    Args:
        X: Feature table, passed through to ``startTrain``.
        y: Labels, passed through to ``startTrain``.
        param_grid: Optional mapping from ``startTrain`` keyword names to the
            list of candidate values for that keyword. When omitted, the
            default decision-tree grid defined below is used.

    Returns:
        Dict with the best-scoring combination (keys follow the order of
        ``param_grid``). Empty only if no score compares greater than
        negative infinity (e.g. every score is NaN).
    """
    if param_grid is None:
        param_grid = {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
            'max_depth': [None, 5, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 5]
        }

    param_names = list(param_grid)

    # Start below any reachable score so that a combination is selected even
    # when every accuracy is 0.
    best_accuracy = float('-inf')
    best_params = {}

    # Generate every combination of hyperparameter values.
    for values in product(*(param_grid[name] for name in param_names)):
        params = dict(zip(param_names, values))
        accuracy = startTrain(X, y, **params)

        description = ", ".join(f"{name}={value}" for name, value in params.items())
        print(f"{description} -> Accuracy: {accuracy}")

        # Keep the combination if it beats the best score seen so far
        # (ties keep the earlier combination).
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = params

    print("\nMelhores Hiperparâmetros encontrados:", best_params)
    print(f"Melhor Acurácia Média: {best_accuracy}")
    return best_params


In [14]:
# Time the full grid search. perf_counter is a monotonic clock, so the measured
# interval is not affected by system clock adjustments; only the difference
# between the two readings is meaningful.
inicio = time.perf_counter()
best_params = grid_search(X, y)
fim = time.perf_counter()

criterion=gini, splitter=best, max_depth=None, min_samples_split=2, min_samples_leaf=1 -> Accuracy: 0.9945374282394581
criterion=gini, splitter=best, max_depth=None, min_samples_split=2, min_samples_leaf=2 -> Accuracy: 0.9947193335392155
criterion=gini, splitter=best, max_depth=None, min_samples_split=2, min_samples_leaf=5 -> Accuracy: 0.9946761086470813
criterion=gini, splitter=best, max_depth=None, min_samples_split=5, min_samples_leaf=1 -> Accuracy: 0.9945068105244081
criterion=gini, splitter=best, max_depth=None, min_samples_split=5, min_samples_leaf=2 -> Accuracy: 0.9946544961442483
criterion=gini, splitter=best, max_depth=None, min_samples_split=5, min_samples_leaf=5 -> Accuracy: 0.9946779096538432
criterion=gini, splitter=best, max_depth=None, min_samples_split=10, min_samples_leaf=1 -> Accuracy: 0.9944635854376476
criterion=gini, splitter=best, max_depth=None, min_samples_split=10, min_samples_leaf=2 -> Accuracy: 0.9946743075430062
criterion=gini, splitter=best, max_depth=None,

In [15]:
# Elapsed time, in seconds, between the two readings taken around the grid search.
duracao = fim - inicio
print(duracao)

167.86991381645203


In [16]:
# Show the best hyperparameter combination returned by grid_search again.
print("\nMelhores Hiperparâmetros encontrados:", best_params)


Melhores Hiperparâmetros encontrados: {'criterion': 'entropy', 'splitter': 'best', 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1}
