In [1]:
import polars as pl
import numpy as np
import time
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import KFold
from imblearn.under_sampling import RandomUnderSampler

In [2]:
df_polars_raiz = pl.read_parquet('../dataset.parquet')

In [3]:
df_polars = df_polars_raiz.sample(fraction=0.01, seed=42)

In [4]:
import ipaddress

def ip_to_int(ip: str) -> int:
    try:
        return int(ipaddress.ip_address(ip))  # Funciona tanto para IPv4 quanto IPv6
    except ValueError:
        return None

In [5]:
#df_polars = df_polars.with_columns([
#    pl.col('id.resp_h').map_elements(ip_to_int).alias('id.resp_h'),
#    pl.col('id.orig_h').map_elements(ip_to_int).alias('id.orig_h')
#])

In [6]:
df_polars = df_polars.with_columns([
    pl.col('duration').fill_null(0),
    pl.col('orig_bytes').fill_null(0),
    pl.col('resp_bytes').fill_null(0)
])

In [7]:
lista_colunas = df_polars.columns
colunas_para_spearman = ['id.resp_p', 'history', 'conn_state', 'id.orig_p', 'orig_ip_bytes', 'label']             
#['detailed-label', 'id.resp_p', 'history', 'id.orig_h', 'conn_state', 'id.orig_p', 'orig_ip_bytes']
colunas_para_dropar = [col for col in lista_colunas if col not in colunas_para_spearman]
df_polars = df_polars.drop(colunas_para_dropar)

In [8]:
df_polars = df_polars.drop_nulls()

In [9]:
X = df_polars.drop('label')
y = df_polars['label']       

In [10]:
#X = X.to_numpy()
#X[:, 0] = np.array([ip_to_int(ip) for ip in X[:, 0]])

# Treinamento

In [11]:
results = []

In [12]:
def startTrain(X, y, penalty, C, solver):
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    results_fold = []

    for train_idx, test_idx in kfold.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # 🔹 Normalização MinMaxScaler
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Aplicando o undersampling no conjunto de treino
        rus = RandomUnderSampler(random_state=42)
        X_train_resampled, y_train_resampled = rus.fit_resample(X_train_scaled, y_train)

        start_training = time.time()

        # Inicializa o modelo de Regressão Logística
        lr = LogisticRegression(penalty=penalty, C=C, solver=solver)
        
        # Treina o modelo
        lr.fit(X_train_resampled, y_train_resampled)
        end_training = time.time()

        # Predição
        y_pred = lr.predict(X_test_scaled)
        evaluation_time = time.time()

        accuracy = accuracy_score(y_test, y_pred)
        results_fold.append(accuracy)

    return np.mean(results_fold)

In [13]:
def grid_search(X, y):
    param_grid = {
        'penalty': ['l1', 'l2', None],
        'C': [0.01, 0.1, 1, 10],
        'solver': ['lbfgs', 'liblinear', 'saga', 'newton-cg']
    }

    best_accuracy = 0
    best_params = {}

    for penalty in param_grid['penalty']:
        for C in param_grid['C']:
            for solver in param_grid['solver']:
                if penalty == 'l1' and solver not in ['liblinear', 'saga']:
                    continue  # L1 só funciona com 'liblinear' e 'saga'
                if penalty == 'elasticnet' and solver != 'saga':
                    continue  # ElasticNet só funciona com 'saga'
                if penalty is None and solver in ['liblinear']:
                    continue  # 'liblinear' requer penalização
                
                accuracy = startTrain(X, y, penalty=penalty, C=C, solver=solver)
                
                print(f"penalty={penalty}, C={C}, solver={solver} -> Accuracy: {accuracy}")

                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params = {'penalty': penalty, 'C': C, 'solver': solver}

    print("\nMelhores Hiperparâmetros encontrados:", best_params)
    print(f"Melhor Acurácia Média: {best_accuracy}")
    return best_params


In [14]:
inicio = time.time() 
best_params = grid_search(X,y)
fim = time.time()

penalty=l1, C=0.01, solver=liblinear -> Accuracy: 0.9746466775879508
penalty=l1, C=0.01, solver=saga -> Accuracy: 0.9777480776157315
penalty=l1, C=0.1, solver=liblinear -> Accuracy: 0.9811430502930405
penalty=l1, C=0.1, solver=saga -> Accuracy: 0.9811646627472168
penalty=l1, C=1, solver=liblinear -> Accuracy: 0.9808584850939315
penalty=l1, C=1, solver=saga -> Accuracy: 0.9808602861493501
penalty=l1, C=10, solver=liblinear -> Accuracy: 0.9809125168213612




penalty=l1, C=10, solver=saga -> Accuracy: 0.9808692914913179
penalty=l2, C=0.01, solver=lbfgs -> Accuracy: 0.9030118906858812
penalty=l2, C=0.01, solver=liblinear -> Accuracy: 0.9029740688464705
penalty=l2, C=0.01, solver=saga -> Accuracy: 0.9030335033184652
penalty=l2, C=0.01, solver=newton-cg -> Accuracy: 0.9030154928778125
penalty=l2, C=0.1, solver=lbfgs -> Accuracy: 0.9548387779023031
penalty=l2, C=0.1, solver=liblinear -> Accuracy: 0.95480635919671
penalty=l2, C=0.1, solver=saga -> Accuracy: 0.9548315738914743
penalty=l2, C=0.1, solver=newton-cg -> Accuracy: 0.9548009560791112
penalty=l2, C=1, solver=lbfgs -> Accuracy: 0.9811934794230673
penalty=l2, C=1, solver=liblinear -> Accuracy: 0.9811934794230673
penalty=l2, C=1, solver=saga -> Accuracy: 0.9811934794230673
penalty=l2, C=1, solver=newton-cg -> Accuracy: 0.981198882556885
penalty=l2, C=10, solver=lbfgs -> Accuracy: 0.980903511284767
penalty=l2, C=10, solver=liblinear -> Accuracy: 0.9809089144510226
penalty=l2, C=10, solver=sa



penalty=None, C=0.01, solver=lbfgs -> Accuracy: 0.9808476787614208




penalty=None, C=0.01, solver=saga -> Accuracy: 0.9808746946251355




penalty=None, C=0.01, solver=newton-cg -> Accuracy: 0.9808512809209142




penalty=None, C=0.1, solver=lbfgs -> Accuracy: 0.9808476787614208




penalty=None, C=0.1, solver=saga -> Accuracy: 0.9808746946251355




penalty=None, C=0.1, solver=newton-cg -> Accuracy: 0.9808512809209142
penalty=None, C=1, solver=lbfgs -> Accuracy: 0.9808476787614208




penalty=None, C=1, solver=saga -> Accuracy: 0.9808746946251355
penalty=None, C=1, solver=newton-cg -> Accuracy: 0.9808512809209142




penalty=None, C=10, solver=lbfgs -> Accuracy: 0.9808476787614208




penalty=None, C=10, solver=saga -> Accuracy: 0.9808746946251355




penalty=None, C=10, solver=newton-cg -> Accuracy: 0.9808512809209142

Melhores Hiperparâmetros encontrados: {'penalty': 'l2', 'C': 1, 'solver': 'newton-cg'}
Melhor Acurácia Média: 0.981198882556885




In [15]:
duracao = fim - inicio
print(duracao)

111.15208268165588


In [16]:
print("\nMelhores Hiperparâmetros encontrados:", best_params)


Melhores Hiperparâmetros encontrados: {'penalty': 'l2', 'C': 1, 'solver': 'newton-cg'}
