In [1]:
import polars as pl
import numpy as np
import time
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import KFold
from imblearn.under_sampling import RandomUnderSampler

In [2]:
df_polars_raiz = pl.read_parquet('../dataset.parquet')

In [3]:
df_polars = df_polars_raiz.sample(fraction=0.01, seed=42)

In [4]:
import ipaddress

def ip_to_int(ip: str) -> int:
    try:
        return int(ipaddress.ip_address(ip))  # Funciona tanto para IPv4 quanto IPv6
    except ValueError:
        return None


In [5]:
#df_polars = df_polars.with_columns([
#    pl.col('id.resp_h').map_elements(ip_to_int).alias('id.resp_h'),
#    pl.col('id.orig_h').map_elements(ip_to_int).alias('id.orig_h')
#])

In [6]:
df_polars = df_polars.with_columns([
    pl.col('duration').fill_null(0),
    pl.col('orig_bytes').fill_null(0),
    pl.col('resp_bytes').fill_null(0)
])

In [7]:
lista_colunas = df_polars.columns
colunas_para_spearman = ['id.resp_p', 'history', 'conn_state', 'id.orig_p', 'orig_ip_bytes', 'label']             
#['detailed-label', 'id.resp_p', 'history', 'id.orig_h', 'conn_state', 'id.orig_p', 'orig_ip_bytes']
colunas_para_dropar = [col for col in lista_colunas if col not in colunas_para_spearman]
df_polars = df_polars.drop(colunas_para_dropar)

In [8]:
df_polars = df_polars.drop_nulls()

In [9]:
X = df_polars.drop('label')
y = df_polars['label']       

In [10]:
#X = X.to_numpy()
#X[:, 0] = np.array([ip_to_int(ip) for ip in X[:, 0]])

# Treinamento

In [11]:
results = []

In [12]:
def startTrain(X, y, kernel='rbf', C=1, gamma='scale', tol=0.001, max_iter=5000):    
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    results_fold = []

    for train_idx, test_idx in kfold.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        rus = RandomUnderSampler(random_state=42)
        X_train_resampled, y_train_resampled = rus.fit_resample(X_train_scaled, y_train)

        svm = SVC(kernel=kernel, C=C, gamma=gamma, tol=tol, max_iter=max_iter)

        svm.fit(X_train_resampled, y_train_resampled)
        y_pred = svm.predict(X_test_scaled)

        accuracy = accuracy_score(y_test, y_pred)
        results_fold.append(accuracy)

    return np.mean(results_fold)

In [13]:
def grid_search(X, y):
    param_grid = {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.01, 0.1, 1],
        'tol': [0.001, 0.0001],
        'max_iter': [5000, 10000]
    }

    best_accuracy = 0
    best_params = {}

    for kernel in param_grid['kernel']:
        for C in param_grid['C']:
            for gamma in param_grid['gamma']:
                for tol in param_grid['tol']:
                    for max_iter in param_grid['max_iter']:
                        
                        accuracy = startTrain(X, y, kernel=kernel, C=C, gamma=gamma, tol=tol, max_iter=max_iter)

                        print(f"kernel={kernel}, C={C}, gamma={gamma}, tol={tol}, max_iter={max_iter} -> Accuracy: {accuracy}")

                        if accuracy > best_accuracy:
                            best_accuracy = accuracy
                            best_params = {
                                'kernel': kernel,
                                'C': C,
                                'gamma': gamma,
                                'tol': tol,
                                'max_iter': max_iter
                            }

    print("\nMelhores Hiperparâmetros encontrados:", best_params)
    print(f"Melhor Acurácia Média: {best_accuracy}")
    return best_params

In [15]:
inicio = time.time()
best_params = grid_search(X,y)
fim = time.time()



kernel=rbf, C=0.1, gamma=scale, tol=0.001, max_iter=5000 -> Accuracy: 0.9854385445488587




kernel=rbf, C=0.1, gamma=scale, tol=0.001, max_iter=10000 -> Accuracy: 0.9854241361217297




kernel=rbf, C=0.1, gamma=scale, tol=0.0001, max_iter=5000 -> Accuracy: 0.9854385445488587




kernel=rbf, C=0.1, gamma=scale, tol=0.0001, max_iter=10000 -> Accuracy: 0.9854223350825302




kernel=rbf, C=0.1, gamma=auto, tol=0.001, max_iter=5000 -> Accuracy: 0.8014289671522328




kernel=rbf, C=0.1, gamma=auto, tol=0.001, max_iter=10000 -> Accuracy: 0.9815176666736249




kernel=rbf, C=0.1, gamma=auto, tol=0.0001, max_iter=5000 -> Accuracy: 0.8014289671522328




kernel=rbf, C=0.1, gamma=auto, tol=0.0001, max_iter=10000 -> Accuracy: 0.9815176666736249




kernel=rbf, C=0.1, gamma=0.01, tol=0.001, max_iter=5000 -> Accuracy: 0.39476760093888114




kernel=rbf, C=0.1, gamma=0.01, tol=0.001, max_iter=10000 -> Accuracy: 0.9849846813730464




kernel=rbf, C=0.1, gamma=0.01, tol=0.0001, max_iter=5000 -> Accuracy: 0.39476760093888114




kernel=rbf, C=0.1, gamma=0.01, tol=0.0001, max_iter=10000 -> Accuracy: 0.9849846813730464




kernel=rbf, C=0.1, gamma=0.1, tol=0.001, max_iter=5000 -> Accuracy: 0.3936437468828137




kernel=rbf, C=0.1, gamma=0.1, tol=0.001, max_iter=10000 -> Accuracy: 0.9849630687080246




kernel=rbf, C=0.1, gamma=0.1, tol=0.0001, max_iter=5000 -> Accuracy: 0.3936437468828137




kernel=rbf, C=0.1, gamma=0.1, tol=0.0001, max_iter=10000 -> Accuracy: 0.9849630687080246




kernel=rbf, C=0.1, gamma=1, tol=0.001, max_iter=5000 -> Accuracy: 0.9853376856724883




kernel=rbf, C=0.1, gamma=1, tol=0.001, max_iter=10000 -> Accuracy: 0.985433141187977




kernel=rbf, C=0.1, gamma=1, tol=0.0001, max_iter=5000 -> Accuracy: 0.9853376856724883




kernel=rbf, C=0.1, gamma=1, tol=0.0001, max_iter=10000 -> Accuracy: 0.9854799684504499




kernel=rbf, C=1, gamma=scale, tol=0.001, max_iter=5000 -> Accuracy: 0.986153559495279




kernel=rbf, C=1, gamma=scale, tol=0.001, max_iter=10000 -> Accuracy: 0.9861553605831354




kernel=rbf, C=1, gamma=scale, tol=0.0001, max_iter=5000 -> Accuracy: 0.986153559495279




kernel=rbf, C=1, gamma=scale, tol=0.0001, max_iter=10000 -> Accuracy: 0.9861553605831354




kernel=rbf, C=1, gamma=auto, tol=0.001, max_iter=5000 -> Accuracy: 0.9855303982941066




kernel=rbf, C=1, gamma=auto, tol=0.001, max_iter=10000 -> Accuracy: 0.9841652081641865




kernel=rbf, C=1, gamma=auto, tol=0.0001, max_iter=5000 -> Accuracy: 0.9855303982941066




kernel=rbf, C=1, gamma=auto, tol=0.0001, max_iter=10000 -> Accuracy: 0.9841634071087679




kernel=rbf, C=1, gamma=0.01, tol=0.001, max_iter=5000 -> Accuracy: 0.3951800399860383




kernel=rbf, C=1, gamma=0.01, tol=0.001, max_iter=10000 -> Accuracy: 0.9848892253061164




kernel=rbf, C=1, gamma=0.01, tol=0.0001, max_iter=5000 -> Accuracy: 0.3951800399860383




kernel=rbf, C=1, gamma=0.01, tol=0.0001, max_iter=10000 -> Accuracy: 0.9848892253061164




kernel=rbf, C=1, gamma=0.1, tol=0.001, max_iter=5000 -> Accuracy: 0.979401445111289




kernel=rbf, C=1, gamma=0.1, tol=0.001, max_iter=10000 -> Accuracy: 0.9816131231784642




kernel=rbf, C=1, gamma=0.1, tol=0.0001, max_iter=5000 -> Accuracy: 0.979401445111289




kernel=rbf, C=1, gamma=0.1, tol=0.0001, max_iter=10000 -> Accuracy: 0.9816131231784642




kernel=rbf, C=1, gamma=1, tol=0.001, max_iter=5000 -> Accuracy: 0.9854709630436067




kernel=rbf, C=1, gamma=1, tol=0.001, max_iter=10000 -> Accuracy: 0.9854079266067448




kernel=rbf, C=1, gamma=1, tol=0.0001, max_iter=5000 -> Accuracy: 0.9854709630436067




kernel=rbf, C=1, gamma=1, tol=0.0001, max_iter=10000 -> Accuracy: 0.9854079266067448




kernel=rbf, C=10, gamma=scale, tol=0.001, max_iter=5000 -> Accuracy: 0.9889433807308763




kernel=rbf, C=10, gamma=scale, tol=0.001, max_iter=10000 -> Accuracy: 0.9889848045513732




kernel=rbf, C=10, gamma=scale, tol=0.0001, max_iter=5000 -> Accuracy: 0.9889433807308763




kernel=rbf, C=10, gamma=scale, tol=0.0001, max_iter=10000 -> Accuracy: 0.9889866055905727




kernel=rbf, C=10, gamma=auto, tol=0.001, max_iter=5000 -> Accuracy: 0.9853268797454486




kernel=rbf, C=10, gamma=auto, tol=0.001, max_iter=10000 -> Accuracy: 0.9854673611760527




kernel=rbf, C=10, gamma=auto, tol=0.0001, max_iter=5000 -> Accuracy: 0.9853268797454486




kernel=rbf, C=10, gamma=auto, tol=0.0001, max_iter=10000 -> Accuracy: 0.9854673611760527




kernel=rbf, C=10, gamma=0.01, tol=0.001, max_iter=5000 -> Accuracy: 0.971154501407835




kernel=rbf, C=10, gamma=0.01, tol=0.001, max_iter=10000 -> Accuracy: 0.9820273581720997




kernel=rbf, C=10, gamma=0.01, tol=0.0001, max_iter=5000 -> Accuracy: 0.971154501407835




kernel=rbf, C=10, gamma=0.01, tol=0.0001, max_iter=10000 -> Accuracy: 0.9820273581720997




kernel=rbf, C=10, gamma=0.1, tol=0.001, max_iter=5000 -> Accuracy: 0.9669254940047457




kernel=rbf, C=10, gamma=0.1, tol=0.001, max_iter=10000 -> Accuracy: 0.9851179585819765




kernel=rbf, C=10, gamma=0.1, tol=0.0001, max_iter=5000 -> Accuracy: 0.9669254940047457




kernel=rbf, C=10, gamma=0.1, tol=0.0001, max_iter=10000 -> Accuracy: 0.9851179585819765




kernel=rbf, C=10, gamma=1, tol=0.001, max_iter=5000 -> Accuracy: 0.9861301464722498




kernel=rbf, C=10, gamma=1, tol=0.001, max_iter=10000 -> Accuracy: 0.9861463557115144




kernel=rbf, C=10, gamma=1, tol=0.0001, max_iter=5000 -> Accuracy: 0.9861301464722498




kernel=rbf, C=10, gamma=1, tol=0.0001, max_iter=10000 -> Accuracy: 0.9861463557115144




kernel=rbf, C=100, gamma=scale, tol=0.001, max_iter=5000 -> Accuracy: 0.9723450098249871




kernel=rbf, C=100, gamma=scale, tol=0.001, max_iter=10000 -> Accuracy: 0.990904719620433




kernel=rbf, C=100, gamma=scale, tol=0.0001, max_iter=5000 -> Accuracy: 0.9723450098249871




kernel=rbf, C=100, gamma=scale, tol=0.0001, max_iter=10000 -> Accuracy: 0.990904719620433




kernel=rbf, C=100, gamma=auto, tol=0.001, max_iter=5000 -> Accuracy: 0.9822182760474352




kernel=rbf, C=100, gamma=auto, tol=0.001, max_iter=10000 -> Accuracy: 0.98616616672102




kernel=rbf, C=100, gamma=auto, tol=0.0001, max_iter=5000 -> Accuracy: 0.9822182760474352




kernel=rbf, C=100, gamma=auto, tol=0.0001, max_iter=10000 -> Accuracy: 0.98616616672102




kernel=rbf, C=100, gamma=0.01, tol=0.001, max_iter=5000 -> Accuracy: 0.9842750695117871




kernel=rbf, C=100, gamma=0.01, tol=0.001, max_iter=10000 -> Accuracy: 0.9839148599040082




kernel=rbf, C=100, gamma=0.01, tol=0.0001, max_iter=5000 -> Accuracy: 0.9842750695117871




kernel=rbf, C=100, gamma=0.01, tol=0.0001, max_iter=10000 -> Accuracy: 0.9839148599040082




kernel=rbf, C=100, gamma=0.1, tol=0.001, max_iter=5000 -> Accuracy: 0.9778399349777847




kernel=rbf, C=100, gamma=0.1, tol=0.001, max_iter=10000 -> Accuracy: 0.9856168478188735




kernel=rbf, C=100, gamma=0.1, tol=0.0001, max_iter=5000 -> Accuracy: 0.9778399349777847




kernel=rbf, C=100, gamma=0.1, tol=0.0001, max_iter=10000 -> Accuracy: 0.9856168478188735




kernel=rbf, C=100, gamma=1, tol=0.001, max_iter=5000 -> Accuracy: 0.9896241750729045




kernel=rbf, C=100, gamma=1, tol=0.001, max_iter=10000 -> Accuracy: 0.9895701441888554




kernel=rbf, C=100, gamma=1, tol=0.0001, max_iter=5000 -> Accuracy: 0.9896241750729045




kernel=rbf, C=100, gamma=1, tol=0.0001, max_iter=10000 -> Accuracy: 0.9895701441888554

Melhores Hiperparâmetros encontrados: {'kernel': 'rbf', 'C': 100, 'gamma': 'scale', 'tol': 0.001, 'max_iter': 10000}
Melhor Acurácia Média: 0.990904719620433


In [16]:
duracao = fim - inicio
print(duracao)

36736.25600361824


In [17]:
print("\nMelhores Hiperparâmetros encontrados:", best_params)


Melhores Hiperparâmetros encontrados: {'kernel': 'rbf', 'C': 100, 'gamma': 'scale', 'tol': 0.001, 'max_iter': 10000}


In [18]:
for i in range(1,31):
    startTrain()
    print(i)

TypeError: startTrain() missing 2 required positional arguments: 'X' and 'y'

In [None]:
metrics_df = pl.DataFrame(
    results,
    schema=['Algorithm', 'Accuracy', 'Balanced Accuracy' , 'Precision', 'Recall', 'Specificity', 'F1-score', 'False Alarm Rate', 'tn', 'fp', 'fn', 'tp', 'training_duration', 'evaluation_duration']
)
metrics_df

In [None]:
metrics_df.write_csv("metrics_results/unbalanced_svm_metrics_output.csv", separator=';')