In [1]:
import polars as pl
import numpy as np
import time
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import KFold
from imblearn.under_sampling import RandomUnderSampler

In [2]:
df_polars_raiz = pl.read_parquet('../dataset.parquet')

In [3]:
df_polars = df_polars_raiz.sample(fraction=0.01, seed=42)

In [4]:
import ipaddress

def ip_to_int(ip: str) -> int:
    try:
        return int(ipaddress.ip_address(ip))  # Funciona tanto para IPv4 quanto IPv6
    except ValueError:
        return None


In [5]:
#df_polars = df_polars.with_columns([
#    pl.col('id.resp_h').map_elements(ip_to_int).alias('id.resp_h'),
#    pl.col('id.orig_h').map_elements(ip_to_int).alias('id.orig_h')
#])

In [6]:
df_polars = df_polars.with_columns([
    pl.col('duration').fill_null(0),
    pl.col('orig_bytes').fill_null(0),
    pl.col('resp_bytes').fill_null(0)
])

In [7]:
lista_colunas = df_polars.columns
colunas_para_spearman = ['id.resp_p', 'history', 'conn_state', 'id.orig_p', 'orig_ip_bytes', 'label']             
#['detailed-label', 'id.resp_p', 'history', 'id.orig_h', 'conn_state', 'id.orig_p', 'orig_ip_bytes']
colunas_para_dropar = [col for col in lista_colunas if col not in colunas_para_spearman]
df_polars = df_polars.drop(colunas_para_dropar)

In [8]:
df_polars = df_polars.drop_nulls()

In [9]:
X = df_polars.drop('label')
y = df_polars['label']       

In [10]:
#X = X.to_numpy()
#X[:, 0] = np.array([ip_to_int(ip) for ip in X[:, 0]])

# Treinamento

In [11]:
results = []

In [12]:
def startTrain(X, y, kernel, C, gamma, tol, max_iter):    
    kfold = KFold(n_splits=5, shuffle=True)
    results_fold = []

    for train_idx, test_idx in kfold.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        #rus = RandomUnderSampler()
        #X_train_resampled, y_train_resampled = rus.fit_resample(X_train_scaled, y_train)

        svm = SVC(kernel=kernel, C=C, gamma=gamma, tol=tol, max_iter=max_iter)
        
        start_training = time.time()
        svm.fit(X_train_scaled, y_train)
        end_training = time.time()

        y_pred = svm.predict(X_test_scaled)
        training_duration = end_training - start_training
        evaluation_duration = time.time() - end_training

        confusion = confusion_matrix(y_test, y_pred)
        tn, fp, fn, tp = confusion.ravel()
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        specificity = tn / (tn + fp)
        f1 = f1_score(y_test, y_pred)
        balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
        false_alarm_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
        

        avaliacao = [accuracy, balanced_accuracy, precision, recall, specificity, f1, false_alarm_rate, tn, fp, fn, tp, training_duration, evaluation_duration]
        #print(avaliacao)
        results_fold.append(avaliacao)

    results_fold_array = np.array(results_fold, dtype=np.float32)
    mean_results = np.mean(results_fold_array, axis=0)
    results.append(["SVM"] + mean_results.tolist())

In [13]:
#Melhores Hiperparâmetros encontrados: kernel=rbf, C=100, gamma=scale, tol=0.001, max_iter=10000 -> Accuracy: 0.990904719620433


In [16]:
for i in range(1,11):
    startTrain(X=X, y=y, kernel='rbf', C=100, gamma='scale', tol=0.001, max_iter=10000)
    print(i)



1




2




3




4




5




6




7




8




9


In [17]:
metrics_df = pl.DataFrame(
    results,
    schema=['Algorithm', 'Accuracy', 'Balanced Accuracy' , 'Precision', 'Recall', 'Specificity', 'F1-score', 'False Alarm Rate', 'tn', 'fp', 'fn', 'tp', 'training_duration', 'evaluation_duration']
)
metrics_df

  return dispatch(args[0].__class__)(*args, **kw)


Algorithm,Accuracy,Balanced Accuracy,Precision,Recall,Specificity,F1-score,False Alarm Rate,tn,fp,fn,tp,training_duration,evaluation_duration
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""SVM""",0.990743,0.993233,0.999402,0.989567,0.9969,0.994458,0.0031,17748.0,55.200001,972.799988,92270.601562,201.951202,52.099487
"""SVM""",0.984565,0.970201,0.990632,0.991332,0.94907,0.990895,0.05093,16897.400391,905.799988,808.200012,92435.203125,183.745102,51.041237
"""SVM""",0.940459,0.907655,0.972214,0.955501,0.85981,0.963692,0.14019,15352.400391,2450.800049,4161.0,89082.398438,192.567413,52.821461
"""SVM""",0.922396,0.876831,0.960817,0.94426,0.809402,0.952374,0.190598,14374.0,3429.199951,5188.399902,88055.0,192.51474,51.627178
"""SVM""",0.988151,0.985658,0.996587,0.989307,0.982008,0.992927,0.017992,17484.400391,318.799988,997.0,92246.398438,203.35408,52.643818
"""SVM""",0.925415,0.873348,0.959766,0.950131,0.796566,0.954877,0.203434,14168.799805,3634.399902,4648.0,88595.398438,206.336868,53.250683
"""SVM""",0.933046,0.940008,0.98754,0.929766,0.95025,0.955173,0.04975,16919.199219,884.0,6551.0,86692.398438,208.069489,53.518993
"""SVM""",0.929821,0.923009,0.984562,0.932836,0.913181,0.953048,0.086819,16262.200195,1541.0,6252.200195,86991.203125,195.284302,51.213276
"""SVM""",0.911837,0.915739,0.977552,0.910001,0.921477,0.9375,0.078523,16410.0,1393.199951,8397.0,84846.398438,192.870728,52.427856
"""SVM""",0.991739,0.993868,0.999422,0.990734,0.997002,0.995059,0.002998,17749.800781,53.400002,864.0,92379.398438,203.011078,52.98539


In [18]:
metrics_df.write_csv("metrics_results/unbalanced_svm_metrics_output.csv", separator=';')