In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score,
    auc,
    average_precision_score,
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
    f1_score,
    make_scorer
)
from sklearn.model_selection import (
    cross_val_predict,
    cross_val_score,
    train_test_split,
    StratifiedKFold,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, RobustScaler, StandardScaler
from sklearn import svm
import optuna
import joblib

# --- Configuration ----------------------------------------------------------
# The data and model locations are derived from project_root so the three
# paths cannot drift apart when the checkout is moved. The resulting strings
# are identical to the previously hard-coded ones.
project_root = "D:/Projects/Summer-2024-ECE-597-Group8-main"
repo_dir = f"{project_root}/Summer-2024-ECE-597-Group8-main"
data_path = f"{repo_dir}/data/processed/features_tfidf_labels.csv"
# "{{}}" leaves a literal "{}" placeholder that is later filled via
# model_path_template.format(<trial number or 'best'>).
model_path_template = f"{repo_dir}/data/models/svm_model_trial_{{}}.pkl"
random_state = 42
test_size = 0.1

# --- Data loading -----------------------------------------------------------
df = pd.read_csv(data_path)
# js_code is removed here so it never reaches the feature matrix.
df = df.drop(columns=["js_code"])

# Create the model output directory only after the data has loaded (so a wrong
# project_root fails on the read above instead of creating stray folders).
# Without this, the first joblib.dump would raise only after a full
# cross-validation run had already been spent.
os.makedirs(os.path.dirname(model_path_template), exist_ok=True)

def log_transform(x):
    """Return ``log(1 + x)`` computed element-wise with ``np.log1p``.

    Defined as a named module-level function (rather than a lambda) and
    wrapped in a ``FunctionTransformer`` elsewhere in this file.
    """
    transformed = np.log1p(x)
    return transformed

# Columns routed through the log1p + RobustScaler sub-pipeline.
log_scaled_columns = [
    "Word_Count",
    "Homoglyphs",
    "Total_Abnormal_Count",
    "html_tags",
]

# Two-stage transform: apply log_transform first, then RobustScaler.
pipeline_log_transform_RobustScaler = Pipeline(
    [
        ("log_transform", FunctionTransformer(log_transform)),
        ("RobustScaler", RobustScaler()),
    ]
)

# Only the columns listed above are transformed; every remaining column is
# forwarded unchanged (remainder="passthrough").
features_preprocessor = ColumnTransformer(
    [
        (
            "log_transform_RobustScaler",
            pipeline_log_transform_RobustScaler,
            log_scaled_columns,
        ),
    ],
    remainder="passthrough",
)

# Separate the raw feature columns from the target before anything is fitted.
features = df.drop(columns=["Label"])
y = df["Label"].to_numpy()

# Split the *raw* rows first. Fitting the preprocessor on the full dataset
# (as was done before) lets RobustScaler's statistics depend on the held-out
# rows, which leaks test information into training.
# stratify=y keeps the label proportions the same in both partitions, in line
# with the StratifiedKFold used during hyperparameter tuning.
features_train, features_test, y_train, y_test = train_test_split(
    features,
    y,
    test_size=test_size,
    random_state=random_state,
    stratify=y,
)

# Learn the scaling from the training rows only, then reuse it for the test rows.
X_train = features_preprocessor.fit_transform(features_train)
X_test = features_preprocessor.transform(features_test)

# Full transformed matrix (using the train-fitted preprocessor), kept so that
# any later code referring to `X` continues to work.
X = features_preprocessor.transform(features)

def objective(trial):
    """Optuna objective: mean cross-validated F1 of an SVC pipeline.

    Samples SVC hyperparameters from ``trial``, scores a
    ``StandardScaler -> SVC`` pipeline with 3-fold stratified cross-validation
    on ``X_train``/``y_train`` (F1 of the label ``1``), then fits the pipeline
    on the full training split and saves it to
    ``model_path_template.format(trial.number)``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean F1 score over the cross-validation folds.
    """
    svc_c = trial.suggest_float('svc_c', 1e-10, 1e10, log=True)
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
    # degree is only sampled for the polynomial kernel; otherwise 3 is passed.
    degree = trial.suggest_int('degree', 2, 5) if kernel == 'poly' else 3
    # gamma is only sampled for kernels other than 'linear'; otherwise 'scale'.
    if kernel in ['poly', 'rbf', 'sigmoid']:
        gamma = trial.suggest_float('gamma', 1e-10, 1e1, log=True)
    else:
        gamma = 'scale'
    class_weight = trial.suggest_categorical('class_weight', [None, 'balanced'])

    classifier = svm.SVC(
        C=svc_c,
        kernel=kernel,
        degree=degree,
        gamma=gamma,
        class_weight=class_weight,
    )

    # The scaler sits inside the pipeline so it is re-fitted on each CV
    # training fold rather than on the validation fold.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('svm', classifier)
    ])

    f1_scorer = make_scorer(f1_score, pos_label=1)
    # Reuse the module-level seed instead of repeating the literal 42.
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)
    scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring=f1_scorer, n_jobs=-1)

    # cross_val_score fits *clones* of the estimator, so `pipeline` itself is
    # still unfitted at this point. Fit it on the whole training split before
    # saving; otherwise the saved file holds a model that cannot predict.
    # NOTE: this adds one extra fit per trial.
    pipeline.fit(X_train, y_train)

    # Save the fitted model after each trial
    model_path = model_path_template.format(trial.number)
    joblib.dump(pipeline, model_path)
    print(f"Model saved to {model_path}")

    return float(np.mean(scores))

# Run the hyperparameter search: 40 trials, 6 at a time, maximising the
# value returned by objective().
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=40, n_jobs=6)

# Report the winning trial.
print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")
print("  Params: ")
for param_name, param_value in trial.params.items():
    print(f"    {param_name}: {param_value}")

# Rebuild the classifier from the best hyperparameters. 'degree' and 'gamma'
# are only present when the winning kernel sampled them, so fall back to the
# same values objective() uses in that case.
best_params = study.best_params
best_svm = svm.SVC(
    C=best_params['svc_c'],
    kernel=best_params['kernel'],
    degree=best_params.get('degree', 3),
    gamma=best_params.get('gamma', 'scale'),
    class_weight=best_params['class_weight'],
)

pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('svm', best_svm),
    ]
)

# Fit the final model on the full training split.
pipeline.fit(X_train, y_train)

# Persist the final model under the 'best' name.
final_model_path = model_path_template.format('best')
joblib.dump(pipeline, final_model_path)
print(f"Best model saved to {final_model_path}")

# Score the final model on the held-out test split.
y_pred = pipeline.predict(X_test)
report = classification_report(
    y_test,
    y_pred,
    target_names=['Class 0', 'Class 1'],
)
print("Classification Report:\n")
print(report)


[I 2024-07-05 23:22:30,283] A new study created in memory with name: no-name-a04badbb-d6d4-4bbe-ac05-13bd7819a66f
[I 2024-07-05 23:24:07,983] Trial 1 finished with value: 0.6501213680000735 and parameters: {'svc_c': 5834987.2842162, 'kernel': 'sigmoid', 'gamma': 1.438641964596868e-09, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.6501213680000735.


Model saved to D:/Projects/Summer-2024-ECE-597-Group8-main/Summer-2024-ECE-597-Group8-main/data/models/svm_model_trial_1.pkl


[I 2024-07-05 23:25:08,194] Trial 3 finished with value: 0.0 and parameters: {'svc_c': 0.0003070893585264844, 'kernel': 'sigmoid', 'gamma': 0.006097959483018337, 'class_weight': None}. Best is trial 1 with value: 0.6501213680000735.


Model saved to D:/Projects/Summer-2024-ECE-597-Group8-main/Summer-2024-ECE-597-Group8-main/data/models/svm_model_trial_3.pkl


[I 2024-07-05 23:25:16,278] Trial 4 finished with value: 0.8486180331130496 and parameters: {'svc_c': 734001.0289799175, 'kernel': 'rbf', 'gamma': 5.2313492629576304e-05, 'class_weight': None}. Best is trial 4 with value: 0.8486180331130496.


Model saved to D:/Projects/Summer-2024-ECE-597-Group8-main/Summer-2024-ECE-597-Group8-main/data/models/svm_model_trial_4.pkl


[I 2024-07-05 23:28:57,274] Trial 0 finished with value: 0.055867530597552194 and parameters: {'svc_c': 1.8267186223263757e-08, 'kernel': 'sigmoid', 'gamma': 0.01164851685568258, 'class_weight': 'balanced'}. Best is trial 4 with value: 0.8486180331130496.


Model saved to D:/Projects/Summer-2024-ECE-597-Group8-main/Summer-2024-ECE-597-Group8-main/data/models/svm_model_trial_0.pkl


[I 2024-07-05 23:31:25,769] Trial 6 finished with value: 0.0 and parameters: {'svc_c': 7.928458057767441e-07, 'kernel': 'sigmoid', 'gamma': 7.579680501017338, 'class_weight': 'balanced'}. Best is trial 4 with value: 0.8486180331130496.


Model saved to D:/Projects/Summer-2024-ECE-597-Group8-main/Summer-2024-ECE-597-Group8-main/data/models/svm_model_trial_6.pkl


[I 2024-07-05 23:37:07,477] Trial 7 finished with value: 0.6502377532594443 and parameters: {'svc_c': 152629396.093487, 'kernel': 'poly', 'degree': 5, 'gamma': 0.00010774870819288945, 'class_weight': 'balanced'}. Best is trial 4 with value: 0.8486180331130496.


Model saved to D:/Projects/Summer-2024-ECE-597-Group8-main/Summer-2024-ECE-597-Group8-main/data/models/svm_model_trial_7.pkl


[I 2024-07-05 23:41:30,655] Trial 9 finished with value: 0.055867530597552194 and parameters: {'svc_c': 3.783561004770789e-08, 'kernel': 'linear', 'class_weight': 'balanced'}. Best is trial 4 with value: 0.8486180331130496.


Model saved to D:/Projects/Summer-2024-ECE-597-Group8-main/Summer-2024-ECE-597-Group8-main/data/models/svm_model_trial_9.pkl


[I 2024-07-05 23:45:52,682] Trial 10 finished with value: 0.0 and parameters: {'svc_c': 8.235291502166399e-10, 'kernel': 'linear', 'class_weight': 'balanced'}. Best is trial 4 with value: 0.8486180331130496.


Model saved to D:/Projects/Summer-2024-ECE-597-Group8-main/Summer-2024-ECE-597-Group8-main/data/models/svm_model_trial_10.pkl


[I 2024-07-05 23:47:12,275] Trial 11 finished with value: 0.7687179686393136 and parameters: {'svc_c': 0.4479701913269496, 'kernel': 'linear', 'class_weight': None}. Best is trial 4 with value: 0.8486180331130496.


Model saved to D:/Projects/Summer-2024-ECE-597-Group8-main/Summer-2024-ECE-597-Group8-main/data/models/svm_model_trial_11.pkl


[I 2024-07-05 23:47:51,156] Trial 12 finished with value: 0.0 and parameters: {'svc_c': 1.2710437412275493e-05, 'kernel': 'sigmoid', 'gamma': 3.159529306064764e-05, 'class_weight': None}. Best is trial 4 with value: 0.8486180331130496.


Model saved to D:/Projects/Summer-2024-ECE-597-Group8-main/Summer-2024-ECE-597-Group8-main/data/models/svm_model_trial_12.pkl


[I 2024-07-05 23:51:09,583] Trial 14 finished with value: 0.0 and parameters: {'svc_c': 2.7617064173545294, 'kernel': 'poly', 'degree': 5, 'gamma': 3.998794007115515e-06, 'class_weight': None}. Best is trial 4 with value: 0.8486180331130496.


Model saved to D:/Projects/Summer-2024-ECE-597-Group8-main/Summer-2024-ECE-597-Group8-main/data/models/svm_model_trial_14.pkl


[I 2024-07-05 23:52:55,062] Trial 15 finished with value: 0.5454866197822668 and parameters: {'svc_c': 7631.089167776772, 'kernel': 'rbf', 'gamma': 1.5220299443940393e-08, 'class_weight': None}. Best is trial 4 with value: 0.8486180331130496.


Model saved to D:/Projects/Summer-2024-ECE-597-Group8-main/Summer-2024-ECE-597-Group8-main/data/models/svm_model_trial_15.pkl


[I 2024-07-05 23:54:29,184] Trial 16 finished with value: 0.0 and parameters: {'svc_c': 263.74141875725695, 'kernel': 'rbf', 'gamma': 4.315253636245025e-09, 'class_weight': None}. Best is trial 4 with value: 0.8486180331130496.


Model saved to D:/Projects/Summer-2024-ECE-597-Group8-main/Summer-2024-ECE-597-Group8-main/data/models/svm_model_trial_16.pkl


[I 2024-07-05 23:55:31,336] Trial 17 finished with value: 0.0 and parameters: {'svc_c': 3.838285204366758, 'kernel': 'rbf', 'gamma': 4.1043885665132566e-07, 'class_weight': None}. Best is trial 4 with value: 0.8486180331130496.


Model saved to D:/Projects/Summer-2024-ECE-597-Group8-main/Summer-2024-ECE-597-Group8-main/data/models/svm_model_trial_17.pkl


[I 2024-07-05 23:56:57,568] Trial 18 finished with value: 0.0 and parameters: {'svc_c': 0.023874759457344998, 'kernel': 'rbf', 'gamma': 3.226247927255182e-07, 'class_weight': None}. Best is trial 4 with value: 0.8486180331130496.


Model saved to D:/Projects/Summer-2024-ECE-597-Group8-main/Summer-2024-ECE-597-Group8-main/data/models/svm_model_trial_18.pkl


[I 2024-07-05 23:57:20,099] Trial 19 finished with value: 0.7361514666113096 and parameters: {'svc_c': 0.005758431772260238, 'kernel': 'linear', 'class_weight': None}. Best is trial 4 with value: 0.8486180331130496.


Model saved to D:/Projects/Summer-2024-ECE-597-Group8-main/Summer-2024-ECE-597-Group8-main/data/models/svm_model_trial_19.pkl


[I 2024-07-06 00:01:07,737] Trial 2 finished with value: 0.6987932801143462 and parameters: {'svc_c': 3734811.66518877, 'kernel': 'sigmoid', 'gamma': 3.3637402007387133e-06, 'class_weight': 'balanced'}. Best is trial 4 with value: 0.8486180331130496.


Model saved to D:/Projects/Summer-2024-ECE-597-Group8-main/Summer-2024-ECE-597-Group8-main/data/models/svm_model_trial_2.pkl


[I 2024-07-06 01:25:19,442] Trial 5 finished with value: 0.5903489618477477 and parameters: {'svc_c': 0.10518955212511016, 'kernel': 'poly', 'degree': 5, 'gamma': 1.0393456944375061, 'class_weight': None}. Best is trial 4 with value: 0.8486180331130496.


Model saved to D:/Projects/Summer-2024-ECE-597-Group8-main/Summer-2024-ECE-597-Group8-main/data/models/svm_model_trial_5.pkl


[I 2024-07-06 01:27:52,844] Trial 8 finished with value: 0.4004465208956778 and parameters: {'svc_c': 7614649.981426936, 'kernel': 'rbf', 'gamma': 0.16763769603155165, 'class_weight': None}. Best is trial 4 with value: 0.8486180331130496.


Model saved to D:/Projects/Summer-2024-ECE-597-Group8-main/Summer-2024-ECE-597-Group8-main/data/models/svm_model_trial_8.pkl
