In [2]:
#Hyperparameter tuning
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, f1_score

df = pd.read_csv("../../data/data_processed/data_processed.csv")  
X = df.drop(columns=["NSP"])  
y = df["NSP"]  

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Định nghĩa mô hình và lưới tham số
svm = SVC(kernel="rbf", class_weight="balanced")
param_grid = {
    "C": [0.1, 1, 10, 100],
    "gamma": [0.01, 0.1, 1, 10]
}
scorer = make_scorer(f1_score, average="macro")

# Nested Cross-validation: tìm bộ tham số tốt nhất
inner_cv = KFold(n_splits=4, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

clf = GridSearchCV(estimator=svm, param_grid=param_grid, cv=inner_cv, scoring=scorer)
nested_scores = cross_val_score(clf, X_scaled, y, cv=outer_cv, scoring=scorer)

# Fit lại trên toàn bộ dữ liệu với tham số tốt nhất
clf.fit(X_scaled, y)

print("F1_macro (mean ± std) từ nested CV:", nested_scores.mean(), "±", nested_scores.std())
print("Best params từ nested CV:", clf.best_params_)

F1_macro (mean ± std) từ nested CV: 0.8676637253080092 ± 0.015054140065058981
Best params từ nested CV: {'C': 100, 'gamma': 0.1}


## Dữ liệu gốc 

In [7]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

ratios = [(0.8, 0.2), (0.7, 0.3), (0.6, 0.4)]

for train_ratio, test_ratio in ratios:
    print(f"\nTrain:Test = {int(train_ratio*10)}:{int(test_ratio*10)}")

    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=test_ratio, random_state=42, stratify=y)

    model = SVC(kernel="rbf", class_weight="balanced", decision_function_shape='ovr', C=100, gamma=0.1, random_state=42)
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    print("Classification Report (Train)")
    print(classification_report(y_train, y_train_pred, digits=4))

    print("Classification Report (Test)")
    print(classification_report(y_test, y_test_pred, digits=4))


Train:Test = 8:2
Classification Report (Train)
              precision    recall  f1-score   support

         1.0     1.0000    0.9909    0.9954      1323
         2.0     0.9476    0.9958    0.9711       236
         3.0     0.9929    0.9929    0.9929       141

    accuracy                         0.9918      1700
   macro avg     0.9802    0.9932    0.9865      1700
weighted avg     0.9921    0.9918    0.9919      1700

Classification Report (Test)
              precision    recall  f1-score   support

         1.0     0.9496    0.9639    0.9567       332
         2.0     0.7273    0.6780    0.7018        59
         3.0     0.8529    0.8286    0.8406        35

    accuracy                         0.9131       426
   macro avg     0.8433    0.8235    0.8330       426
weighted avg     0.9108    0.9131    0.9118       426


Train:Test = 7:3
Classification Report (Train)
              precision    recall  f1-score   support

         1.0     1.0000    0.9896    0.9948      1158
    

## Dữ liệu giảm chiều PCA 

In [8]:
from sklearn.decomposition import PCA

ratios = [(0.8, 0.2), (0.7, 0.3), (0.6, 0.4)]

for train_ratio, test_ratio in ratios:
    print(f"Train:Test = {int(train_ratio*10)}:{int(test_ratio*10)}")

    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=test_ratio, random_state=42, stratify=y)

    # Giảm chiều với PCA, chỉ fit trên tập train
    pca = PCA(n_components=12, random_state=42)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    # Huấn luyện mô hình SVM trên dữ liệu đã giảm chiều
    model = SVC(kernel="rbf", class_weight="balanced", decision_function_shape='ovr', C=100, gamma=0.1, random_state=42)
    model.fit(X_train_pca, y_train)

    y_train_pred = model.predict(X_train_pca)
    y_test_pred = model.predict(X_test_pca)

    print("Classification Report (Train)")
    print(classification_report(y_train, y_train_pred, digits=4))

    print("Classification Report (Test)")
    print(classification_report(y_test, y_test_pred, digits=4))


Train:Test = 8:2
Classification Report (Train)
              precision    recall  f1-score   support

         1.0     1.0000    0.9834    0.9916      1323
         2.0     0.9105    0.9915    0.9493       236
         3.0     0.9859    0.9929    0.9894       141

    accuracy                         0.9853      1700
   macro avg     0.9655    0.9893    0.9768      1700
weighted avg     0.9864    0.9853    0.9856      1700

Classification Report (Test)
              precision    recall  f1-score   support

         1.0     0.9486    0.9458    0.9472       332
         2.0     0.6897    0.6780    0.6838        59
         3.0     0.7568    0.8000    0.7778        35

    accuracy                         0.8967       426
   macro avg     0.7984    0.8079    0.8029       426
weighted avg     0.8970    0.8967    0.8968       426

Train:Test = 7:3
Classification Report (Train)
              precision    recall  f1-score   support

         1.0     1.0000    0.9810    0.9904      1158
      

## Dữ liệu giảm chiều LDA 

In [9]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

ratios = [(0.8, 0.2), (0.7, 0.3), (0.6, 0.4)]

for train_ratio, test_ratio in ratios:
    print(f"Train:Test = {int(train_ratio*10)}:{int(test_ratio*10)}")

    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=test_ratio, random_state=42, stratify=y)

    # Giảm chiều bằng LDA, chỉ fit trên tập train
    lda = LDA(n_components=2)  
    X_train_lda = lda.fit_transform(X_train, y_train)
    X_test_lda = lda.transform(X_test)

    model = SVC(kernel="rbf", class_weight="balanced", decision_function_shape='ovr', C=100, gamma=0.1, random_state=42)
    model.fit(X_train_lda, y_train)

    y_train_pred = model.predict(X_train_lda)
    y_test_pred = model.predict(X_test_lda)

    print("Classification Report (Train)")
    print(classification_report(y_train, y_train_pred, digits=4))

    print("Classification Report (Test)")
    print(classification_report(y_test, y_test_pred, digits=4))

Train:Test = 8:2
Classification Report (Train)
              precision    recall  f1-score   support

         1.0     0.9936    0.8216    0.8995      1323
         2.0     0.4966    0.9195    0.6449       236
         3.0     0.7041    0.8440    0.7677       141

    accuracy                         0.8371      1700
   macro avg     0.7314    0.8617    0.7707      1700
weighted avg     0.9006    0.8371    0.8532      1700

Classification Report (Test)
              precision    recall  f1-score   support

         1.0     0.9893    0.8373    0.9070       332
         2.0     0.4902    0.8475    0.6211        59
         3.0     0.6512    0.8000    0.7179        35

    accuracy                         0.8357       426
   macro avg     0.7102    0.8283    0.7487       426
weighted avg     0.8924    0.8357    0.8519       426

Train:Test = 7:3
Classification Report (Train)
              precision    recall  f1-score   support

         1.0     0.9948    0.8307    0.9054      1158
      

## Nhận xét:
