In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import os

In [23]:
# Load the preprocessed dataset into `df` (used by the cells below) and show
# the name of its first column as the cell output.
df = pd.read_csv(filepath_or_buffer="../../data/data_processed/data_processed.csv")
df.columns[0]

'LB'

In [47]:
def evaluate_model(X_train, X_test, y_train, y_test, kernel):
    """Fit an SVM on the training split and print train/test classification reports.

    Args:
        X_train: Training features passed to ``SVC.fit`` / ``SVC.predict``.
        X_test: Test features passed to ``SVC.predict``.
        y_train: Training labels.
        y_test: Test labels.
        kernel: Kernel name forwarded to ``SVC``.

    Returns:
        None. The two reports are written to stdout.
    """
    # Fixed hyperparameters shared by every experiment in this notebook.
    svm_settings = dict(
        kernel=kernel,
        class_weight="balanced",
        decision_function_shape='ovr',
        C=100,
        gamma=0.1,
        random_state=42,
    )
    classifier = SVC(**svm_settings)
    classifier.fit(X_train, y_train)

    # Report on the training split first, then on the held-out split.
    report_plan = (
        ("[Train] Classification Report:", X_train, y_train),
        ("[Test] Classification Report:", X_test, y_test),
    )
    for heading, features, labels in report_plan:
        print(heading)
        print(classification_report(labels, classifier.predict(features)))

# Split the original data by the given ratio, standardize it, and apply the SVM
def process_original_data(df, ratio, kernel):
    """Split, standardize and evaluate an SVM on the original (non-reduced) data.

    Args:
        df: DataFrame containing the feature columns and an 'NSP' label column.
        ratio: Fraction of rows used for training; ``1 - ratio`` is passed to
            ``train_test_split`` as ``test_size``.
        kernel: Kernel name forwarded to ``evaluate_model``.

    Returns:
        None. A header line and the reports from ``evaluate_model`` are printed.
    """
    X = df.drop('NSP', axis=1)
    y = df['NSP']

    # Stratified split with a fixed seed so every kernel sees the same partition.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=1 - ratio, stratify=y, random_state=42
    )

    # Fit the scaler on the training split only, then reuse it on the test split.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Format with ':g' rather than int(): (1 - 0.8) * 10 is 1.9999999999999996
    # in floating point, which int() truncates to 1 and mislabels 8:2 as "8:1".
    train_part = f"{ratio * 10:g}"
    test_part = f"{(1 - ratio) * 10:g}"

    print(f"\nMô hình SVM với dữ liệu gốc - Tỉ lệ = {train_part}:{test_part} --- Kernel = {kernel} ---")
    evaluate_model(X_train, X_test, y_train, y_test, kernel=kernel)

In [37]:
def process_presplit_data(train_path, test_path, kernel, target_col, method, split=None):
    """Evaluate an SVM on train/test CSV files that were split ahead of time.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the test CSV.
        kernel: Kernel name forwarded to ``evaluate_model``.
        target_col: Name of the label column. If it is not a column of the
            training file, 'NSP' is used instead, so callers that pass a
            placeholder name keep working.
        method: Name of the dimensionality-reduction method, used only in the
            printed header.
        split: Optional label for the train/test ratio shown in the header.
            When omitted it is derived from the row counts of the two files
            (for example 1700/426 rows gives "80_20"), so the function no
            longer depends on a global variable called ``split``.

    Returns:
        None. A header line and the reports from ``evaluate_model`` are printed.
    """
    # Read the data, discarding the first column of each file
    # (presumably a saved index column -- TODO confirm against the CSVs).
    train_data = pd.read_csv(train_path).iloc[:, 1:]
    test_data = pd.read_csv(test_path).iloc[:, 1:]

    # Honour target_col when it exists; otherwise fall back to the 'NSP' label.
    label_col = target_col if target_col in train_data.columns else 'NSP'

    # Separate features (X) and labels (y)
    X_train = train_data.drop(label_col, axis=1).values
    y_train = train_data[label_col].values

    X_test = test_data.drop(label_col, axis=1).values
    y_test = test_data[label_col].values

    if split is None:
        total_rows = len(train_data) + len(test_data)
        train_pct = round(100 * len(train_data) / total_rows) if total_rows else 0
        split = f"{train_pct}_{100 - train_pct}"

    print(f"\nMô hình SVM với dữ liệu giảm chiều bằng {method} tỉ lệ {split} --- Kernel = {kernel} ---")

    # Train and evaluate with the same SVM settings as the original-data runs
    evaluate_model(X_train, X_test, y_train, y_test, kernel=kernel)

Dữ liệu gốc

In [48]:
# Kernels compared throughout the notebook; later cells reuse `kernels`.
kernels = ['linear', 'sigmoid', 'poly', 'rbf']

# Evaluate every kernel at each training ratio, kernel-major
# (all ratios for one kernel before moving to the next).
for kernel, r in ((name, frac) for name in kernels for frac in [0.8, 0.7, 0.6]):
    process_original_data(df, r, kernel)


Mô hình SVM với dữ liệu gốc - Tỉ lệ = 8:1 --- Kernel = linear ---
[Train] Classification Report:
              precision    recall  f1-score   support

         1.0       0.99      0.89      0.94      1323
         2.0       0.59      0.88      0.71       236
         3.0       0.79      0.92      0.85       141

    accuracy                           0.89      1700
   macro avg       0.79      0.90      0.83      1700
weighted avg       0.92      0.89      0.90      1700

[Test] Classification Report:
              precision    recall  f1-score   support

         1.0       0.99      0.89      0.93       332
         2.0       0.56      0.85      0.68        59
         3.0       0.72      0.80      0.76        35

    accuracy                           0.87       426
   macro avg       0.76      0.84      0.79       426
weighted avg       0.91      0.87      0.88       426


Mô hình SVM với dữ liệu gốc - Tỉ lệ = 7:3 --- Kernel = linear ---
[Train] Classification Report:
            

Dữ liệu giảm chiều bằng PCA

In [39]:
file_paths_pca = {
    "80_20": {
        'train': r'..\..\data\dimension_reduction\pca\train_80.csv',
        'test': r'..\..\data\dimension_reduction\pca\test_20.csv'
    },
    "70_30": {
        'train': r'..\..\data\dimension_reduction\pca\train_70.csv',
        'test': r'..\..\data\dimension_reduction\pca\test_30.csv'
    },
    "60_40": {
        'train': r'..\..\data\dimension_reduction\pca\train_60.csv',
        'test': r'..\..\data\dimension_reduction\pca\test_40.csv'
    }
}
for kernel in kernels:
    for split, paths in file_paths_pca.items():
        process_presplit_data(
            train_path=paths['train'],
            test_path=paths['test'],
            kernel = kernel,
            target_col='Target',
            method=f'PCA'
        )


Mô hình SVM với dữ liệu giảm chiều bằng PCA tỉ lệ 80_20 --- Kernel = linear ---
[Train] Classification Report:
              precision    recall  f1-score   support

         1.0       0.99      0.87      0.93      1323
         2.0       0.55      0.86      0.67       236
         3.0       0.76      0.90      0.82       141

    accuracy                           0.87      1700
   macro avg       0.77      0.88      0.81      1700
weighted avg       0.91      0.87      0.88      1700

[Test] Classification Report:
              precision    recall  f1-score   support

         1.0       0.98      0.86      0.91       332
         2.0       0.51      0.85      0.63        59
         3.0       0.76      0.83      0.79        35

    accuracy                           0.85       426
   macro avg       0.75      0.84      0.78       426
weighted avg       0.90      0.85      0.87       426


Mô hình SVM với dữ liệu giảm chiều bằng PCA tỉ lệ 70_30 --- Kernel = linear ---
[Train] Classif

Dữ liệu giảm chiều bằng LDA

In [42]:
file_paths_lda = {
    "80_20": {
        'train': r'..\..\data\dimension_reduction\lda\train_80.csv',
        'test': r'..\..\data\dimension_reduction\lda\test_20.csv'
    },
    "70_30": {
        'train': r'..\..\data\dimension_reduction\lda\train_70.csv',
        'test': r'..\..\data\dimension_reduction\lda\test_30.csv'
    },
    "60_40": {
        'train': r'..\..\data\dimension_reduction\lda\train_60.csv',
        'test': r'..\..\data\dimension_reduction\lda\test_40.csv'
    }
}
for kernel in kernels:
    for split, paths in file_paths_lda.items():
        process_presplit_data(
            train_path=paths['train'],
            test_path=paths['test'],
            kernel = kernel,
            target_col='Target',
            method=f'LDA'
        )


Mô hình SVM với dữ liệu giảm chiều bằng LDA tỉ lệ 80_20 --- Kernel = linear ---
[Train] Classification Report:
              precision    recall  f1-score   support

         1.0       0.98      0.84      0.91      1323
         2.0       0.51      0.85      0.64       236
         3.0       0.68      0.84      0.75       141

    accuracy                           0.84      1700
   macro avg       0.72      0.84      0.76      1700
weighted avg       0.89      0.84      0.86      1700

[Test] Classification Report:
              precision    recall  f1-score   support

         1.0       0.98      0.85      0.91       332
         2.0       0.51      0.83      0.63        59
         3.0       0.66      0.77      0.71        35

    accuracy                           0.84       426
   macro avg       0.72      0.82      0.75       426
weighted avg       0.89      0.84      0.86       426


Mô hình SVM với dữ liệu giảm chiều bằng LDA tỉ lệ 70_30 --- Kernel = linear ---
[Train] Classif