# **Chuyển bài toán về dạng hồi quy**

Xét phương pháp trong Nhóm 2 và Nhóm 3, lựa chọn 1 phân lớp và dựa vào giá trị của hàm quyết định cho phân lớp đó (ví dụ hàm softmax – logistic hoặc hàm đánh giá score trong SVM) để chuyển bài toán về dạng hồi quy. Dữ liệu chia train : test như ý (a).
- Thực hiện ít nhất 02 mô hình hồi quy trên tập dữ liệu với đầu ra mới xây dựng.
- Đưa ra kết quả trong trường hợp dữ liệu đầu vào nguyên bản và dữ liệu giảm về còn 1/3 số chiều.
- Đánh giá, so sánh các kết quả thực nghiệm trong mỗi trường hợp. Giải thích xem tại sao lại như vậy.


## **Thư viện**

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix, classification_report
import seaborn as sns

## **Dữ liệu**

In [14]:
df = pd.read_csv("../../data/data_processed/data_processed.csv")
df.head()

Unnamed: 0,LB,AC,FM,UC,DL,DS,DP,ASTV,MSTV,ALTV,...,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,NSP
0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,4.0,0.0,4.0,2.0,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,2.0,0.0,5.0,2.0,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,2.0,0.0,6.0,2.0,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,4.0,0.0,5.0,0.0,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0


In [15]:
# Tách đặc trưng và nhãn
X = df.drop(columns=["NSP"]).values
# Nhãn
y = df["NSP"].values 

# Chuẩn hóa đặc trưng
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Giảm chiều với PCA, giảm còn 1/3 số chiều
pca = PCA(n_components=X.shape[1]//3)  
X_pca = pca.fit_transform(X_scaled)


## **Chia train/test**

In [16]:
test_sizes = [0.3, 0.4, 0.2]

## **Xây dựng mô hình**

In [17]:
def evaluate_split(test_size, pca_ratio=1.0):
    X_use = X_scaled
    if pca_ratio < 1.0:
        pca = PCA(n_components=int(X_scaled.shape[1] * pca_ratio))
        X_use = pca.fit_transform(X_scaled)

    X_train, X_test, y_train, y_test = train_test_split(
        X_use, y, test_size=test_size, stratify=y, random_state=42)

    # Huấn luyện mô hình Logistic Regression đa lớp để lấy xác suất
    model_cls = LogisticRegression( solver='lbfgs', max_iter=1000)
    model_cls.fit(X_train, y_train)

    # Chọn lớp để hồi quy (lớp 1)
    class_label = 1
    y_train_reg = model_cls.predict_proba(X_train)[:, class_label]
    y_test_reg = model_cls.predict_proba(X_test)[:, class_label]

    # Hồi quy tuyến tính
    lr = LinearRegression()
    lr.fit(X_train, y_train_reg)
    pred_train_lr = lr.predict(X_train)
    pred_test_lr = lr.predict(X_test)

    # Hồi quy KNN
    knn = KNeighborsRegressor(n_neighbors=5)
    knn.fit(X_train, y_train_reg)
    pred_train_knn = knn.predict(X_train)
    pred_test_knn = knn.predict(X_test)

    return {
        "split": f"{int((1 - test_size)*100)}:{int(test_size*100)}",
        "original_dim": X_use.shape[1],
        "lr_train_mse": mean_squared_error(y_train_reg, pred_train_lr),
        "lr_test_mse": mean_squared_error(y_test_reg, pred_test_lr),
        "knn_train_mse": mean_squared_error(y_train_reg, pred_train_knn),
        "knn_test_mse": mean_squared_error(y_test_reg, pred_test_knn)
    }


In [None]:
results = []

for test_ratio in [0.3, 0.4, 0.2]:  # tương ứng 7:3, 6:4, 8:2 
    results.append(evaluate_split(test_ratio, pca_ratio=1.0))         # Dữ liệu gốc
    results.append(evaluate_split(test_ratio, pca_ratio=1/3))         # Dữ liệu giảm 1/3

df_results = pd.DataFrame(results)
df_results


Unnamed: 0,split,original_dim,lr_train_mse,lr_test_mse,knn_train_mse,knn_test_mse
0,70:30,21,0.02268,0.02416,0.005085,0.010784
1,70:30,7,0.0151,0.015869,0.00154,0.003338
2,60:40,21,0.022099,0.023002,0.005364,0.009495
3,60:40,7,0.014473,0.01528,0.001744,0.003149
4,80:20,21,0.022298,0.024338,0.004952,0.010174
5,80:20,7,0.015265,0.015133,0.001597,0.002935


In [19]:
for i, row in df_results.iterrows():
    print(f"Split {row['split']} | Chiều: {row['original_dim']}")
    print(f"  Linear Regression - Train MSE: {row['lr_train_mse']:.4f}, Test MSE: {row['lr_test_mse']:.4f}")
    print(f"  KNN Regression    - Train MSE: {row['knn_train_mse']:.4f}, Test MSE: {row['knn_test_mse']:.4f}")
    print("-" * 60)


Split 70:30 | Chiều: 21
  Linear Regression - Train MSE: 0.0227, Test MSE: 0.0242
  KNN Regression    - Train MSE: 0.0051, Test MSE: 0.0108
------------------------------------------------------------
Split 70:30 | Chiều: 7
  Linear Regression - Train MSE: 0.0151, Test MSE: 0.0159
  KNN Regression    - Train MSE: 0.0015, Test MSE: 0.0033
------------------------------------------------------------
Split 60:40 | Chiều: 21
  Linear Regression - Train MSE: 0.0221, Test MSE: 0.0230
  KNN Regression    - Train MSE: 0.0054, Test MSE: 0.0095
------------------------------------------------------------
Split 60:40 | Chiều: 7
  Linear Regression - Train MSE: 0.0145, Test MSE: 0.0153
  KNN Regression    - Train MSE: 0.0017, Test MSE: 0.0031
------------------------------------------------------------
Split 80:20 | Chiều: 21
  Linear Regression - Train MSE: 0.0223, Test MSE: 0.0243
  KNN Regression    - Train MSE: 0.0050, Test MSE: 0.0102
----------------------------------------------------------