# **Chuyển bài toán về dạng hồi quy**

Xét phương pháp trong Nhóm 2 và Nhóm 3, lựa chọn 1 phân lớp và dựa vào giá trị của hàm quyết định cho phân lớp đó (ví dụ hàm softmax – logistic hoặc hàm đánh giá score trong SVM) để chuyển bài toán về dạng hồi quy. Dữ liệu chia train : test như ý (a).
- Thực hiện ít nhất 02 mô hình hồi quy trên tập dữ liệu với đầu ra mới xây dựng.
- Đưa ra kết quả trong trường hợp dữ liệu đầu vào nguyên bản và dữ liệu giảm về còn 1/3 số chiều.
- Đánh giá, so sánh các kết quả thực nghiệm trong mỗi trường hợp. Giải thích xem tại sao lại như vậy.


## **Thư viện**

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix, classification_report
import seaborn as sns

## **Dữ liệu**

In [13]:
df = pd.read_csv("../../data/data_processed/data_processed.csv")
df.head()

Unnamed: 0,LB,AC,FM,UC,DL,DS,DP,ASTV,MSTV,ALTV,...,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,NSP
0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,4.0,0.0,4.0,2.0,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,2.0,0.0,5.0,2.0,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,2.0,0.0,6.0,2.0,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,4.0,0.0,5.0,0.0,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0


In [14]:
# Tách đặc trưng và nhãn
X = df.drop(columns=["NSP"]).values
# Nhãn
y = df["NSP"].values 

# Chuẩn hóa đặc trưng
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Giảm chiều với PCA, giảm còn 1/3 số chiều
pca = PCA(n_components=X.shape[1]//3)  
X_pca = pca.fit_transform(X_scaled)


## **Chia train/test**

In [15]:
test_sizes = [0.3, 0.4, 0.2]

## **Xây dựng mô hình**

In [21]:
from sklearn.model_selection import train_test_split, cross_val_score

def find_best_k(X_train, y_train, k_range=range(1, 21), cv=5):
    best_k = None
    best_score = float("inf")
    k_scores = []

    for k in k_range:
        knn = KNeighborsRegressor(n_neighbors=k)
        neg_mse_scores = cross_val_score(knn, X_train, y_train, cv=cv, scoring='neg_mean_squared_error')
        mean_mse = -np.mean(neg_mse_scores)
        k_scores.append(mean_mse)

        if mean_mse < best_score:
            best_score = mean_mse
            best_k = k

    return best_k, k_scores

def evaluate_split(test_size, pca_ratio=1.0, verbose=True):
    X_use = X_scaled
    if pca_ratio < 1.0:
        pca = PCA(n_components=int(X_scaled.shape[1] * pca_ratio))
        X_use = pca.fit_transform(X_scaled)

    X_train, X_test, y_train, y_test = train_test_split(
        X_use, y, test_size=test_size, stratify=y, random_state=42)

    # Logistic Regression đa lớp để lấy xác suất lớp 1
    model_cls = LogisticRegression(solver='lbfgs', max_iter=1000)
    model_cls.fit(X_train, y_train)
    class_label = 0
    y_train_reg = model_cls.predict_proba(X_train)[:, class_label]
    y_test_reg = model_cls.predict_proba(X_test)[:, class_label]


    # Tìm k tốt nhất cho KNN bằng cross-validation trên tập huấn luyện
    best_k, k_mse_scores = find_best_k(X_train, y_train_reg)
    # if verbose:
    #     print(f"Best k found by CV: {best_k} with CV MSE: {min(k_mse_scores):.4f}")
    #     plt.plot(range(1, len(k_mse_scores)+1), k_mse_scores, marker='o')
    #     plt.xlabel('k')
    #     plt.ylabel('Cross-validated MSE')
    #     plt.title('Tìm k tốt nhất cho KNN Regression')
    #     plt.grid(True)
    #     plt.show()

    # Hồi quy KNN với best_k
    knn = KNeighborsRegressor(n_neighbors=best_k)
    knn.fit(X_train, y_train_reg)
    pred_train_knn = knn.predict(X_train)
    pred_test_knn = knn.predict(X_test)

    if verbose:
        print(f"Split {int((1 - test_size)*100)}:{int(test_size*100)} | Chiều: {X_use.shape[1]}")
        print(f"KNN Regression (k={best_k}) - Train MSE: {mean_squared_error(y_train_reg, pred_train_knn):.4f}, Test MSE: {mean_squared_error(y_test_reg, pred_test_knn):.4f}")
        print(f"Train R²: {r2_score(y_train_reg, pred_train_knn):.4f}, Test R²: {r2_score(y_test_reg, pred_test_knn):.4f}")
        print("-" * 60)

    return {
        "split": f"{int((1 - test_size)*100)}:{int(test_size*100)}",
        "original_dim": X_use.shape[1],
        "knn_train_mse": mean_squared_error(y_train_reg, pred_train_knn),
        "knn_test_mse": mean_squared_error(y_test_reg, pred_test_knn),
        "knn_train_r2": r2_score(y_train_reg, pred_train_knn),
        "knn_test_r2": r2_score(y_test_reg, pred_test_knn),
        "best_k": best_k,
        "k_mse_scores": k_mse_scores
    }


In [22]:
results = []

for test_ratio in [0.3, 0.4, 0.2]:  # tương ứng 7:3, 6:4, 8:2 
    results.append(evaluate_split(test_ratio, pca_ratio=1.0))         # Dữ liệu gốc
    results.append(evaluate_split(test_ratio, pca_ratio=1/3))         # Dữ liệu giảm 1/3

df_results = pd.DataFrame(results)
df_results


Split 70:30 | Chiều: 21
KNN Regression (k=2) - Train MSE: 0.0034, Test MSE: 0.0139
Train R²: 0.9710, Test R²: 0.8833
------------------------------------------------------------
Split 70:30 | Chiều: 7
KNN Regression (k=4) - Train MSE: 0.0026, Test MSE: 0.0058
Train R²: 0.9751, Test R²: 0.9465
------------------------------------------------------------
Split 60:40 | Chiều: 21
KNN Regression (k=3) - Train MSE: 0.0053, Test MSE: 0.0130
Train R²: 0.9540, Test R²: 0.8861
------------------------------------------------------------
Split 60:40 | Chiều: 7
KNN Regression (k=3) - Train MSE: 0.0023, Test MSE: 0.0063
Train R²: 0.9780, Test R²: 0.9412
------------------------------------------------------------
Split 80:20 | Chiều: 21
KNN Regression (k=3) - Train MSE: 0.0047, Test MSE: 0.0115
Train R²: 0.9603, Test R²: 0.9036
------------------------------------------------------------
Split 80:20 | Chiều: 7
KNN Regression (k=4) - Train MSE: 0.0026, Test MSE: 0.0042
Train R²: 0.9755, Test R²: 0.9

Unnamed: 0,split,original_dim,knn_train_mse,knn_test_mse,knn_train_r2,knn_test_r2,best_k,k_mse_scores
0,70:30,21,0.003401,0.013904,0.971024,0.883347,2,"[0.014493601346374374, 0.011300668904488814, 0..."
1,70:30,7,0.002642,0.005755,0.97507,0.9465,4,"[0.007452471058380092, 0.0062072802740282705, ..."
2,60:40,21,0.005347,0.012956,0.954017,0.886143,3,"[0.016649230916252877, 0.014749684156237147, 0..."
3,60:40,7,0.002307,0.006313,0.977953,0.941158,3,"[0.007510631455886765, 0.006311623938529168, 0..."
4,80:20,21,0.004668,0.011533,0.960251,0.903623,3,"[0.01391930452867978, 0.011785202132326724, 0...."
5,80:20,7,0.002606,0.004234,0.975502,0.960083,4,"[0.006423061052594645, 0.006049480673199524, 0..."


In [25]:
def evaluate_split(test_size, pca_ratio=1.0):
    X_use = X_scaled
    if pca_ratio < 1.0:
        pca = PCA(n_components=int(X_scaled.shape[1] * pca_ratio))
        X_use = pca.fit_transform(X_scaled)

    X_train, X_test, y_train, y_test = train_test_split(
        X_use, y, test_size=test_size, stratify=y, random_state=42)

    # Huấn luyện mô hình Logistic Regression đa lớp để lấy xác suất
    model_cls = LogisticRegression( solver='lbfgs', max_iter=1000)
    model_cls.fit(X_train, y_train)

    # Chọn lớp để hồi quy (lớp 1)
    class_label = 0
    y_train_reg = model_cls.predict_proba(X_train)[:, class_label]
    y_test_reg = model_cls.predict_proba(X_test)[:, class_label]

    # Hồi quy tuyến tính
    lr = LinearRegression()
    lr.fit(X_train, y_train_reg)
    pred_train_lr = lr.predict(X_train)
    pred_test_lr = lr.predict(X_test)

    # Hồi quy KNN
    knn = KNeighborsRegressor(n_neighbors=3)
    knn.fit(X_train, y_train_reg)
    pred_train_knn = knn.predict(X_train)
    pred_test_knn = knn.predict(X_test)

    return {
        "split": f"{int((1 - test_size)*100)}:{int(test_size*100)}",
        "original_dim": X_use.shape[1],
        "lr_train_mse": mean_squared_error(y_train_reg, pred_train_lr),
        "lr_test_mse": mean_squared_error(y_test_reg, pred_test_lr),
        "lr_train_r2": r2_score(y_train_reg, pred_train_lr),
        "lr_test_r2": r2_score(y_test_reg, pred_test_lr),
        "knn_train_mse": mean_squared_error(y_train_reg, pred_train_knn),
        "knn_test_mse": mean_squared_error(y_test_reg, pred_test_knn),
        "knn_train_r2": r2_score(y_train_reg, pred_train_knn),
        "knn_test_r2": r2_score(y_test_reg, pred_test_knn)
    }


In [26]:
results = []

for test_ratio in [0.3, 0.4, 0.2]:  # tương ứng 7:3, 6:4, 8:2 
    results.append(evaluate_split(test_ratio, pca_ratio=1.0))         # Dữ liệu gốc
    results.append(evaluate_split(test_ratio, pca_ratio=1/3))         # Dữ liệu giảm 1/3

df_results = pd.DataFrame(results)
df_results


Unnamed: 0,split,original_dim,lr_train_mse,lr_test_mse,lr_train_r2,lr_test_r2,knn_train_mse,knn_test_mse,knn_train_r2,knn_test_r2
0,70:30,21,0.023623,0.023133,0.798724,0.80591,0.00477,0.013765,0.959357,0.884508
1,70:30,7,0.020999,0.021574,0.80182,0.799449,0.002284,0.005704,0.978445,0.946972
2,60:40,21,0.02374,0.02332,0.795854,0.795065,0.005347,0.012956,0.954017,0.886143
3,60:40,7,0.02077,0.020759,0.801533,0.806506,0.002307,0.006313,0.977953,0.941158
4,80:20,21,0.023607,0.023682,0.798977,0.802105,0.004668,0.011533,0.960251,0.903623
5,80:20,7,0.02091,0.022113,0.8034,0.791507,0.002125,0.004109,0.980016,0.961258


In [27]:
for i, row in df_results.iterrows():
    print(f"Split {row['split']} | Chiều: {row['original_dim']}")
    print(f"  Linear Regression - Train MSE: {row['lr_train_mse']:.4f}, Test MSE: {row['lr_test_mse']:.4f}")
    print(f"                      Train R² : {row['lr_train_r2']:.4f}, Test R² : {row['lr_test_r2']:.4f}")
    print(f"  KNN Regression    - Train MSE: {row['knn_train_mse']:.4f}, Test MSE: {row['knn_test_mse']:.4f}")
    print(f"                      Train R² : {row['knn_train_r2']:.4f}, Test R² : {row['knn_test_r2']:.4f}")
    print("-" * 60)


Split 70:30 | Chiều: 21
  Linear Regression - Train MSE: 0.0236, Test MSE: 0.0231
                      Train R² : 0.7987, Test R² : 0.8059
  KNN Regression    - Train MSE: 0.0048, Test MSE: 0.0138
                      Train R² : 0.9594, Test R² : 0.8845
------------------------------------------------------------
Split 70:30 | Chiều: 7
  Linear Regression - Train MSE: 0.0210, Test MSE: 0.0216
                      Train R² : 0.8018, Test R² : 0.7994
  KNN Regression    - Train MSE: 0.0023, Test MSE: 0.0057
                      Train R² : 0.9784, Test R² : 0.9470
------------------------------------------------------------
Split 60:40 | Chiều: 21
  Linear Regression - Train MSE: 0.0237, Test MSE: 0.0233
                      Train R² : 0.7959, Test R² : 0.7951
  KNN Regression    - Train MSE: 0.0053, Test MSE: 0.0130
                      Train R² : 0.9540, Test R² : 0.8861
------------------------------------------------------------
Split 60:40 | Chiều: 7
  Linear Regression - Train