## Lựa chọn phân lớp 1 (Normal) và dùng giá trị hàm softmax của phân lớp đó để chuyển bài toán về dạng hồi quy

In [11]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Load the preprocessed dataset: "NSP" is the class-label column and every
# other column is used as a feature.
df = pd.read_csv("../../data/data_processed/data_processed.csv")
X = df.drop(columns=["NSP"])
y = df["NSP"].astype(int)

# Standardise the features.
# NOTE(review): the scaler is fitted on the full dataset, so the train/test
# splits taken from X_scaled in later cells share its statistics -- confirm
# this is acceptable for the reported test metrics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Multinomial (softmax) logistic regression. `random_state` pins the shuffling
# done by the stochastic 'saga' solver so that y_reg (and every metric derived
# from it) is reproducible, matching the random_state=42 used elsewhere here.
# NOTE(review): newer scikit-learn releases deprecate `multi_class`; confirm
# against the installed version before removing it.
clf = LogisticRegression(
    multi_class='multinomial', solver='saga', max_iter=500, random_state=42
)
clf.fit(X_scaled, y)

A_sklearn = clf.predict_proba(X_scaled)

# predict_proba columns follow clf.classes_, so look the label up explicitly
# rather than assuming class 1 (Normal) sits in column 0. Raises ValueError if
# label 1 is not among the fitted classes.
normal_col = list(clf.classes_).index(1)
y_reg = A_sklearn[:, normal_col]  # probability of class 1 becomes the new regression target
print(y_reg)



[0.01274304 0.99982427 0.99906056 ... 0.32245187 0.37441337 0.58847689]




## Linear Regression

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

def evaluate_linear_regression(X, y_reg):
    """Fit ordinary linear regression on several train/test splits.

    For each of the 8:2, 7:3 and 6:4 splits (made with ``random_state=42``)
    a fresh ``LinearRegression`` is fitted on the training part and scored
    on both parts.

    Args:
        X: Feature matrix passed to ``train_test_split``.
        y_reg: Regression target aligned with ``X``.

    Returns:
        Dict keyed by split label ("8:2", "7:3", "6:4"); each value is a
        dict with "Train MSE", "Train R2", "Test MSE" and "Test R2",
        rounded to 4 decimal places.
    """
    # (label, held-out fraction) pairs, in the order they are reported
    split_fractions = (("8:2", 0.2), ("7:3", 0.3), ("6:4", 0.4))
    summary = {}

    for tag, holdout in split_fractions:
        X_fit, X_holdout, y_fit, y_holdout = train_test_split(
            X, y_reg, test_size=holdout, random_state=42
        )

        regressor = LinearRegression().fit(X_fit, y_fit)
        fit_pred = regressor.predict(X_fit)
        holdout_pred = regressor.predict(X_holdout)

        summary[tag] = {
            "Train MSE": round(mean_squared_error(y_fit, fit_pred), 4),
            "Train R2": round(r2_score(y_fit, fit_pred), 4),
            "Test MSE": round(mean_squared_error(y_holdout, holdout_pred), 4),
            "Test R2": round(r2_score(y_holdout, holdout_pred), 4),
        }

    return summary

### Dữ liệu gốc 

In [13]:
import pprint

# Linear regression on the standardised features, without dimensionality reduction
results_lr_origin = evaluate_linear_regression(X_scaled, y_reg)
print("Linear Regression original data:")
pprint.pprint(results_lr_origin)

Linear Regression original data:
{'6:4': {'Test MSE': 0.0239,
         'Test R2': 0.7875,
         'Train MSE': 0.0223,
         'Train R2': 0.8117},
 '7:3': {'Test MSE': 0.0239,
         'Test R2': 0.7918,
         'Train MSE': 0.0224,
         'Train R2': 0.8076},
 '8:2': {'Test MSE': 0.0227,
         'Test R2': 0.7975,
         'Train MSE': 0.0228,
         'Train R2': 0.8054}}


### Dữ liệu giảm 1/3 số chiều 

In [14]:
from sklearn.decomposition import PCA

# Project the standardised features onto 7 principal components.
# NOTE(review): PCA is fitted on every row of X_scaled, i.e. before the
# train/test splits made inside the evaluation helper -- confirm this is intended.
pca = PCA(n_components=7)
X_reduced = pca.fit_transform(X_scaled)

# Repeat the linear-regression evaluation on the reduced features
results_lr_pca = evaluate_linear_regression(X_reduced, y_reg)
print("Linear Regression PCA(n_components = 7):")
pprint.pprint(results_lr_pca)

Linear Regression PCA(n_components = 7):
{'6:4': {'Test MSE': 0.0331,
         'Test R2': 0.7059,
         'Train MSE': 0.0299,
         'Train R2': 0.748},
 '7:3': {'Test MSE': 0.034,
         'Test R2': 0.7042,
         'Train MSE': 0.0299,
         'Train R2': 0.7435},
 '8:2': {'Test MSE': 0.0345,
         'Test R2': 0.6921,
         'Train MSE': 0.0302,
         'Train R2': 0.7418}}


## KNN Regression

In [15]:
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import make_scorer, r2_score

# Define the model and the hyper-parameter grid
knn = KNeighborsRegressor()
param_grid = {
    "n_neighbors": list(range(3, 21))  # try k = 3 through k = 20
}
scorer = make_scorer(r2_score)

# Nested CV: the inner folds pick k by R², the outer folds score that
# whole selection procedure
inner_cv = KFold(n_splits=4, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=inner_cv, scoring=scorer)
nested_scores = cross_val_score(grid_search, X_scaled, y_reg, cv=outer_cv, scoring=scorer)

# Fit the grid search on all of X_scaled / y_reg so that best_params_ can be read below
grid_search.fit(X_scaled, y_reg)

print("R² (mean ± std) từ nested CV:", nested_scores.mean(), "±", nested_scores.std())
# NOTE(review): best_params_ is read from the grid search fitted just above on
# the full data (inner_cv only), not from the outer nested-CV loop, despite
# what the printed label says.
print("Best k từ nested CV:", grid_search.best_params_["n_neighbors"])


R² (mean ± std) từ nested CV: 0.9153597945306355 ± 0.01632043856458432
Best k từ nested CV: 3


In [16]:
def evaluate_knn_regression(X, y_reg, n_neighbors=3):
    """Fit a k-nearest-neighbours regressor on several train/test splits.

    For each of the 8:2, 7:3 and 6:4 splits (made with ``random_state=42``)
    a fresh ``KNeighborsRegressor`` is fitted on the training part and
    scored on both parts.

    Args:
        X: Feature matrix passed to ``train_test_split``.
        y_reg: Regression target aligned with ``X``.
        n_neighbors: Number of neighbours used by the regressor. Defaults
            to 3, the value that was previously hard-coded here, so
            existing two-argument calls behave exactly as before.

    Returns:
        Dict keyed by split label ("8:2", "7:3", "6:4"); each value is a
        dict with "Train MSE", "Train R2", "Test MSE" and "Test R2",
        rounded to 4 decimal places.
    """
    results = {}
    # Split label -> fraction of the data held out for testing
    ratios = {
        "8:2": 0.2,
        "7:3": 0.3,
        "6:4": 0.4,
    }

    for label, test_size in ratios.items():
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_reg, test_size=test_size, random_state=42
        )

        # k is a parameter so a re-tuned value can be passed in without
        # editing this function.
        model = KNeighborsRegressor(n_neighbors=n_neighbors)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        mse_train = mean_squared_error(y_train, y_train_pred)
        r2_train = r2_score(y_train, y_train_pred)

        mse_test = mean_squared_error(y_test, y_test_pred)
        r2_test = r2_score(y_test, y_test_pred)

        results[label] = {
            "Train MSE": round(mse_train, 4),
            "Train R2": round(r2_train, 4),
            "Test MSE": round(mse_test, 4),
            "Test R2": round(r2_test, 4),
        }

    return results

### Dữ liệu gốc 

In [17]:
# KNN regression on the standardised features, without dimensionality reduction
results_knn_origin = evaluate_knn_regression(X_scaled, y_reg)
print("KNN Regression original data:")
pprint.pprint(results_knn_origin)

KNN Regression original data:
{'6:4': {'Test MSE': 0.0138,
         'Test R2': 0.8776,
         'Train MSE': 0.0051,
         'Train R2': 0.957},
 '7:3': {'Test MSE': 0.0121,
         'Test R2': 0.8942,
         'Train MSE': 0.005,
         'Train R2': 0.9571},
 '8:2': {'Test MSE': 0.0114,
         'Test R2': 0.8986,
         'Train MSE': 0.0045,
         'Train R2': 0.9619}}


### Dữ liệu giảm chiều

In [18]:
# KNN regression on X_reduced, the PCA-projected features built in the earlier PCA cell
results_knn_pca = evaluate_knn_regression(X_reduced, y_reg)
print("KNN Regression PCA(n_components = 7):")
pprint.pprint(results_knn_pca)

KNN Regression PCA(n_components = 7):
{'6:4': {'Test MSE': 0.0143,
         'Test R2': 0.873,
         'Train MSE': 0.0058,
         'Train R2': 0.9514},
 '7:3': {'Test MSE': 0.0128,
         'Test R2': 0.8881,
         'Train MSE': 0.0056,
         'Train R2': 0.9523},
 '8:2': {'Test MSE': 0.0115,
         'Test R2': 0.8976,
         'Train MSE': 0.0053,
         'Train R2': 0.9543}}


## Nhận xét:
