## **Mô hình Softmax Regression**

## **Import các thư viện cần thiết**

In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from scipy import sparse
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


## **Dữ liệu**

In [78]:
# Load the preprocessed dataset from a path relative to the notebook's working directory.
df = pd.read_csv("../data/data_processed/data_processed.csv")
# Preview the first rows; as the last expression of the cell it is rendered as the output.
df.head()

Unnamed: 0,LB,AC,FM,UC,DL,DS,DP,ASTV,MSTV,ALTV,...,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,NSP
0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,4.0,0.0,4.0,2.0,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,2.0,0.0,5.0,2.0,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,2.0,0.0,6.0,2.0,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,4.0,0.0,5.0,0.0,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0


## **Xây dựng Softmax Regression**

In [79]:
# Split the frame into the feature matrix and the label vector.
X = df.drop(columns=["NSP"]).values
# Labels: NSP is stored as floats (1.0, 2.0, 3.0). Convert to 0-based *integer*
# class ids, because they are later used as row indices when building the
# one-hot matrix and as class labels in the reports.
y = df["NSP"].values.astype(int) - 1

# Standardize the features to zero mean / unit variance.
# NOTE(review): the scaler is fitted on the full dataset before any train/validation
# split, so validation-set statistics influence the training features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Number of distinct classes.
C = len(np.unique(y))


In [167]:
def convert_labels(y, C):
    """One-hot encode class labels, one column per sample.

    Args:
        y: 1-D array-like of class ids in ``[0, C)``. Integer-valued floats
            (e.g. ``2.0``) are accepted.
        C: Number of classes (number of rows of the result).

    Returns:
        Dense ``(C, len(y))`` array with ``Y[y[i], i] == 1`` and zeros elsewhere.
        The element dtype follows ``np.ones_like(y)``.

    Raises:
        ValueError: If a label is not integer-valued (or, from scipy, if a
            label falls outside ``[0, C)``).
    """
    y = np.asarray(y)
    # Sparse index arrays must be integers. Cast explicitly instead of relying on
    # an implicit conversion, and refuse labels the cast would silently truncate.
    rows = y.astype(np.intp)
    if y.size and not np.array_equal(rows, y):
        raise ValueError("class labels must be integer-valued")
    cols = np.arange(len(y))
    Y = sparse.coo_matrix((np.ones_like(y), (rows, cols)), shape=(C, len(y))).toarray()
    return Y

def softmax_stable(Z):
    """Column-wise softmax computed with a max-shift for numerical stability.

    Each column of ``Z`` is treated as one vector of scores; the column maximum
    is subtracted before exponentiating so the largest exponent is 0.
    """
    col_max = np.max(Z, axis=0, keepdims=True)
    shifted_exp = np.exp(Z - col_max)
    col_sum = shifted_exp.sum(axis=0)
    return shifted_exp / col_sum

def softmax(Z):
    """Column-wise softmax of the score matrix ``Z``.

    Each column of ``Z`` is one vector of scores; the result has the same shape
    and every column sums to 1.

    The column maximum is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``np.exp`` from overflowing to
    ``inf`` (which would turn the output into ``nan``) for large scores.
    """
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    e_Z = np.exp(shifted)
    return e_Z / e_Z.sum(axis=0)

def softmax_regression(X, y, W_init, eta=0.05, tol=1e-4, max_count=10000):
    """Fit softmax regression with single-sample stochastic gradient descent.

    Args:
        X: ``(d, N)`` data matrix with one sample per column.
        y: Length-``N`` class ids in ``[0, C)``.
        W_init: ``(d, C)`` initial weight matrix; ``C`` is taken from its shape.
        eta: Learning rate.
        tol: Convergence threshold. Every 1500 updates the new weights are
            compared with the weights from 1500 updates earlier; training stops
            when the norm of the difference is below ``tol``.
        max_count: Maximum number of single-sample updates.

    Returns:
        The list of weight matrices visited during training, starting with
        ``W_init``. The last element holds the final weights.
    """
    W = [W_init]
    C = W_init.shape[1]
    Y = convert_labels(y, C)
    N = X.shape[1]
    d = X.shape[0]
    check_w_after = 1500

    # With no samples the inner loop never runs and the counter never advances,
    # so the outer loop would spin forever; there is nothing to learn anyway.
    if N == 0:
        return W

    count = 0
    while count < max_count:
        # Visit the samples in a fresh random order each epoch.
        mix_id = np.random.permutation(N)
        for i in mix_id:
            xi = X[:, i].reshape(d, 1)
            yi = Y[:, i].reshape(C, 1)
            # Use the overflow-safe softmax so large scores cannot produce nan weights.
            ai = softmax_stable(np.dot(W[-1].T, xi))
            W_new = W[-1] + eta * xi.dot((yi - ai).T)
            count += 1
            if count % check_w_after == 0:
                if np.linalg.norm(W_new - W[-check_w_after]) < tol:
                    return W
            W.append(W_new)
            # Enforce the update budget inside the epoch too, not only between epochs.
            if count >= max_count:
                return W
    return W

def pred(W, X):
    """Predict a class id for every column of ``X``.

    Scores are ``W.T @ X``; the predicted class of a column is the row index
    with the highest softmax probability.
    """
    scores = W.T.dot(X)
    probabilities = softmax_stable(scores)
    return np.argmax(probabilities, axis=0)

In [168]:
def evaluate_model(y_test, y_pred):
    """Score predictions against the true labels.

    Returns a 3-tuple, in this order: accuracy, the text classification
    report, and the confusion matrix.
    """
    metric_functions = (accuracy_score, classification_report, confusion_matrix)
    return tuple(metric(y_test, y_pred) for metric in metric_functions)


## **Triển khai Softmax Regression với nhiều tỷ lệ train/validation: 4:1; 7:3; 6:4**

### **Giảm trước chia sau**

In [165]:
def softmax_pipeline(X, y, train_ratios=[0.8, 0.7, 0.6], use_pca=False, use_lda=False, n_components=None, lam=0):
    """Train and evaluate softmax regression, reducing dimensionality BEFORE splitting.

    The optional PCA/LDA projection is fitted on the whole of ``X`` (and, for
    LDA, all of ``y``), and only then is the data split into train/validation.
    The projection therefore sees the validation rows; that is the point of
    this "reduce first, split later" experiment.

    Args:
        X: ``(n_samples, n_features)`` feature matrix.
        y: Length-``n_samples`` class ids (0-based).
        train_ratios: Train fractions to evaluate, one model per entry.
        use_pca: Project with PCA. Takes precedence over ``use_lda``.
        use_lda: Project with LDA (only consulted when ``use_pca`` is false).
        n_components: Target dimensionality. For LDA it is capped at
            ``n_classes - 1``; ``None`` leaves the choice to the estimator.
        lam: Accepted for interface compatibility; currently unused (no
            regularization is applied).

    Returns:
        A list with one dict per ratio holding the method name, the ratio
        label, train/validation accuracy, text reports and confusion matrices.
    """
    results = []

    ratio_labels = {
        0.8: "4:1",
        0.7: "7:3",
        0.6: "6:4"
    }

    n_classes = len(np.unique(y))

    # The projection does not depend on the split ratio, so fit it once here
    # instead of refitting the same thing for every ratio.
    method = "gốc"
    X_reduced = X
    if use_pca:
        method = "PCA"
        X_reduced = PCA(n_components=n_components).fit_transform(X)
    elif use_lda:
        method = "LDA"
        # min(None, int) raises TypeError, so only cap an explicit request.
        if n_components is None:
            lda_components = None
        else:
            lda_components = min(n_components, n_classes - 1)
        X_reduced = LDA(n_components=lda_components).fit_transform(X, y)

    for train_ratio in train_ratios:
        # Stratified split with a fixed seed, so each ratio always yields the same partition.
        X_train, X_val, y_train, y_val = train_test_split(
            X_reduced, y,
            train_size=train_ratio,
            stratify=y,
            random_state=42
        )

        # Prepend the bias column after splitting.
        X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
        X_val = np.hstack([np.ones((X_val.shape[0], 1)), X_val])

        # The trainer expects one sample per column.
        X_train_T = X_train.T
        X_val_T = X_val.T

        # Random initial weights (drawn from the global NumPy RNG, which is not seeded here).
        d = X_train_T.shape[0]
        W_init = np.random.randn(d, n_classes)

        # Train and keep only the final weights from the returned history.
        W = softmax_regression(X_train_T, y_train, W_init)[-1]

        # Evaluate on the validation split.
        y_pred = pred(W, X_val_T)
        acc, cr, cm = evaluate_model(y_val, y_pred)

        # Evaluate on the training split, to compare against validation for overfitting.
        y_train_pred = pred(W, X_train_T)
        train_acc, train_cr, train_cm = evaluate_model(y_train, y_train_pred)

        ratio_str = ratio_labels.get(train_ratio, f"{train_ratio*100:.0f}:{(1-train_ratio)*100:.0f}")

        print(f"\nMô hình Softmax Regression với dữ liệu {method}:")
        print(f"Tỷ lệ {ratio_str}")
        print(f"Accuracy (Train): {train_acc:.4f}")
        print(f"Accuracy (Test): {acc:.4f}")
        print("Classification Report (Train):")
        print(train_cr)
        print("Classification Report (Test):")
        print(cr)

        results.append({
            'method': method,
            'train_ratio': ratio_str,
            'train_accuracy': train_acc,
            'test_accuracy': acc,
            'train_report': train_cr,
            'test_report': cr,
            'train_confusion_matrix': train_cm,
            'test_confusion_matrix': cm
        })

    return results


In [83]:
print(X_scaled.shape)  # Sanity check: shape of the standardized feature matrix
print(y.shape)         # Sanity check: shape of the label vector


(2126, 21)
(2126,)


In [432]:
# Baseline run: no dimensionality reduction, 80% of the data used for training.
results = softmax_pipeline(X_scaled, y, train_ratios=[0.8])


Mô hình Softmax Regression với dữ liệu gốc:
Tỷ lệ 4:1
Accuracy (Train): 0.8882
Accuracy (Test): 0.8779
Classification Report (Train):
              precision    recall  f1-score   support

         0.0       0.92      0.96      0.94      1323
         1.0       0.67      0.53      0.59       236
         2.0       0.82      0.82      0.82       141

    accuracy                           0.89      1700
   macro avg       0.81      0.77      0.79      1700
weighted avg       0.88      0.89      0.88      1700

Classification Report (Test):
              precision    recall  f1-score   support

         0.0       0.93      0.94      0.94       332
         1.0       0.63      0.63      0.63        59
         2.0       0.76      0.71      0.74        35

    accuracy                           0.88       426
   macro avg       0.77      0.76      0.77       426
weighted avg       0.88      0.88      0.88       426



In [480]:
# Baseline run: no dimensionality reduction, 70% of the data used for training.
results = softmax_pipeline(X_scaled, y, train_ratios=[0.7])


Mô hình Softmax Regression với dữ liệu gốc:
Tỷ lệ 7:3
Accuracy (Train): 0.9019
Accuracy (Test): 0.8934
Classification Report (Train):
              precision    recall  f1-score   support

         0.0       0.95      0.95      0.95      1158
         1.0       0.68      0.69      0.68       207
         2.0       0.82      0.84      0.83       123

    accuracy                           0.90      1488
   macro avg       0.82      0.82      0.82      1488
weighted avg       0.90      0.90      0.90      1488

Classification Report (Test):
              precision    recall  f1-score   support

         0.0       0.96      0.94      0.95       497
         1.0       0.62      0.74      0.68        88
         2.0       0.84      0.72      0.78        53

    accuracy                           0.89       638
   macro avg       0.81      0.80      0.80       638
weighted avg       0.90      0.89      0.90       638



In [240]:
# Baseline run: no dimensionality reduction, 60% of the data used for training.
results = softmax_pipeline(X_scaled, y, train_ratios=[0.6])


Mô hình Softmax Regression với dữ liệu gốc:
Tỷ lệ 6:4
Accuracy (Train): 0.8871
Accuracy (Test): 0.8884
Classification Report (Train):
              precision    recall  f1-score   support

         0.0       0.94      0.96      0.95       992
         1.0       0.67      0.54      0.60       177
         2.0       0.73      0.80      0.76       106

    accuracy                           0.89      1275
   macro avg       0.78      0.77      0.77      1275
weighted avg       0.88      0.89      0.88      1275

Classification Report (Test):
              precision    recall  f1-score   support

         0.0       0.94      0.96      0.95       663
         1.0       0.65      0.58      0.61       118
         2.0       0.75      0.77      0.76        70

    accuracy                           0.89       851
   macro avg       0.78      0.77      0.77       851
weighted avg       0.88      0.89      0.89       851



#### **Thực hiện với dữ liệu đã giảm chiều bằng PCA**

In [166]:
# PCA down to 12 components, evaluated at all three train fractions (0.8, 0.7, 0.6).
results = softmax_pipeline(X_scaled, y, train_ratios=[0.8, 0.7, 0.6], use_pca=True, n_components=12)


Mô hình Softmax Regression với dữ liệu PCA:
Tỷ lệ 4:1
Accuracy (Train): 0.8788
Accuracy (Test): 0.8732
Classification Report (Train):
              precision    recall  f1-score   support

         0.0       0.93      0.94      0.94      1323
         1.0       0.61      0.57      0.59       236
         2.0       0.78      0.80      0.79       141

    accuracy                           0.88      1700
   macro avg       0.78      0.77      0.77      1700
weighted avg       0.88      0.88      0.88      1700

Classification Report (Test):
              precision    recall  f1-score   support

         0.0       0.93      0.94      0.94       332
         1.0       0.57      0.59      0.58        59
         2.0       0.81      0.71      0.76        35

    accuracy                           0.87       426
   macro avg       0.77      0.75      0.76       426
weighted avg       0.87      0.87      0.87       426


Mô hình Softmax Regression với dữ liệu PCA:
Tỷ lệ 7:3
Accuracy (Train): 

#### **Thực hiện với dữ liệu đã giảm chiều bằng LDA**

In [86]:
# LDA with 2 components requested, evaluated at all three train fractions (0.8, 0.7, 0.6).
results = softmax_pipeline(X_scaled, y, train_ratios=[0.8, 0.7, 0.6], use_lda=True, n_components=2)


Mô hình Softmax Regression với dữ liệu LDA:
Tỷ lệ 4:1
Accuracy (Train): 0.8871
Accuracy (Test): 0.8803
Classification Report (Train):
              precision    recall  f1-score   support

         0.0       0.92      0.97      0.94      1323
         1.0       0.74      0.47      0.57       236
         2.0       0.77      0.82      0.80       141

    accuracy                           0.89      1700
   macro avg       0.81      0.75      0.77      1700
weighted avg       0.88      0.89      0.88      1700

Classification Report (Test):
              precision    recall  f1-score   support

         0.0       0.92      0.97      0.94       332
         1.0       0.68      0.46      0.55        59
         2.0       0.72      0.74      0.73        35

    accuracy                           0.88       426
   macro avg       0.77      0.72      0.74       426
weighted avg       0.87      0.88      0.87       426


Mô hình Softmax Regression với dữ liệu LDA:
Tỷ lệ 7:3
Accuracy (Train): 

In [87]:
# Show the stored validation report for the LDA run at the 7:3 split and keep
# its confusion matrix for later use.
for res in results:
    # softmax_pipeline records the method as "LDA" (the component count is not
    # part of the label), and stores the validation report and confusion matrix
    # under 'test_report' / 'test_confusion_matrix'.
    if res['method'] == 'LDA' and res['train_ratio'] == '7:3':
        print("Kết quả LDA-2 với tỉ lệ 7:3")
        print(f"Classification Report:\n{res['test_report']}\n")
        conf_matrix = res['test_confusion_matrix']

### **Chia trước giảm sau**

In [88]:
def softmax_pipeline(X, y, train_ratios=[0.8, 0.7, 0.6], use_pca=False, use_lda=False, n_components=None, lam=0):
    """Train and evaluate softmax regression, splitting BEFORE reducing dimensionality.

    For every train fraction the data is split first; the optional PCA/LDA
    projection is then fitted on the training split only and applied to the
    validation split.

    Note: this reuses the name ``softmax_pipeline``, so once this cell runs it
    replaces the earlier reduce-then-split definition in the notebook namespace.

    Args:
        X: ``(n_samples, n_features)`` feature matrix.
        y: Length-``n_samples`` class ids (0-based).
        train_ratios: Train fractions to evaluate, one model per entry.
        use_pca: Project with PCA. Takes precedence over ``use_lda``.
        use_lda: Project with LDA (only consulted when ``use_pca`` is false).
        n_components: Target dimensionality. For LDA it is capped at
            ``n_classes - 1``; ``None`` leaves the choice to the estimator.
        lam: Accepted for interface compatibility; currently unused (no
            regularization is applied).

    Returns:
        A list with one dict per ratio holding the method name, the ratio
        label, train/validation accuracy, text reports and confusion matrices.
    """
    results = []

    ratio_labels = {
        0.8: "4:1",
        0.7: "7:3",
        0.6: "6:4"
    }

    for train_ratio in train_ratios:
        method = "gốc"

        # 1. Split first (stratified, fixed seed).
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, train_size=train_ratio, stratify=y, random_state=42)

        # 2. Optional dimensionality reduction, fitted on the training split only.
        if use_pca:
            method = "PCA"
            pca = PCA(n_components=n_components)
            X_train = pca.fit_transform(X_train)
            X_val = pca.transform(X_val)
        elif use_lda:
            method = "LDA"
            # min(None, int) raises TypeError, so only cap an explicit request.
            if n_components is None:
                lda_components = None
            else:
                lda_components = min(n_components, len(np.unique(y)) - 1)
            lda = LDA(n_components=lda_components)
            X_train = lda.fit_transform(X_train, y_train)
            X_val = lda.transform(X_val)

        # 3. Prepend the bias column last, after all feature processing.
        X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
        X_val = np.hstack([np.ones((X_val.shape[0], 1)), X_val])

        # The trainer expects one sample per column.
        X_train_T = X_train.T
        X_val_T = X_val.T
        d = X_train_T.shape[0]  # includes the bias row

        # Random initial weights (drawn from the global NumPy RNG, which is not seeded here).
        C = len(np.unique(y))
        W_init = np.random.randn(d, C)

        # Train and keep only the final weights; `lam` is not forwarded.
        W = softmax_regression(X_train_T, y_train, W_init)[-1]

        # Evaluate on the validation split.
        y_pred = pred(W, X_val_T)
        acc, cr, cm = evaluate_model(y_val, y_pred)

        # Evaluate on the training split, to compare against validation for overfitting.
        y_train_pred = pred(W, X_train_T)
        train_acc, train_cr, train_cm = evaluate_model(y_train, y_train_pred)

        ratio_str = ratio_labels.get(train_ratio, f"{train_ratio*100:.0f}:{(1-train_ratio)*100:.0f}")

        print("\nMô hình Softmax Regression với dữ liệu", method + ":")
        print(f"Tỷ lệ {ratio_str}")
        print(f"Accuracy (Train): {train_acc:.4f}")
        print(f"Accuracy (Test): {acc:.4f}")
        print("Classification Report (Train):")
        print(train_cr)
        print("Classification Report (Test):")
        print(cr)

        results.append({
            'method': method,
            'train_ratio': ratio_str,
            'train_accuracy': train_acc,
            'test_accuracy': acc,
            'train_report': train_cr,
            'test_report': cr,
            'train_confusion_matrix': train_cm,
            'test_confusion_matrix': cm
        })

    return results


#### **Thực hiện trên tập dữ liệu giảm chiều bằng PCA**

In [267]:
# PCA down to 12 components, 80% of the data used for training.
results = softmax_pipeline(X_scaled, y, train_ratios=[0.8], use_pca=True, n_components=12)


Mô hình Softmax Regression với dữ liệu PCA:
Tỷ lệ 4:1
Accuracy (Train): 0.8800
Accuracy (Test): 0.8756
Classification Report (Train):
              precision    recall  f1-score   support

         0.0       0.92      0.95      0.94      1323
         1.0       0.61      0.56      0.58       236
         2.0       0.88      0.74      0.80       141

    accuracy                           0.88      1700
   macro avg       0.81      0.75      0.77      1700
weighted avg       0.88      0.88      0.88      1700

Classification Report (Test):
              precision    recall  f1-score   support

         0.0       0.93      0.95      0.94       332
         1.0       0.60      0.61      0.61        59
         2.0       0.82      0.66      0.73        35

    accuracy                           0.88       426
   macro avg       0.78      0.74      0.76       426
weighted avg       0.87      0.88      0.87       426



In [314]:
# PCA down to 12 components, 70% of the data used for training.
results = softmax_pipeline(X_scaled, y, train_ratios=[0.7], use_pca=True, n_components=12)


Mô hình Softmax Regression với dữ liệu PCA:
Tỷ lệ 7:3
Accuracy (Train): 0.8730
Accuracy (Test): 0.8589
Classification Report (Train):
              precision    recall  f1-score   support

         0.0       0.93      0.96      0.94      1158
         1.0       0.69      0.41      0.52       207
         2.0       0.62      0.82      0.71       123

    accuracy                           0.87      1488
   macro avg       0.75      0.73      0.72      1488
weighted avg       0.87      0.87      0.86      1488

Classification Report (Test):
              precision    recall  f1-score   support

         0.0       0.91      0.95      0.93       497
         1.0       0.57      0.41      0.48        88
         2.0       0.68      0.77      0.73        53

    accuracy                           0.86       638
   macro avg       0.72      0.71      0.71       638
weighted avg       0.85      0.86      0.85       638



In [375]:
# PCA down to 12 components, 60% of the data used for training.
results = softmax_pipeline(X_scaled, y, train_ratios=[0.6], use_pca=True, n_components=12)


Mô hình Softmax Regression với dữ liệu PCA:
Tỷ lệ 6:4
Accuracy (Train): 0.8831
Accuracy (Test): 0.8754
Classification Report (Train):
              precision    recall  f1-score   support

         0.0       0.92      0.96      0.94       992
         1.0       0.70      0.52      0.60       177
         2.0       0.74      0.77      0.76       106

    accuracy                           0.88      1275
   macro avg       0.79      0.75      0.76      1275
weighted avg       0.88      0.88      0.88      1275

Classification Report (Test):
              precision    recall  f1-score   support

         0.0       0.92      0.95      0.94       663
         1.0       0.61      0.50      0.55       118
         2.0       0.78      0.76      0.77        70

    accuracy                           0.88       851
   macro avg       0.77      0.74      0.75       851
weighted avg       0.87      0.88      0.87       851



#### **Thực hiện với dữ liệu đã giảm chiều bằng LDA**

In [381]:
# LDA with 2 components requested, 80% of the data used for training.
results = softmax_pipeline(X_scaled, y, train_ratios=[0.8], use_lda=True, n_components=2)


Mô hình Softmax Regression với dữ liệu LDA:
Tỷ lệ 4:1
Accuracy (Train): 0.8859
Accuracy (Test): 0.8873
Classification Report (Train):
              precision    recall  f1-score   support

         0.0       0.94      0.94      0.94      1323
         1.0       0.63      0.70      0.66       236
         2.0       0.85      0.68      0.76       141

    accuracy                           0.89      1700
   macro avg       0.81      0.77      0.79      1700
weighted avg       0.89      0.89      0.89      1700

Classification Report (Test):
              precision    recall  f1-score   support

         0.0       0.95      0.95      0.95       332
         1.0       0.63      0.69      0.66        59
         2.0       0.75      0.60      0.67        35

    accuracy                           0.89       426
   macro avg       0.78      0.75      0.76       426
weighted avg       0.89      0.89      0.89       426



In [391]:
# LDA with 2 components requested, 70% of the data used for training.
results = softmax_pipeline(X_scaled, y, train_ratios=[0.7], use_lda=True, n_components=2)


Mô hình Softmax Regression với dữ liệu LDA:
Tỷ lệ 7:3
Accuracy (Train): 0.8871
Accuracy (Test): 0.8824
Classification Report (Train):
              precision    recall  f1-score   support

         0.0       0.94      0.94      0.94      1158
         1.0       0.63      0.72      0.67       207
         2.0       0.85      0.68      0.76       123

    accuracy                           0.89      1488
   macro avg       0.81      0.78      0.79      1488
weighted avg       0.89      0.89      0.89      1488

Classification Report (Test):
              precision    recall  f1-score   support

         0.0       0.95      0.94      0.94       497
         1.0       0.60      0.74      0.66        88
         2.0       0.82      0.62      0.71        53

    accuracy                           0.88       638
   macro avg       0.79      0.77      0.77       638
weighted avg       0.89      0.88      0.88       638



In [411]:
# LDA with 2 components requested, 60% of the data used for training.
results = softmax_pipeline(X_scaled, y, train_ratios=[0.6], use_lda=True, n_components=2)


Mô hình Softmax Regression với dữ liệu LDA:
Tỷ lệ 6:4
Accuracy (Train): 0.8863
Accuracy (Test): 0.8837
Classification Report (Train):
              precision    recall  f1-score   support

         0.0       0.92      0.97      0.94       992
         1.0       0.67      0.59      0.63       177
         2.0       0.90      0.62      0.74       106

    accuracy                           0.89      1275
   macro avg       0.83      0.73      0.77      1275
weighted avg       0.88      0.89      0.88      1275

Classification Report (Test):
              precision    recall  f1-score   support

         0.0       0.92      0.97      0.94       663
         1.0       0.64      0.58      0.60       118
         2.0       0.88      0.60      0.71        70

    accuracy                           0.88       851
   macro avg       0.81      0.71      0.75       851
weighted avg       0.88      0.88      0.88       851

