# KNN PHÂN LOẠI ĐA LỚP 

In [8]:
import numpy as np
#Khoảng cách giữa hai điểm dữ liệu
def distance(array, value):
  array = np.array(array)
  return np.linalg.norm(array - value, axis=1)  # Euclidean distance

#Tìm chỉ số K điểm gần nhất 
def find_nearest_index(array, value, k):
  array_D = distance(array, value)
  return np.argsort(array_D)[:k]

#Tìm nhãn xuất hiện nhiều nhất 
def highest_rank(arr):
  count_num = {}
  for i in arr:
    count_num[i] = arr.count(i)
  return max(count_num, key=count_num.get)

#Dự đoán nhãn 
def knn_predict(X_train, Y_train, X_val, K=5):
  y_pred = np.zeros(len(X_val), dtype=int)

  for j in range(len(X_val)):
    indexes = find_nearest_index(X_train, X_val[j], K)
    y_nearest = [Y_train[i] for i in indexes]
    y_pred[j] = highest_rank(y_nearest)

  return y_pred


In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import label_binarize

#hàm tính các chỉ số đánh giá độ chính xác của mô hình 
def evaluate_model(y_true, y_pred, average='macro'):
  acc = accuracy_score(y_true, y_pred)
  prec = precision_score(y_true, y_pred, average=average, zero_division=0)
  rec = recall_score(y_true, y_pred, average=average, zero_division=0)
  f1 = f1_score(y_true, y_pred, average=average, zero_division=0)

  # Chuẩn bị cho tính AUC
  classes = sorted(np.unique(y_true))
  y_true_bin = label_binarize(y_true, classes=classes)
  y_pred_bin = label_binarize(y_pred, classes=classes)
    
  try:
    auc = roc_auc_score(y_true_bin, y_pred_bin, average=average, multi_class='ovr')
  except ValueError:
    auc = None  # Khi chỉ có 1 lớp trong tập test
    
  return {
    'Accuracy': acc,
    'Precision': prec,
    'Recall': rec,
    'F1-Score': f1,
    'AUC': auc
  }

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split

def run_knn_experiment(data_path, target_col="NSP", test_ratios=[0.2, 0.3, 0.4], Ks=[3, 5, 7, 11, 15, 20]):
  # Đọc dữ liệu
  df = pd.read_csv(data_path)
  if 'Unnamed: 0' in df.columns:
    df = df.drop(columns=['Unnamed: 0'])

  X = df.drop(target_col, axis=1).values
  y = df[target_col].values

  results = []

  for ratio in test_ratios:
    X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=ratio, stratify=y, random_state=42)
    
    y_train_list = list(y_train)
    y_test_list = list(y_test)

    for k in Ks:
      y_pred = knn_predict(X_train, y_train_list, X_test, K=k)
      metrics = evaluate_model(y_test_list, y_pred)

      results.append({
        "Test Size": f"{int(ratio * 100)}%",
        "K": k,
        "Accuracy": round(metrics["Accuracy"], 4),
        "Precision": round(metrics["Precision"], 4),
        "Recall": round(metrics["Recall"], 4),
        "F1-Score": round(metrics["F1-Score"], 4),
        "AUC": round(metrics["AUC"], 4) if metrics["AUC"] is not None else "N/A"
      })

  return pd.DataFrame(results)


## Dữ liệu gốc 

In [12]:
result_origin = run_knn_experiment("../../data/data_processed/data_processed.csv")
display(result_origin)

Unnamed: 0,Test Size,K,Accuracy,Precision,Recall,F1-Score,AUC
0,20%,3,0.9038,0.8423,0.7707,0.8024,0.8352
1,20%,5,0.9038,0.852,0.7668,0.804,0.8319
2,20%,7,0.885,0.8149,0.7231,0.7618,0.8025
3,20%,11,0.8897,0.8387,0.7251,0.7717,0.8017
4,20%,15,0.8803,0.8194,0.7118,0.7561,0.7893
5,20%,20,0.8732,0.8178,0.6786,0.7314,0.7674
6,30%,3,0.9044,0.8423,0.7894,0.8137,0.8473
7,30%,5,0.8997,0.8398,0.7762,0.8046,0.838
8,30%,7,0.8824,0.8078,0.7388,0.7692,0.8108
9,30%,11,0.8793,0.8128,0.7175,0.7571,0.7951


## Dữ liệu giảm chiều (giảm trước, chia sau)

### Giảm chiều bằng PCA 12 thành phần chính 

In [13]:
result_pca = run_knn_experiment("../../data/dimension_reduction/pca/pca_all.csv")
display(result_pca)

Unnamed: 0,Test Size,K,Accuracy,Precision,Recall,F1-Score,AUC
0,20%,3,0.885,0.797,0.7448,0.7685,0.816
1,20%,5,0.8803,0.7878,0.7157,0.7458,0.7953
2,20%,7,0.8779,0.8117,0.707,0.75,0.7877
3,20%,11,0.8732,0.7989,0.674,0.7224,0.7677
4,20%,15,0.8732,0.7989,0.674,0.7224,0.7677
5,20%,20,0.8803,0.822,0.677,0.7315,0.7705
6,30%,3,0.895,0.8345,0.7536,0.7883,0.824
7,30%,5,0.8777,0.8045,0.7013,0.7436,0.7858
8,30%,7,0.884,0.8306,0.7164,0.7623,0.7954
9,30%,11,0.873,0.8042,0.6936,0.7379,0.7793


### Giảm chiều bằng LDA (2 chiều)

In [14]:
result_lda = run_knn_experiment("../../data/dimension_reduction/lda/lda_all.csv")
display(result_lda)

Unnamed: 0,Test Size,K,Accuracy,Precision,Recall,F1-Score,AUC
0,20%,3,0.892,0.8017,0.7664,0.783,0.8348
1,20%,5,0.8779,0.7762,0.7341,0.753,0.8132
2,20%,7,0.8826,0.7915,0.7407,0.763,0.8175
3,20%,11,0.8967,0.8271,0.7653,0.7905,0.8364
4,20%,15,0.8897,0.8097,0.7538,0.7762,0.828
5,20%,20,0.8944,0.8179,0.769,0.7891,0.8378
6,30%,3,0.8887,0.8114,0.7839,0.7949,0.8451
7,30%,5,0.8777,0.799,0.7624,0.7741,0.8322
8,30%,7,0.8824,0.8069,0.7519,0.7742,0.8234
9,30%,11,0.8966,0.8376,0.7767,0.7994,0.8429


## Dữ liệu giảm chiều (chia trước giảm sau)

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split

def evaluate_knn(train_path, test_path, Ks=[3, 5, 7, 11, 15, 20]):
  # Đọc dữ liệu
  df_train = pd.read_csv(train_path)
  df_test = pd.read_csv(test_path)


  # Tách X và y
  X_train = df_train.drop(columns=['Unnamed: 0', "NSP"], axis=1).values
  y_train = df_train["NSP"].values
  X_test = df_test.drop(columns=['Unnamed: 0', "NSP"], axis=1).values
  y_test = df_test["NSP"].values

  y_train_list = list(y_train)
  y_test_list = list(y_test)
  
  # Mã hóa label để tính AUC
  classes = np.unique(np.concatenate([y_train, y_test]))
  y_test_bin = label_binarize(y_test, classes=classes)

  results = []

  for k in Ks:
    y_pred = knn_predict(X_train, y_train_list, X_test, K=k)
    metrics = evaluate_model(y_test_list, y_pred)

    results.append({
      "K": k,
      "Accuracy": round(metrics["Accuracy"], 4),
      "Precision": round(metrics["Precision"], 4),
      "Recall": round(metrics["Recall"], 4),
      "F1-Score": round(metrics["F1-Score"], 4),
      "AUC": round(metrics["AUC"], 4) if metrics["AUC"] is not None else "N/A"
    })

  return pd.DataFrame(results)


### PCA 12 thành phần chính 

In [16]:
pca_results_80 = evaluate_knn(
    "../../data/dimension_reduction/pca/train_80.csv",
    "../../data/dimension_reduction/pca/test_20.csv"
)

print("Kết quả PCA 80:20")
display(pca_results_80)

pca_results_70 = evaluate_knn(
    "../../data/dimension_reduction/pca/train_70.csv",
    "../../data/dimension_reduction/pca/test_30.csv"
)

print("Kết quả PCA 70:30")
display(pca_results_70)

pca_results_60 = evaluate_knn(
    "../../data/dimension_reduction/pca/train_70.csv",
    "../../data/dimension_reduction/pca/test_30.csv"
)

print("Kết quả PCA 60:40")
display(pca_results_60)


Kết quả PCA 80:20


Unnamed: 0,K,Accuracy,Precision,Recall,F1-Score,AUC
0,3,0.885,0.797,0.7448,0.7685,0.816
1,5,0.8803,0.7878,0.7157,0.7458,0.7953
2,7,0.8779,0.8131,0.7023,0.7474,0.7841
3,11,0.8732,0.7989,0.674,0.7224,0.7677
4,15,0.8709,0.7943,0.673,0.7202,0.7667
5,20,0.8803,0.822,0.677,0.7315,0.7705


Kết quả PCA 70:30


Unnamed: 0,K,Accuracy,Precision,Recall,F1-Score,AUC
0,3,0.895,0.8313,0.7567,0.7892,0.8256
1,5,0.8793,0.8075,0.7019,0.7452,0.7864
2,7,0.8824,0.8261,0.7126,0.7582,0.7932
3,11,0.8746,0.8065,0.6974,0.7412,0.7824
4,15,0.8683,0.797,0.686,0.7298,0.7737
5,20,0.8793,0.8235,0.6907,0.7419,0.7781


Kết quả PCA 60:40


Unnamed: 0,K,Accuracy,Precision,Recall,F1-Score,AUC
0,3,0.895,0.8313,0.7567,0.7892,0.8256
1,5,0.8793,0.8075,0.7019,0.7452,0.7864
2,7,0.8824,0.8261,0.7126,0.7582,0.7932
3,11,0.8746,0.8065,0.6974,0.7412,0.7824
4,15,0.8683,0.797,0.686,0.7298,0.7737
5,20,0.8793,0.8235,0.6907,0.7419,0.7781


### LDA (2 chiều)

In [17]:
lda_results_80 = evaluate_knn(
    "../../data/dimension_reduction/lda/train_80.csv",
    "../../data/dimension_reduction/lda/test_20.csv"
)

print("Kết quả LDA 80:20")
display(lda_results_80)

lda_results_70 = evaluate_knn(
    "../../data/dimension_reduction/lda/train_70.csv",
    "../../data/dimension_reduction/lda/test_30.csv"
)

print("Kết quả LDA 70:30")
display(lda_results_70)

lda_results_60 = evaluate_knn(
    "../../data/dimension_reduction/lda/train_60.csv",
    "../../data/dimension_reduction/lda/test_40.csv"
)

print("Kết quả LDA 60:40")
display(lda_results_60)

Kết quả LDA 80:20


Unnamed: 0,K,Accuracy,Precision,Recall,F1-Score,AUC
0,3,0.8756,0.7571,0.747,0.7515,0.8233
1,5,0.8685,0.7598,0.7207,0.7387,0.8008
2,7,0.8756,0.7753,0.7331,0.752,0.811
3,11,0.885,0.798,0.7642,0.7787,0.8323
4,15,0.8944,0.8171,0.7736,0.7908,0.8414
5,20,0.8897,0.8091,0.7623,0.7813,0.8335


Kết quả LDA 70:30


Unnamed: 0,K,Accuracy,Precision,Recall,F1-Score,AUC
0,3,0.8809,0.7974,0.7631,0.7772,0.8296
1,5,0.8746,0.7828,0.7336,0.7549,0.8093
2,7,0.8793,0.8067,0.7394,0.7662,0.813
3,11,0.8856,0.8172,0.7657,0.7847,0.8336
4,15,0.8856,0.8178,0.7601,0.7814,0.8299
5,20,0.8824,0.8112,0.7563,0.7744,0.8291


Kết quả LDA 60:40


Unnamed: 0,K,Accuracy,Precision,Recall,F1-Score,AUC
0,3,0.8801,0.7909,0.7505,0.7685,0.8219
1,5,0.8895,0.8063,0.7576,0.7783,0.8299
2,7,0.8872,0.8237,0.7501,0.7797,0.8216
3,11,0.886,0.8325,0.7472,0.7806,0.8186
4,15,0.8872,0.8268,0.759,0.7853,0.828
5,20,0.8907,0.8264,0.7694,0.7918,0.8359
