In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.calibration import CalibratedClassifierCV  # SVC không có predict_proba
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import os

In [4]:
# Đọc dữ liệu
df = pd.read_csv('../data/Flocking.csv')

# Loại bỏ các hàng có cột chứa khoảng trắng hoặc chuỗi rỗng
df_cleaned = df[~df.apply(lambda row: row.astype(str).str.fullmatch(r'\s*').any(), axis=1)]

# Tách train (60%) và phần còn lại (40%)
train_df, temp_df = train_test_split(df_cleaned, test_size=0.4, random_state=42)

# Tách phần còn lại thành val (20%) và test (20%)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
# Giả sử cột cuối cùng là label
X_train = train_df.iloc[:, :-1].values
y_train = train_df.iloc[:, -1].values

X_valid = val_df.iloc[:, :-1].values
y_valid = val_df.iloc[:, -1].values


  df = pd.read_csv('../data/Flocking.csv')


In [6]:
# Kiểm tra kích thước
print(f"Train size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")
print(f"Test size: {len(test_df)}")
print(X_train.shape)
print(y_train.shape)
print(X_valid.shape)
print(y_valid.shape)

Train size: 14409
Validation size: 4803
Test size: 4803
(14409, 2400)
(14409,)
(4803, 2400)
(4803,)


In [7]:
# === Evaluation function ===
def evaluate_model(name, y_true, y_pred, y_proba):
    return {
        'Model': name,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred),
        'ROC-AUC': roc_auc_score(y_true, y_proba)
    }

In [8]:
results = []
C_values = [0.01, 0.1, 1, 10, 100]
max_iter_values = [500, 1000, 5000, 10000]

best_auc = 0
best_params_svc = {}

# === LinearSVC với CalibratedClassifierCV ===
for C in C_values:
    for max_iter in max_iter_values:
        base_model = LinearSVC(C=C, max_iter=max_iter, random_state=42)
        model = CalibratedClassifierCV(base_model)

        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('clf', model)
        ])

        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_valid)
        y_proba = pipeline.predict_proba(X_valid)[:, 1]
        auc = roc_auc_score(y_valid, y_proba)

        if auc > best_auc:
            best_auc = auc
            best_params_svc = {'C': C, 'max_iter': max_iter}
            best_svc_pipeline = pipeline
            best_y_pred_svc = y_pred
            best_y_proba_svc = y_proba

print("\nBest LinearSVC parameters by ROC-AUC:")
print(f"C = {best_params_svc['C']}, max_iter = {best_params_svc['max_iter']}, ROC-AUC = {best_auc:.4f}")
results.append(evaluate_model(
    f"LinearSVC (C={best_params_svc['C']}, max_iter={best_params_svc['max_iter']})",
    y_valid, best_y_pred_svc, best_y_proba_svc
))


Best LinearSVC parameters by ROC-AUC:
C = 0.01, max_iter = 500, ROC-AUC = 1.0000


In [10]:
# === Decision Tree ===
max_depth_values = [3, 5, 10, 15, 20, None]
min_samples_split_values = [2, 5, 10]

best_auc_tree = 0
best_params_tree = {}

for max_depth in max_depth_values:
    for min_samples_split in min_samples_split_values:
        model = DecisionTreeClassifier(max_depth=max_depth,
                                       min_samples_split=min_samples_split,
                                       random_state=42)
        pipeline = Pipeline([
            ('scaler', StandardScaler()),  # Có thể bỏ với cây, giữ để đồng nhất
            ('clf', model)
        ])

        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_valid)
        y_proba = pipeline.named_steps['clf'].predict_proba(StandardScaler().fit_transform(X_valid))[:, 1]
        auc = roc_auc_score(y_valid, y_proba)

        if auc > best_auc_tree:
            best_auc_tree = auc
            best_params_tree = {
                'max_depth': max_depth,
                'min_samples_split': min_samples_split
            }
            best_y_pred_tree = y_pred
            best_y_proba_tree = y_proba
            best_tree_model = pipeline

print("\nBest DecisionTree parameters by ROC-AUC:")
print(f"max_depth = {best_params_tree['max_depth']}, min_samples_split = {best_params_tree['min_samples_split']}, ROC-AUC = {best_auc_tree:.4f}")
results.append(evaluate_model(
    f"Decision Tree (max_depth={best_params_tree['max_depth']}, min_split={best_params_tree['min_samples_split']})",
    y_valid, best_y_pred_tree, best_y_proba_tree
))


Best DecisionTree parameters by ROC-AUC:
max_depth = 15, min_samples_split = 2, ROC-AUC = 0.9911


In [11]:
# Chuẩn bị dữ liệu test
X_test = test_df.iloc[:, :-1].values
y_test = test_df.iloc[:, -1].values

# === Đánh giá mô hình LinearSVC tốt nhất ===
y_pred_test_svc = best_svc_pipeline.predict(X_test)
y_proba_test_svc = best_svc_pipeline.predict_proba(X_test)[:, 1]

print(" Đánh giá LinearSVC trên test set:")
print(classification_report(y_test, y_pred_test_svc))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test_svc))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_test_svc))

# === Đánh giá mô hình DecisionTree tốt nhất ===
y_pred_test_tree = best_tree_model.predict(X_test)
y_proba_test_tree = best_tree_model.predict_proba(X_test)[:, 1]

print("\n Đánh giá DecisionTree trên test set:")
print(classification_report(y_test, y_pred_test_tree))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test_tree))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_test_tree))

 Đánh giá LinearSVC trên test set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2413
           1       1.00      1.00      1.00      2390

    accuracy                           1.00      4803
   macro avg       1.00      1.00      1.00      4803
weighted avg       1.00      1.00      1.00      4803

Confusion Matrix:
 [[2413    0]
 [   1 2389]]
ROC-AUC: 1.0

 Đánh giá DecisionTree trên test set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2413
           1       1.00      1.00      1.00      2390

    accuracy                           1.00      4803
   macro avg       1.00      1.00      1.00      4803
weighted avg       1.00      1.00      1.00      4803

Confusion Matrix:
 [[2412    1]
 [  10 2380]]
ROC-AUC: 0.9977007388500573


In [12]:

# --- Bagging cho LinearSVC ---
bagging_svc = BaggingClassifier(
    estimator=best_svc_pipeline.named_steps['clf'], 
    n_estimators=10,
    max_samples=0.8,
    random_state=42
)

bagging_svc.fit(X_train, y_train)
y_pred_test_bagging_svc = bagging_svc.predict(X_test)
y_proba_test_bagging_svc = bagging_svc.predict_proba(X_test)[:, 1]

print("\n Đánh giá Bagging LinearSVC trên test set:")
print(classification_report(y_test, y_pred_test_bagging_svc))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test_bagging_svc))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_test_bagging_svc))


# --- Bagging cho Decision Tree ---
bagging_tree = BaggingClassifier(
    estimator=best_tree_model.named_steps['clf'], 
    n_estimators=10,
    max_samples= 0.8, 
    random_state=42
)

bagging_tree.fit(X_train, y_train)
y_pred_test_bagging_tree = bagging_tree.predict(X_test)
y_proba_test_bagging_tree = bagging_tree.predict_proba(X_test)[:, 1]

print("\n Đánh giá Bagging DecisionTree trên test set:")
print(classification_report(y_test, y_pred_test_bagging_tree))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test_bagging_tree))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_test_bagging_tree))







 Đánh giá Bagging LinearSVC trên test set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2413
           1       1.00      1.00      1.00      2390

    accuracy                           1.00      4803
   macro avg       1.00      1.00      1.00      4803
weighted avg       1.00      1.00      1.00      4803

Confusion Matrix:
 [[2404    9]
 [   6 2384]]
ROC-AUC: 0.999927519520311

 Đánh giá Bagging DecisionTree trên test set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2413
           1       1.00      1.00      1.00      2390

    accuracy                           1.00      4803
   macro avg       1.00      1.00      1.00      4803
weighted avg       1.00      1.00      1.00      4803

Confusion Matrix:
 [[2413    0]
 [   3 2387]]
ROC-AUC: 1.0


In [13]:

# --- Boosting cho Decision Tree ---
boosting_tree = AdaBoostClassifier(
    estimator=best_tree_model.named_steps['clf'],  # lấy mô hình Decision Tree tốt nhất
    n_estimators=50,
    learning_rate=1.0,
    random_state=42
)

boosting_tree.fit(X_train, y_train)
y_pred_test_boosting_tree = boosting_tree.predict(X_test)
y_proba_test_boosting_tree = boosting_tree.predict_proba(X_test)[:, 1]

print("\n Đánh giá Boosting DecisionTree trên test set:")
print(classification_report(y_test, y_pred_test_boosting_tree))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test_boosting_tree))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_test_boosting_tree))



 Đánh giá Boosting DecisionTree trên test set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2413
           1       1.00      1.00      1.00      2390

    accuracy                           1.00      4803
   macro avg       1.00      1.00      1.00      4803
weighted avg       1.00      1.00      1.00      4803

Confusion Matrix:
 [[2411    2]
 [   5 2385]]
ROC-AUC: 0.9985395530139222
