In [3]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import BorderlineSMOTE 

In [4]:
df = pd.read_csv(r"D:\BigData And DataMining\Đồ án\TH4\Myocardial infarction complications Database.csv")

In [5]:
# 2. Xóa cột có >90% giá trị thiếu
missing_ratio = df.isnull().mean()
cols_to_drop = missing_ratio[missing_ratio > 0.9].index
df.drop(columns=cols_to_drop, inplace=True)

In [6]:
# 3. MICE cho các cột <30% thiếu
cols_mice = missing_ratio[(missing_ratio > 0) & (missing_ratio <= 0.3)].index
mice_imputer = IterativeImputer(random_state=0)
df[cols_mice] = mice_imputer.fit_transform(df[cols_mice])



In [7]:
# 4. mean/mode cho các cột có 30–90% thiếu
cols_mean_mode = missing_ratio[(missing_ratio > 0.3) & (missing_ratio <= 0.9)].index
for col in cols_mean_mode:
    if df[col].dtype in [np.float64, np.float32]:
        df[col].fillna(df[col].mean(), inplace=True)
    else:
        df[col].fillna(df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


In [8]:

# 5. Chuẩn hóa đặc trưng
target = 'STENOK_AN'
X = df.drop(columns=['ID', 'STENOK_AN'])
y = df['STENOK_AN'].astype(int)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [9]:
# 6. Cân bằng bằng Borderline-SMOTE
sm = BorderlineSMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_scaled, y)


In [10]:
# 7. Xuất dữ liệu đã xử lý (nếu cần)
X_final = pd.DataFrame(X_resampled, columns=X.columns)
y_final = pd.Series(y_resampled, name=target)

In [11]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import (
    AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier,
    RandomForestClassifier, VotingClassifier, StackingClassifier
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import warnings

In [12]:
# Chia dữ liệu
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.2, random_state=42)

In [13]:
models = {
    "AdaBoost": AdaBoostClassifier(),
    "Bagging": BaggingClassifier(),
    "GDBT": GradientBoostingClassifier(),
    "KNN": KNeighborsClassifier(),
    "LGB": LGBMClassifier(),
    "QDA": QuadraticDiscriminantAnalysis(),
    "RF": RandomForestClassifier(),
    "SVM": SVC(probability=True),
    "Tree": DecisionTreeClassifier(),
    "XGB": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
}

# Thêm Stacking và Voting riêng
stacking_model = StackingClassifier(
    estimators=[
        ('rf', RandomForestClassifier()),
        ('knn', KNeighborsClassifier()),
        ('svc', SVC(probability=True))
    ],
    final_estimator=LogisticRegression()
)

voting_model = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier()),
        ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
        ('svc', SVC(probability=True))
    ],
    voting='soft'
)

models["Stacking"] = stacking_model
models["Voting"] = voting_model

# Đánh giá từng mô hình
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    acc = accuracy_score(y_test, model.predict(X_test))
    cv_score = cross_val_score(model, X_train, y_train, cv=5).mean()
    results.append((name, acc, cv_score))



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002257 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20115
[LightGBM] [Info] Number of data points in the train set: 3707, number of used features: 120
[LightGBM] [Info] Start training from score -1.981609
[LightGBM] [Info] Start training from score -1.956487
[LightGBM] [Info] Start training from score -1.950778
[LightGBM] [Info] Start training from score -1.943216
[LightGBM] [Info] Start training from score -1.915359
[LightGBM] [Info] Start training from score -1.930120
[LightGBM] [Info] Start training from score -1.945101
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001926 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19170
[LightGBM] [Info] Number of data points in the train set: 2965, number of used features: 120
[LightGBM] [Info] Start training from scor

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.
