In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import  roc_curve, auc
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, make_scorer
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Load the training split from the "Train_mean" sheet of the merged HCC workbook.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs elsewhere.
train_workbook = r"D:\2024.2\HCC\Classification_HCC\Code_HCC\Final_dataset\Merged_HCC (Train_Test) (version 1).xlsx"
df_train = pd.read_excel(train_workbook, sheet_name="Train_mean")

# Descriptive statistics of the training frame (rendered as the cell output).
df_train.describe()



In [None]:
# Load the held-out split from the "Test_mean" sheet of the merged HCC workbook.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs elsewhere.
test_workbook = r"D:\2024.2\HCC\Classification_HCC\Code_HCC\Final_dataset\Merged_HCC (Train_Test) (version 1).xlsx"
df_test = pd.read_excel(test_workbook, sheet_name="Test_mean")

# Descriptive statistics of the test frame (rendered as the cell output).
df_test.describe()

In [None]:
# Display the column labels of the test frame.
df_test.columns

In [None]:
columns = []

# Full set: 16 features plus the target column 'Label_HCC'.
selected_columns = [
    "Age", "Gender", "Leucocytes", "Platelets", "INR", "AST", "ALT",
    "Total_Bil", "Dir_Bil", "Albumin", "Creatinine", "HBsAg", "HCVAb", "AFP",
    "AST_ALT_ratio", "Obesity", "Label_HCC",
]

# IG: 14 features
selected_columns_infor = [
    "AFP", "INR", "AST", "Platelets", "Total_Bil", "Albumin", "HBsAg",
    "Dir_Bil", "Age", "Obesity", "Leucocytes", "ALT", "HCVAb", "AST_ALT_ratio",
]

# Corr: 10 features
selected_columns_corr = [
    "INR", "Albumin", "HBsAg", "Age", "Platelets",
    "Obesity", "AST_ALT_ratio", "AFP", "AST", "Gender",
]

# RFE: 10 features
selected_columns_rfe = [
    "Age", "Leucocytes", "Platelets", "INR", "AST",
    "ALT", "Total_Bil", "Albumin", "HBsAg", "AFP",
]

# RF: 10 features
selected_columns_rf = [
    "AFP", "INR", "Platelets", "AST", "Albumin",
    "Age", "HBsAg", "Total_Bil", "Leucocytes", "ALT",
]

# NCA: 10 features
selected_columns_NCA = [
    "INR", "AFP", "AST", "Platelets", "Albumin",
    "Age", "AST_ALT_ratio", "HBsAg", "Gender", "Total_Bil",
]

# LR: 12 features
selected_columns_lasso = [
    "Age", "Gender", "Platelets", "INR", "AST", "ALT",
    "Total_Bil", "Albumin", "HBsAg", "HCVAb", "AFP", "Obesity",
]

# XGB: 14 features
selected_columns_xgb = [
    "AFP", "INR", "AST", "Platelets", "Total_Bil", "Albumin", "HBsAg",
    "Dir_Bil", "Age", "Creatinine", "Leucocytes", "ALT", "Gender", "AST_ALT_ratio",
]

# Chi: 10 features
selected_columns_chi = [
    "Age", "Leucocytes", "Platelets", "AST", "Total_Bil",
    "Dir_Bil", "Albumin", "Creatinine", "HBsAg", "AFP",
]

# Work on copies so the loaded frames stay untouched.
data_train, data_test = df_train.copy(), df_test.copy()

# The Chi feature subset is the one used for the experiments below;
# 'Label_HCC' is the target in both splits.
X_train, y_train = data_train[selected_columns_chi], data_train["Label_HCC"]
X_test, y_test = data_test[selected_columns_chi], data_test["Label_HCC"]

In [None]:
# Display the feature columns selected for training.
X_train.columns

1. Model training and prediction

In [None]:

# Random-forest pipeline that the grid search tunes.
rf_base = RandomForestClassifier(random_state=42, n_jobs=-1)
pipe = Pipeline([
    # ('smote', SMOTE(random_state=42)),  # oversampling step, currently disabled
    ('classify', rf_base),
])

# Shuffled, stratified 10-fold splitter with a fixed seed.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Hyper-parameter grid; keys are prefixed with the pipeline step name.
param_test_rf = {
    'classify__n_estimators': [50, 100, 150, 200, 250],
    'classify__criterion': ['gini', 'entropy'],
    'classify__max_features': ['sqrt', 'log2'],
    'classify__max_depth': [1, 3, 5, 7, 9],
}

# Exhaustive search over the grid, scored by accuracy.
gsearch_rf = GridSearchCV(pipe, param_test_rf, scoring='accuracy', cv=cv, n_jobs=-1)
gsearch_rf.fit(X_train, y_train)

print("Best parameters:", gsearch_rf.best_params_)
print("Best CV accuracy score:", gsearch_rf.best_score_)


In [None]:
best_params_rf = gsearch_rf.best_params_

# Rebuild the random forest from the winning grid-search configuration.
tuned_rf = RandomForestClassifier(
    n_estimators=best_params_rf['classify__n_estimators'],
    criterion=best_params_rf['classify__criterion'],
    max_features=best_params_rf['classify__max_features'],
    max_depth=best_params_rf['classify__max_depth'],
    random_state=42,
    n_jobs=-1,
)
final_model_rf = Pipeline([
    # ('smote', SMOTE(random_state=42)),  # oversampling step, currently disabled
    ('classify', tuned_rf),
])

# Metrics reported for every model in this notebook.
scoring = dict(
    accuracy=make_scorer(accuracy_score),
    precision=make_scorer(precision_score),
    recall=make_scorer(recall_score),
    f1=make_scorer(f1_score),
)

cv_results = cross_validate(
    final_model_rf, X_train, y_train,
    cv=cv, scoring=scoring, n_jobs=-1,
)

# One line per metric: mean ± standard deviation across the folds.
for metric in scoring:
    fold_scores = cv_results[f'test_{metric}']
    print(f"{metric.capitalize():<10}: {np.mean(fold_scores):.2f} ± {np.std(fold_scores):.2f}")

2. XGB

In [None]:
# XGBoost pipeline that the grid search tunes.
xgb_base = XGBClassifier(random_state=42, n_jobs=-1, eval_metric='logloss')
pipe_xgb = Pipeline([
    # ('smote', SMOTE(random_state=42)),  # oversampling step, currently disabled
    ('classify', xgb_base),
])

# Shuffled, stratified 10-fold splitter with a fixed seed.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Hyper-parameter grid; keys are prefixed with the pipeline step name.
param_test_xgb = {
    'classify__max_depth': list(range(1, 8)),
    'classify__n_estimators': [50, 100, 150, 200, 250],
    'classify__min_child_weight': [1, 3, 5, 7, 9],
    'classify__learning_rate': [0.01, 0.03, 0.05, 0.07, 0.09, 0.1],
}

# Exhaustive search over the grid, scored by accuracy.
gsearch_xgb = GridSearchCV(pipe_xgb, param_test_xgb, scoring='accuracy', cv=cv, n_jobs=-1)
gsearch_xgb.fit(X_train, y_train)

print("Best parameters:", gsearch_xgb.best_params_)
print("Best CV score:", gsearch_xgb.best_score_)



In [None]:
best_params_xgb = gsearch_xgb.best_params_

# Rebuild XGBoost from the winning grid-search configuration.
# The fixed arguments mirror the estimator that was grid-searched (`pipe_xgb`),
# so the model evaluated here is the one that was tuned. The deprecated
# `use_label_encoder` flag is no longer passed: the searched estimator never
# set it, and recent XGBoost releases report it as an unused parameter.
final_model_xgb = Pipeline([
    # ('smote', SMOTE(random_state=42)),  # oversampling step, currently disabled
    ('classify', XGBClassifier(
        random_state=42,
        n_jobs=-1,
        eval_metric='logloss',
        max_depth=best_params_xgb['classify__max_depth'],
        n_estimators=best_params_xgb['classify__n_estimators'],
        min_child_weight=best_params_xgb['classify__min_child_weight'],
        learning_rate=best_params_xgb['classify__learning_rate']
    ))
])

cv_results = cross_validate(final_model_xgb, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)

# One line per metric: mean ± standard deviation across the folds.
print("\n Kết quả XGBoost (Cross-Validation 10-Fold):")
for metric in scoring.keys():
    mean = np.mean(cv_results[f'test_{metric}'])
    std = np.std(cv_results[f'test_{metric}'])
    print(f"{metric.capitalize():<10}: {mean:.2f} ± {std:.2f}")

In [None]:
# Display the tuned random-forest pipeline.
final_model_rf

In [None]:
# Display the tuned XGBoost pipeline.
final_model_xgb

3. RF + XGB (Soft Voting)

In [None]:
# Equal-weight soft-voting ensemble of the tuned RF and XGBoost pipelines.
rf_xgb_members = [
    ('random_forest', final_model_rf),
    ('xgboost', final_model_xgb),
]
voting_model_1 = VotingClassifier(estimators=rf_xgb_members, voting='soft', n_jobs=-1)

cv_results = cross_validate(
    voting_model_1, X_train, y_train,
    cv=cv, scoring=scoring, n_jobs=-1, return_train_score=False,
)

# One line per metric: mean ± standard deviation across the folds.
print("\n Kết quả Voting Ensemble (Cross-Validation 10-Fold):")
for metric in scoring:
    fold_scores = cv_results[f'test_{metric}']
    print(f"{metric.capitalize():<10}: {np.mean(fold_scores):.2f} ± {np.std(fold_scores):.2f}")

4. RF + XGB (80-20)

In [None]:
# Soft-voting ensemble weighted 0.8 towards RF and 0.2 towards XGBoost.
rf_heavy_members = [
    ('random_forest', final_model_rf),
    ('xgboost', final_model_xgb),
]
voting_model_2 = VotingClassifier(
    estimators=rf_heavy_members,
    voting='soft',
    weights=[0.8, 0.2],
    n_jobs=-1,
)

cv_results = cross_validate(
    voting_model_2, X_train, y_train,
    cv=cv, scoring=scoring, n_jobs=-1, return_train_score=False,
)

# One line per metric: mean ± standard deviation across the folds.
print("\n Kết quả Voting Ensemble (Cross-Validation 10-Fold):")
for metric in scoring:
    fold_scores = cv_results[f'test_{metric}']
    print(f"{metric.capitalize():<10}: {np.mean(fold_scores):.2f} ± {np.std(fold_scores):.2f}")

5. RF + XGB (20-80)

In [None]:
# Soft-voting ensemble weighted 0.2 towards RF and 0.8 towards XGBoost.
xgb_heavy_members = [
    ('random_forest', final_model_rf),
    ('xgboost', final_model_xgb),
]
voting_model_3 = VotingClassifier(
    estimators=xgb_heavy_members,
    voting='soft',
    weights=[0.2, 0.8],
    n_jobs=-1,
)

cv_results = cross_validate(
    voting_model_3, X_train, y_train,
    cv=cv, scoring=scoring, n_jobs=-1, return_train_score=False,
)

# One line per metric: mean ± standard deviation across the folds.
print("\n Kết quả Voting Ensemble (Cross-Validation 10-Fold):")
for metric in scoring:
    fold_scores = cv_results[f'test_{metric}']
    print(f"{metric.capitalize():<10}: {np.mean(fold_scores):.2f} ± {np.std(fold_scores):.2f}")

6. LightGBM

In [None]:
# LightGBM pipeline that the grid search tunes.
# NOTE(review): scale_pos_weight=3.0 is a hard-coded constant — confirm it
# matches the class ratio of the training data.
lgbm_base = LGBMClassifier(
    objective='binary',
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=3.0,
)
pipe_lgbm = Pipeline([
    # ('smote', SMOTE(random_state=42)),  # oversampling step, currently disabled
    ('classify', lgbm_base),
])

# Hyper-parameter grid; keys are prefixed with the pipeline step name.
param_test_lgbm = {
    'classify__max_depth': [5, 7, 9],
    'classify__n_estimators': list(range(50, 201, 50)),
    'classify__learning_rate': [0.01, 0.05, 0.1],
    'classify__min_split_gain': [0, 0.01, 0.1],
}

# Exhaustive search over the grid, scored by accuracy, using the shared `cv` splitter.
gsearch_lgbm = GridSearchCV(
    pipe_lgbm, param_test_lgbm,
    scoring='accuracy', cv=cv, n_jobs=-1, verbose=2,
)
gsearch_lgbm.fit(X_train, y_train)

print("\nBest parameters:", gsearch_lgbm.best_params_)
print("Best CV accuracy:", gsearch_lgbm.best_score_)


In [None]:
best_params_lgbm = gsearch_lgbm.best_params_

# Rebuild LightGBM from the winning grid-search configuration.
tuned_lgbm = LGBMClassifier(
    objective='binary',
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=3.0,
    max_depth=best_params_lgbm['classify__max_depth'],
    n_estimators=best_params_lgbm['classify__n_estimators'],
    learning_rate=best_params_lgbm['classify__learning_rate'],
    min_split_gain=best_params_lgbm['classify__min_split_gain'],
)
final_model_lgbm = Pipeline([
    # ('smote', SMOTE(random_state=42)),  # oversampling step, currently disabled
    ('classify', tuned_lgbm),
])

cv_results = cross_validate(
    final_model_lgbm, X_train, y_train,
    cv=cv, scoring=scoring, n_jobs=-1,
)

# One line per metric: mean ± standard deviation across the folds.
print("\n Kết quả LightGBM (10-Fold Cross-Validation):")
for metric in scoring:
    fold_scores = cv_results[f'test_{metric}']
    print(f"{metric.capitalize():<10}: {np.mean(fold_scores):.2f} ± {np.std(fold_scores):.2f}")

7. Adaboost

In [None]:
# AdaBoost pipeline that the grid search tunes.
pipe_ada = Pipeline([
    # ('smote', SMOTE(random_state=42)),  # oversampling step, currently disabled
    ('classify', AdaBoostClassifier(random_state=42))
])

# Hyper-parameter grid; keys are prefixed with the pipeline step name.
# NOTE(review): recent scikit-learn releases deprecate the 'SAMME.R' option —
# confirm it is still accepted by the installed version.
param_grid_ada = {
    'classify__n_estimators': [50, 100, 150, 200, 250, 300],
    'classify__learning_rate': [0.01, 0.1, 0.5, 0.9, 1.0],
    'classify__algorithm': ['SAMME', 'SAMME.R'],
}

# Exhaustive search over the grid, scored by accuracy, using the shared `cv` splitter.
gsearch_ada = GridSearchCV(
    estimator=pipe_ada,
    param_grid=param_grid_ada,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=2
)

gsearch_ada.fit(X_train, y_train)

print("\nBest params:", gsearch_ada.best_params_)
# The search optimises accuracy, so `best_score_` is an accuracy value;
# the label previously said "F1-score".
print("Best CV accuracy:", gsearch_ada.best_score_)

In [None]:
best_params_ada = gsearch_ada.best_params_

# Rebuild AdaBoost from the winning grid-search configuration.
tuned_ada = AdaBoostClassifier(
    random_state=42,
    n_estimators=best_params_ada['classify__n_estimators'],
    learning_rate=best_params_ada['classify__learning_rate'],
    algorithm=best_params_ada['classify__algorithm'],
)
final_model_ada = Pipeline([
    # ('smote', SMOTE(random_state=42)),  # oversampling step, currently disabled
    ('classify', tuned_ada),
])

cv_results = cross_validate(
    final_model_ada, X_train, y_train,
    cv=cv, scoring=scoring, n_jobs=-1,
)

# One line per metric: mean ± standard deviation across the folds.
print("\n Kết quả AdaBoost (10-Fold Cross-Validation):")
for metric in scoring:
    fold_scores = cv_results[f'test_{metric}']
    print(f"{metric.capitalize():<10}: {np.mean(fold_scores):.2f} ± {np.std(fold_scores):.2f}")

8. KNN

In [None]:
# KNN pipeline that the grid search tunes; features are standardised first.
pipe_knn = Pipeline([
    # ('smote', SMOTE(random_state=42)),  # oversampling step, currently disabled
    ('scaler', StandardScaler()),
    ('classify', KNeighborsClassifier())
])

# Hyper-parameter grid; keys are prefixed with the pipeline step name.
# NOTE(review): per the scikit-learn docs `p` is the power parameter of the
# Minkowski metric, so it is likely redundant with the explicit
# 'euclidean'/'manhattan' metrics searched here — confirm before trimming.
# It is kept because the next cell reads `classify__p` from the best params.
param_grid_knn = {
    'classify__n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29],
    'classify__weights': ['uniform', 'distance'],
    'classify__metric': ['euclidean', 'manhattan'],
    'classify__p': [1, 2]
}

# Exhaustive search over the grid, scored by accuracy, using the shared `cv` splitter.
gsearch_knn = GridSearchCV(
    estimator=pipe_knn,
    param_grid=param_grid_knn,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=2
)

gsearch_knn.fit(X_train, y_train)

print("\nBest params (KNN):", gsearch_knn.best_params_)
# The search optimises accuracy, so `best_score_` is an accuracy value;
# the label previously said "F1-score".
print("Best CV accuracy:", gsearch_knn.best_score_)

In [None]:
best_params_knn = gsearch_knn.best_params_

# Rebuild KNN from the winning grid-search configuration.
tuned_knn = KNeighborsClassifier(
    n_neighbors=best_params_knn['classify__n_neighbors'],
    weights=best_params_knn['classify__weights'],
    metric=best_params_knn['classify__metric'],
    p=best_params_knn['classify__p'],
)
final_model_knn = Pipeline([
    # ('smote', SMOTE(random_state=42)),  # oversampling step, currently disabled
    ('scaler', StandardScaler()),
    ('classify', tuned_knn),
])

cv_results = cross_validate(
    final_model_knn, X_train, y_train,
    cv=cv, scoring=scoring, n_jobs=-1,
)

# One line per metric: mean ± standard deviation across the folds.
print("\n Kết quả KNN (10-Fold Cross-Validation):")
for metric in scoring:
    fold_scores = cv_results[f'test_{metric}']
    print(f"{metric.capitalize():<10}: {np.mean(fold_scores):.2f} ± {np.std(fold_scores):.2f}")

9. LightGBM + AdaBoost (Soft Voting)

In [None]:
# Equal-weight soft-voting ensemble of the tuned LightGBM and AdaBoost pipelines.
lgbm_ada_members = [
    ('lightgbm', final_model_lgbm),
    ('adaboost', final_model_ada),
]
voting_model_4 = VotingClassifier(estimators=lgbm_ada_members, voting='soft', n_jobs=-1)

cv_results = cross_validate(
    voting_model_4, X_train, y_train,
    cv=cv, scoring=scoring, n_jobs=-1,
)

# One line per metric: mean ± standard deviation across the folds.
print("\nKết quả Voting Ensemble (LightGBM + AdaBoost, 10-Fold CV):")
for metric in scoring:
    fold_scores = cv_results[f'test_{metric}']
    print(f"{metric.capitalize():<10}: {np.mean(fold_scores):.2f} ± {np.std(fold_scores):.2f}")

10. LightGBM + AdaBoost (60-40)

In [None]:
# Soft-voting ensemble weighted 0.6 towards LightGBM and 0.4 towards AdaBoost.
weighted_lgbm_ada_members = [
    ('lightgbm', final_model_lgbm),
    ('adaboost', final_model_ada),
]
voting_model_5 = VotingClassifier(
    estimators=weighted_lgbm_ada_members,
    voting='soft',
    weights=[0.6, 0.4],
    n_jobs=-1,
)

cv_results = cross_validate(
    voting_model_5, X_train, y_train,
    cv=cv, scoring=scoring, n_jobs=-1,
)

# One line per metric: mean ± standard deviation across the folds.
print("\nKết quả Voting Ensemble (LightGBM + AdaBoost, 10-Fold CV):")
for metric in scoring:
    fold_scores = cv_results[f'test_{metric}']
    print(f"{metric.capitalize():<10}: {np.mean(fold_scores):.2f} ± {np.std(fold_scores):.2f}")

11. LightGBM + KNN + AdaBoost (Soft Voting)

In [None]:
# Equal-weight soft-voting ensemble of the tuned LightGBM, KNN and AdaBoost pipelines.
voting_model_6 = VotingClassifier(
    estimators=[
       ('lightgbm', final_model_lgbm),
       ('knn', final_model_knn),
       ('adaboost', final_model_ada),
    ],
    voting='soft',
    n_jobs=-1
)


cv_results = cross_validate(voting_model_6, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)

# The header now names all three members; it previously omitted KNN,
# making this report indistinguishable from the two-model ensemble.
print("\nKết quả Voting Ensemble (LightGBM + KNN + AdaBoost, 10-Fold CV):")
# One line per metric: mean ± standard deviation across the folds.
for metric in scoring.keys():
    mean = np.mean(cv_results[f'test_{metric}'])
    std = np.std(cv_results[f'test_{metric}'])
    print(f"{metric.capitalize():<10}: {mean:.2f} ± {std:.2f}")


In [None]:
# Soft-voting ensemble of LightGBM, KNN and AdaBoost weighted 0.4 / 0.2 / 0.4.
voting_model_7 = VotingClassifier(
    estimators=[
       ('lightgbm', final_model_lgbm),
       ('knn', final_model_knn),
       ('adaboost', final_model_ada),
    ],
    weights = [0.4, 0.2, 0.4],
    voting='soft',
    n_jobs=-1
)


cv_results = cross_validate(voting_model_7, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)

# The header now names all three members and the weighting; it previously
# read "LightGBM + AdaBoost", which matched neither this model nor its weights.
print("\nKết quả Voting Ensemble (LightGBM + KNN + AdaBoost, weights 0.4/0.2/0.4, 10-Fold CV):")
# One line per metric: mean ± standard deviation across the folds.
for metric in scoring.keys():
    mean = np.mean(cv_results[f'test_{metric}'])
    std = np.std(cv_results[f'test_{metric}'])
    print(f"{metric.capitalize():<10}: {mean:.2f} ± {std:.2f}")
