In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_classif 
from sklearn.feature_selection import RFE
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
df_train = pd.read_excel(r"D:\2024.2\HCC\Classification_HCC\Code_HCC\Final_dataset\Merged_HCC (Train_Test) (version 1).xlsx", sheet_name = "Train_0")

df_train.describe()

df_test = pd.read_excel(r"D:\2024.2\HCC\Classification_HCC\Code_HCC\Final_dataset\Merged_HCC (Train_Test) (version 1).xlsx", sheet_name = "Test_0")

df_test.describe()

Unnamed: 0,Age,Gender,Leucocytes,Platelets,INR,AST,ALT,Tol_Bil,Dir_Bil,Albumin,Creatinine,HBsAg,HCVAb,AFP,AST_ALT_ratio,Obesity,Label_HCC
count,131.0,131.0,131.0,131.0,131.0,131.0,131.0,131.0,131.0,131.0,131.0,131.0,131.0,131.0,131.0,131.0,131.0
mean,43.045802,0.480916,7.085497,250.472519,0.996619,27.78855,30.444733,12.155485,8.44855,42.363817,79.518244,0.10687,0.015267,864.54554,1.204076,0.114504,0.099237
std,12.602724,0.501554,2.324677,57.861118,0.071549,27.062188,39.161991,10.100742,4.255425,3.789334,28.931151,0.310134,0.123084,8969.374539,0.525859,0.319645,0.300127
min,18.0,0.0,3.6,2.9,0.84,7.56,4.9,1.7,0.8,20.2,46.0,0.0,0.0,0.0,0.1772,0.0,0.0
25%,34.0,0.0,5.6,207.5,0.95,18.52,13.36,7.7,5.5,40.5,63.46,0.0,0.0,0.0,0.87035,0.0,0.0
50%,42.0,0.0,6.5,249.0,0.99,22.43,20.8,9.79,7.2,42.84,75.44,0.0,0.0,0.0,1.1464,0.0,0.0
75%,51.0,1.0,8.02,282.5,1.04,28.7,32.02,12.65,10.3,44.51,89.755,0.0,0.0,1.595,1.3919,0.0,0.0
max,80.0,1.0,16.75,465.0,1.24,296.0,398.0,99.8,25.0,53.62,348.86,1.0,1.0,102458.0,3.087,1.0,1.0


In [3]:
df_test.columns

Index(['Age', 'Gender', 'Leucocytes', 'Platelets', 'INR', 'AST', 'ALT',
       'Tol_Bil', 'Dir_Bil', 'Albumin', 'Creatinine', 'HBsAg', 'HCVAb', 'AFP',
       'AST_ALT_ratio', 'Obesity', 'Label_HCC'],
      dtype='object')

In [4]:
selected_columns = ['Age', 'Gender', 'Leucocytes', 'Platelets', 'INR', 'AST', 'ALT',
       'Tol_Bil', 'Dir_Bil', 'Albumin', 'Creatinine', 'HBsAg', 'HCVAb', 'AFP',
       'AST_ALT_ratio', 'Obesity', 'Label_HCC']

data_train = df_train.copy()
data_test = df_test.copy()

X_train = data_train[selected_columns].drop(columns=['Label_HCC'])
y_train = data_train.Label_HCC
X_test = data_test[selected_columns].drop(columns=['Label_HCC'])
y_test = data_test.Label_HCC

1. Information gain

In [5]:
mi_scores = mutual_info_classif(X_train, y_train, random_state=42)

mi_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Information_Gain': mi_scores
}).sort_values(by='Information_Gain', ascending=False)

print(mi_df)

top_k = 14
top_features = mi_df.head(top_k)['Feature'].tolist()
print("Top", top_k, "features:", top_features)


          Feature  Information_Gain
13            AFP          0.409943
4             INR          0.237849
5             AST          0.188225
3       Platelets          0.171023
7         Tol_Bil          0.152516
9         Albumin          0.127125
11          HBsAg          0.090294
8         Dir_Bil          0.075128
0             Age          0.069879
15        Obesity          0.054215
2      Leucocytes          0.048540
6             ALT          0.030346
12          HCVAb          0.020232
14  AST_ALT_ratio          0.009322
1          Gender          0.000000
10     Creatinine          0.000000
Top 14 features: ['AFP', 'INR', 'AST', 'Platelets', 'Tol_Bil', 'Albumin', 'HBsAg', 'Dir_Bil', 'Age', 'Obesity', 'Leucocytes', 'ALT', 'HCVAb', 'AST_ALT_ratio']


2. Weight by Correlation

In [6]:
corr_scores = {}
for col in X_train.columns:
    corr = X_train[col].corr(y_train)  
    corr_scores[col] = abs(corr)       

corr_df = pd.DataFrame({
    'Feature': list(corr_scores.keys()),
    'Correlation_with_Label': list(corr_scores.values())
}).sort_values(by='Correlation_with_Label', ascending=False)

print(corr_df)

top_k = 10
top_features_corr = corr_df.head(top_k)['Feature'].tolist()
print("Top", top_k, "features by correlation:", top_features_corr)


          Feature  Correlation_with_Label
4             INR                0.484771
9         Albumin                0.437086
11          HBsAg                0.418333
0             Age                0.342849
3       Platelets                0.299867
15        Obesity                0.295242
14  AST_ALT_ratio                0.232294
13            AFP                0.227553
5             AST                0.183806
1          Gender                0.171811
12          HCVAb                0.106911
10     Creatinine                0.096151
2      Leucocytes                0.093546
8         Dir_Bil                0.046337
7         Tol_Bil                0.034400
6             ALT                0.004142
Top 10 features by correlation: ['INR', 'Albumin', 'HBsAg', 'Age', 'Platelets', 'Obesity', 'AST_ALT_ratio', 'AFP', 'AST', 'Gender']


3. Recursive Feature Elimination 

In [7]:
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rfe = RFE(estimator=rf, n_features_to_select=10, step=1)
rfe.fit(X_train, y_train)

# Lấy danh sách đặc trưng đã chọn
selected_features = X_train.columns[rfe.support_].tolist()

# Bảng ranking
ranking = pd.DataFrame({
    'Feature': X_train.columns,
    'Selected': rfe.support_,
    'Ranking': rfe.ranking_
}).sort_values(by='Ranking')

print("Top features selected by RFE with Random Forest:", selected_features)
print(ranking)



Top features selected by RFE with Random Forest: ['Age', 'Leucocytes', 'Platelets', 'INR', 'AST', 'Tol_Bil', 'Albumin', 'HBsAg', 'AFP', 'AST_ALT_ratio']
          Feature  Selected  Ranking
0             Age      True        1
2      Leucocytes      True        1
3       Platelets      True        1
4             INR      True        1
5             AST      True        1
7         Tol_Bil      True        1
9         Albumin      True        1
11          HBsAg      True        1
13            AFP      True        1
14  AST_ALT_ratio      True        1
8         Dir_Bil     False        2
6             ALT     False        3
10     Creatinine     False        4
1          Gender     False        5
15        Obesity     False        6
12          HCVAb     False        7


4. RandomForest

In [8]:
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

# Tính importance trực tiếp từ Random Forest
feat_importances = pd.DataFrame({
    'Feature': X_train.columns,
    'RF_importance': rf.feature_importances_
}).sort_values(by='RF_importance', ascending=False)

print("Feature importance trực tiếp từ Random Forest:")
print(feat_importances)


Feature importance trực tiếp từ Random Forest:
          Feature  RF_importance
13            AFP       0.341933
4             INR       0.173891
3       Platelets       0.076893
5             AST       0.069756
9         Albumin       0.060154
11          HBsAg       0.045467
0             Age       0.041165
7         Tol_Bil       0.035307
6             ALT       0.032218
14  AST_ALT_ratio       0.031063
2      Leucocytes       0.030854
8         Dir_Bil       0.027893
10     Creatinine       0.019674
15        Obesity       0.006780
1          Gender       0.005140
12          HCVAb       0.001811


5. Neighborhood Component Analysis 

In [9]:
nca = NeighborhoodComponentsAnalysis(random_state=42)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('nca', nca)
])

pipe.fit(X_train, y_train)

import numpy as np

nca_weights = np.linalg.norm(pipe.named_steps['nca'].components_, axis=0)

nca_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'NCA_importance': nca_weights
}).sort_values(by='NCA_importance', ascending=False)

print(nca_importance)

top_k = 10
top_features_nca = nca_importance.head(top_k)['Feature'].tolist()
print("Top", top_k, "features by NCA:", top_features_nca)

          Feature  NCA_importance
4             INR        8.091305
13            AFP        6.221471
3       Platelets        5.937572
5             AST        5.884750
9         Albumin        4.980673
0             Age        3.824811
14  AST_ALT_ratio        3.448801
7         Tol_Bil        2.638453
8         Dir_Bil        2.546292
1          Gender        2.526128
11          HBsAg        2.365811
15        Obesity        2.291799
6             ALT        2.168726
10     Creatinine        1.719628
2      Leucocytes        1.049021
12          HCVAb        0.872827
Top 10 features by NCA: ['INR', 'AFP', 'Platelets', 'AST', 'Albumin', 'Age', 'AST_ALT_ratio', 'Tol_Bil', 'Dir_Bil', 'Gender']


6. Lasso Regression

In [10]:
import numpy as np
from sklearn.linear_model import LogisticRegressionCV


Cs = np.logspace(-4, 0, 50)

lasso_cv = LogisticRegressionCV(
    Cs=Cs,
    penalty='l1',
    solver='saga',
    cv = 20,
    max_iter=10000,
    random_state=42,
    scoring='roc_auc',
    class_weight='balanced'
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
lasso_cv.fit(X_train_scaled, y_train)

coef_cv = lasso_cv.coef_[0]
selected_features = X_train.columns[coef_cv != 0].tolist()

print("Best C:", lasso_cv.C_[0])
print("Selected features:", selected_features)


Best C: 0.32374575428176433
Selected features: ['Age', 'Gender', 'Platelets', 'INR', 'AST', 'ALT', 'Tol_Bil', 'Albumin', 'HBsAg', 'HCVAb', 'AFP', 'Obesity']


7. XGB

In [11]:
xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    random_state=42,
    eval_metric='logloss'
)

xgb_model.fit(X_train, y_train)

importance = xgb_model.feature_importances_

feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importance
}).sort_values(by='Importance', ascending=False)

print(feature_importance_df)


          Feature  Importance
13            AFP    0.404898
11          HBsAg    0.116730
4             INR    0.095684
0             Age    0.067755
3       Platelets    0.062682
7         Tol_Bil    0.053900
6             ALT    0.049321
5             AST    0.032442
14  AST_ALT_ratio    0.024278
9         Albumin    0.022905
2      Leucocytes    0.021840
8         Dir_Bil    0.021377
10     Creatinine    0.020477
1          Gender    0.005712
12          HCVAb    0.000000
15        Obesity    0.000000


8. Chi-square

In [12]:
from sklearn.feature_selection import SelectKBest, chi2
import pandas as pd

chi2_selector = SelectKBest(score_func=chi2, k=10)
chi2_selector.fit(X_train, y_train)

selected_features_chi2 = X_train.columns[chi2_selector.get_support()].tolist()
print("Selected features (Chi-Square):", selected_features_chi2)

chi2_scores = pd.DataFrame({
    'Feature': X_train.columns,
    'Chi2': chi2_selector.scores_
}).sort_values(by='Chi2', ascending=False)
print(chi2_scores)



Selected features (Chi-Square): ['Age', 'Leucocytes', 'Platelets', 'AST', 'Tol_Bil', 'Dir_Bil', 'Albumin', 'Creatinine', 'HBsAg', 'AFP']
          Feature           Chi2
13            AFP  184646.259814
10     Creatinine    5457.152713
2      Leucocytes    1232.959905
3       Platelets     795.075766
5             AST     731.816281
0             Age     121.792381
7         Tol_Bil      38.033738
9         Albumin      34.940399
11          HBsAg      30.625480
8         Dir_Bil      21.000268
15        Obesity      17.259223
14  AST_ALT_ratio       4.193145
12          HCVAb       2.903195
4             INR       2.429433
1          Gender       1.830186
6             ALT       1.414077
