In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm.sklearn import LGBMClassifier
from imblearn.ensemble import RUSBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

In [8]:
data = pd.read_excel('oscc_dataset.xlsx')
data['N-stage'] = data['N-stage'].map({
    'N-' : 0,
    'N+' : 1
})

In [9]:
clean_data = data.copy()
clean_data = clean_data.drop(columns=['Neutrophil %', 'Lymphocyte %', 'Hospital No.'])
knn_imputer = KNNImputer(n_neighbors=5, weights='distance')
imputed_values = knn_imputer.fit_transform(clean_data[['DOI(mm)']])
imputed_rounded = np.rint(imputed_values).astype(int)
clean_data['DOI(mm)'] = imputed_rounded

In [10]:
ene_features = [
    'Age',
    'Sex',
    'Sites',
    'DOI(mm)',
    'T-stage',
    'NLR',
    'PMR',
    'PLR',
    'LMR',
    'SII=P*(N/L)(10power3/microliter)'
]

n_stage_features = [
    'Age',
    'DOI(mm)',
    'T-stage',
    'NLR',
    'PMR',
    'PLR',
    'SII=P*(N/L)(10power3/microliter)'
]

x = clean_data.drop(columns=['ENE', 'N-stage','Platelet count(10power3/microliter)', 'WBC(TLC)(10power3/microliter)', 'Monocyte %',
                       'Absolute monocyte count (TLC*monocyte %)(10power3/microliter)', 'Absolute neutrophil count(WBC*neutrophil%/100)(10power3/microliter)',
                       'Absolute lymphocyte count(TLC* lymphocytes%/100)(10power3/microliter)'])

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, clean_data[['ENE','N-stage']], train_size=0.9, test_size=0.1, random_state=42)
y_ene_train = y_train['ENE']
y_n_stage_train = y_train['N-stage']
y_ene_test = y_test['ENE']
y_n_stage_test = y_test['N-stage']
normalize = StandardScaler()
x_train_norm = normalize.fit_transform(x_train)
x_test_norm = normalize.transform(x_test)

In [12]:
models = {
    'Random Forest': RandomForestClassifier(
        n_estimators=150,
        max_depth=10,
        min_samples_split=8,
        min_samples_leaf=3,
        max_features='sqrt',
        random_state=42,
        bootstrap=True
    ),
    'XGB': XGBClassifier(
        n_estimators=150,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric='logloss',
        random_state=42
    ),
    'CatBoost': CatBoostClassifier(
        iterations=200,
        depth=6,
        learning_rate=0.1,
        l2_leaf_reg=3,
        bootstrap_type='Bernoulli',
        subsample=0.8,
        random_state=42,
        verbose=False
    ),
    'LightGBM' : LGBMClassifier(
        n_estimators=150,
        learning_rate=0.05,
        num_leaves=31,
        max_depth=-1,
        random_state=42,
        is_unbalance=True,
        verbose=-1
    ),
    'RUSBoost' : RUSBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=6),
        n_estimators=200,
        learning_rate=0.5,
        sampling_strategy='auto',
        replacement=False,
        random_state=42
    ),
    'AdaBoost' : AdaBoostClassifier(
        estimator=LogisticRegression(max_iter=1000, solver='lbfgs'),
        n_estimators=200,
        learning_rate=0.5,
        algorithm='SAMME',
        random_state=42
    )
}

In [13]:
results = {}
for name, model in models.items():
    model.fit(x_train_norm, y_n_stage_train)
    y_n_stage_pred = model.predict(x_test_norm)

    acc = accuracy_score(y_n_stage_test, y_n_stage_pred)
    report = classification_report(y_n_stage_test, y_n_stage_pred)
    results[name] = {"accuracy": acc, "classification_report": report}
    print("=" * 55)
    print(f"Model: {name}")
    print(f"Accuracy: {acc:.4f}")
    print("Classification Report:")
    print(report)
    print("=" * 55)

Model: Random Forest
Accuracy: 0.7021
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.71      0.74        28
           1       0.62      0.68      0.65        19

    accuracy                           0.70        47
   macro avg       0.69      0.70      0.70        47
weighted avg       0.71      0.70      0.70        47

Model: XGB
Accuracy: 0.7660
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.82      0.81        28
           1       0.72      0.68      0.70        19

    accuracy                           0.77        47
   macro avg       0.76      0.75      0.75        47
weighted avg       0.76      0.77      0.76        47

Model: CatBoost
Accuracy: 0.7660
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.86      0.81        28
           1       0.75      0.63      0.69        19

    accuracy  