In [95]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from imblearn.ensemble import RUSBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

In [96]:
data = pd.read_excel('oscc_dataset.xlsx')
data.head()

Unnamed: 0,Hospital No.,Age,Sex,Sites,DOI(mm),T-stage,ENE,N-stage,Platelet count(10power3/microliter),WBC(TLC)(10power3/microliter),...,Absolute monocyte count (TLC*monocyte %)(10power3/microliter),Neutrophil %,Absolute neutrophil count(WBC*neutrophil%/100)(10power3/microliter),Lymphocyte %,Absolute lymphocyte count(TLC* lymphocytes%/100)(10power3/microliter),NLR,PMR,PLR,LMR,SII=P*(N/L)(10power3/microliter)
0,3446087,50,2,2,2.0,4,0,N-,313,11.7,...,1.0998,,6.26,,4.18,1.497608,284.597199,74.880383,3.800691,468.751196
1,3342831,76,2,2,3.0,4,1,N+,468,18.3,...,0.9882,,13.85,14.2,2.5986,5.329793,473.588342,180.096975,2.62963,2494.343108
2,2871977,64,1,2,1.0,1,0,N-,232,8.5,...,0.7395,,6.52,,1.08,6.037037,313.72549,214.814815,1.460446,1400.592593
3,3092768,49,1,2,3.0,4,1,N+,208,6.8,...,0.5236,,4.5,,1.43,3.146853,397.249809,145.454545,2.731092,654.545455
4,3700269,35,1,1,1.0,2,0,N-,135,5.4,...,0.405,,3.37,,1.32,2.55303,333.333333,102.272727,3.259259,344.659091


In [97]:
data['N-stage'] = data['N-stage'].map({
    'N-' : 0,
    'N+' : 1
})

In [98]:
print(data["T-stage"].unique())
print(data["Sites"].unique())

[4 1 2 3]
[2 1 3]


In [99]:
clean_data = data.copy()
clean_data = clean_data.drop(columns=['Neutrophil %', 'Lymphocyte %', 'Hospital No.'])
knn_imputer = KNNImputer(n_neighbors=5, weights='distance')
imputed_values = knn_imputer.fit_transform(clean_data[['DOI(mm)']])
imputed_rounded = np.rint(imputed_values).astype(int)
clean_data['DOI(mm)'] = imputed_rounded

In [100]:
x = clean_data.drop(columns=['ENE', 'N-stage','Platelet count(10power3/microliter)', 'WBC(TLC)(10power3/microliter)', 'Monocyte %',
                       'Absolute monocyte count (TLC*monocyte %)(10power3/microliter)', 'Absolute neutrophil count(WBC*neutrophil%/100)(10power3/microliter)',
                       'Absolute lymphocyte count(TLC* lymphocytes%/100)(10power3/microliter)'])

x.head()

Unnamed: 0,Age,Sex,Sites,DOI(mm),T-stage,NLR,PMR,PLR,LMR,SII=P*(N/L)(10power3/microliter)
0,50,2,2,2,4,1.497608,284.597199,74.880383,3.800691,468.751196
1,76,2,2,3,4,5.329793,473.588342,180.096975,2.62963,2494.343108
2,64,1,2,1,1,6.037037,313.72549,214.814815,1.460446,1400.592593
3,49,1,2,3,4,3.146853,397.249809,145.454545,2.731092,654.545455
4,35,1,1,1,2,2.55303,333.333333,102.272727,3.259259,344.659091


In [101]:
x_train, x_test, y_train, y_test = train_test_split(x, clean_data['N-stage'], train_size=0.9, test_size=0.1, random_state=42)

In [102]:
model = RUSBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=6),
        n_estimators=250,
        learning_rate=0.5,
        sampling_strategy='auto',
        replacement=False,
        random_state=42
    )

In [103]:
model.fit(x_train, y_train)
y_n_stage_pred = model.predict(x_test)
acc = accuracy_score(y_test, y_n_stage_pred)
report = classification_report(y_test, y_n_stage_pred)
print(f"Accuracy: {acc:.4f}")
print("Classification Report:")
print(report)

Accuracy: 0.8298
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.86      0.86        28
           1       0.79      0.79      0.79        19

    accuracy                           0.83        47
   macro avg       0.82      0.82      0.82        47
weighted avg       0.83      0.83      0.83        47



In [104]:
import joblib
joblib.dump(model, "rusboost.pkl")

['rusboost.pkl']