In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import KNeighborsClassifier
from imblearn.ensemble import RUSBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier


In [17]:
data = pd.read_excel('oscc_dataset.xlsx')
data['N-stage'] = data['N-stage'].map({
    'N-' : 0,
    'N+' : 1
})

In [25]:
print(data.head(5))

   Hospital No.  Age  Sex  Sites  DOI(mm)  T-stage  ENE  N-stage  \
0       3446087   50    2      2      2.0        4    0        0   
1       3342831   76    2      2      3.0        4    1        1   
2       2871977   64    1      2      1.0        1    0        0   
3       3092768   49    1      2      3.0        4    1        1   
4       3700269   35    1      1      1.0        2    0        0   

   Platelet count(10power3/microliter)  WBC(TLC)(10power3/microliter)  ...  \
0                                  313                           11.7  ...   
1                                  468                           18.3  ...   
2                                  232                            8.5  ...   
3                                  208                            6.8  ...   
4                                  135                            5.4  ...   

   Absolute monocyte count (TLC*monocyte %)(10power3/microliter)  \
0                                             1.0998  

In [27]:
print(data["T-stage"].unique())
print(data["Sites"].unique())

[4 1 2 3]
[2 1 3]


In [18]:
clean_data = data.copy()
clean_data = clean_data.drop(columns=['Neutrophil %', 'Lymphocyte %', 'Hospital No.'])
knn_imputer = KNNImputer(n_neighbors=5, weights='distance')
imputed_values = knn_imputer.fit_transform(clean_data[['DOI(mm)']])
imputed_rounded = np.rint(imputed_values).astype(int)
clean_data['DOI(mm)'] = imputed_rounded

In [None]:
x = clean_data.drop(columns=['ENE', 'N-stage','Platelet count(10power3/microliter)', 'WBC(TLC)(10power3/microliter)', 'Monocyte %',
                       'Absolute monocyte count (TLC*monocyte %)(10power3/microliter)', 'Absolute neutrophil count(WBC*neutrophil%/100)(10power3/microliter)',
                       'Absolute lymphocyte count(TLC* lymphocytes%/100)(10power3/microliter)'])

In [24]:
print(x.head(5))

   Age  Sex  Sites  DOI(mm)  T-stage       NLR         PMR         PLR  \
0   50    2      2        2        4  1.497608  284.597199   74.880383   
1   76    2      2        3        4  5.329793  473.588342  180.096975   
2   64    1      2        1        1  6.037037  313.725490  214.814815   
3   49    1      2        3        4  3.146853  397.249809  145.454545   
4   35    1      1        1        2  2.553030  333.333333  102.272727   

        LMR  SII=P*(N/L)(10power3/microliter)  
0  3.800691                        468.751196  
1  2.629630                       2494.343108  
2  1.460446                       1400.592593  
3  2.731092                        654.545455  
4  3.259259                        344.659091  


In [20]:
x_train, x_test, y_train, y_test = train_test_split(x, clean_data[['ENE','N-stage']], train_size=0.9, test_size=0.1, random_state=42)
y_n_stage_train = y_train['N-stage']
y_n_stage_test = y_test['N-stage']
normalize = StandardScaler()
x_train_norm = normalize.fit_transform(x_train)
x_test_norm = normalize.transform(x_test)

In [21]:
model = RUSBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=6),
        n_estimators=200,
        learning_rate=0.5,
        sampling_strategy='auto',
        replacement=False,
        random_state=42
    )

In [22]:
model.fit(x_train_norm, y_n_stage_train)
y_n_stage_pred = model.predict(x_test_norm)
acc = accuracy_score(y_n_stage_test, y_n_stage_pred)
report = classification_report(y_n_stage_test, y_n_stage_pred)
print(f"Accuracy: {acc:.4f}")
print("Classification Report:")
print(report)

Accuracy: 0.8085
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.79      0.83        28
           1       0.73      0.84      0.78        19

    accuracy                           0.81        47
   macro avg       0.80      0.81      0.81        47
weighted avg       0.82      0.81      0.81        47



In [23]:
import joblib
joblib.dump(model, "rusboost.pkl")

['rusboost.pkl']