In [2]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.ensemble import RUSBoostClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Load the raw spreadsheet and encode the nodal-stage label as an integer.
# Series.map turns any value that is not a key of the dict into NaN, so only
# the exact strings 'N-' and 'N+' survive the encoding.
n_stage_codes = {'N-': 0, 'N+': 1}

data = pd.read_excel('oscc_dataset.xlsx')
data['N-stage'] = data['N-stage'].map(n_stage_codes)

In [4]:
# Derive the working frame from `data` without touching it: DataFrame.drop
# returns a new frame, so `data` itself stays as loaded.
clean_data = data.drop(
    columns=['Neutrophil %', 'Lymphocyte %', 'Hospital No.']
)

# Fill missing depth-of-invasion values and store them as whole numbers.
# NOTE(review): the imputer is given only the 'DOI(mm)' column, so rows where
# it is missing have no other feature to measure neighbour distance on;
# scikit-learn documents a fallback to the column average in that situation --
# confirm this is the intended imputation.
# NOTE(review): the imputer is fitted on every row, before the train/test
# split performed in a later cell -- confirm this is acceptable.
knn_imputer = KNNImputer(n_neighbors=5, weights='distance')
doi_frame = clean_data[['DOI(mm)']]
imputed_values = knn_imputer.fit_transform(doi_frame)
imputed_rounded = np.rint(imputed_values).astype(int)  # nearest integer
clean_data['DOI(mm)'] = imputed_rounded

In [5]:
# Names of the predictors considered for N-stage.
# NOTE(review): this list is not referenced by the cells visible here; `x`
# below is built by dropping columns instead -- confirm the two agree.
n_stage_features = [
    'Age', 'DOI(mm)', 'T-stage',
    'NLR', 'PMR', 'PLR',
    'SII=P*(N/L)(10power3/microliter)',
]

# Columns removed to form the feature matrix: the two target columns first,
# then the blood-count columns.
dropped_columns = [
    'ENE',
    'N-stage',
    'Platelet count(10power3/microliter)',
    'WBC(TLC)(10power3/microliter)',
    'Monocyte %',
    'Absolute monocyte count (TLC*monocyte %)(10power3/microliter)',
    'Absolute neutrophil count(WBC*neutrophil%/100)(10power3/microliter)',
    'Absolute lymphocyte count(TLC* lymphocytes%/100)(10power3/microliter)',
]
x = clean_data.drop(columns=dropped_columns)

In [6]:
# Hold out 10% of the rows for evaluation; both target columns travel through
# the split together so their rows stay aligned with the features.
targets = clean_data[['ENE', 'N-stage']]
x_train, x_test, y_train, y_test = train_test_split(
    x,
    targets,
    train_size=0.9,
    test_size=0.1,
    random_state=42,
)
y_n_stage_train, y_n_stage_test = y_train['N-stage'], y_test['N-stage']

# Standardise features: the scaler is fitted on the training rows only and
# then applied to the held-out rows.
normalize = StandardScaler()
x_train_norm = normalize.fit_transform(x_train)
x_test_norm = normalize.transform(x_test)

In [7]:
# RUSBoost classifier for N-stage: a boosted ensemble of depth-6 decision
# trees, configured with random under-sampling (without replacement) and a
# fixed random_state.
# DecisionTreeClassifier comes from sklearn.tree and must be imported in the
# notebook's import cell; without that import this cell raises NameError on a
# fresh kernel.
model = RUSBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=6),
    n_estimators=200,
    learning_rate=0.5,
    sampling_strategy='auto',
    replacement=False,
    random_state=42,
)

In [None]:
# Fit on the scaled training rows, then predict N-stage for the held-out rows.
model.fit(x_train_norm, y_n_stage_train)
y_n_stage_pred = model.predict(x_test_norm)

# Summarise held-out performance: overall accuracy plus the per-class
# precision / recall / F1 table.
acc = accuracy_score(y_n_stage_test, y_n_stage_pred)
report = classification_report(y_n_stage_test, y_n_stage_pred)
print(f"Accuracy: {acc:.4f}", "Classification Report:", report, sep="\n")

Accuracy: 0.8085
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.79      0.83        28
           1       0.73      0.84      0.78        19

    accuracy                           0.81        47
   macro avg       0.80      0.81      0.81        47
weighted avg       0.82      0.81      0.81        47



In [None]:
# Persist the fitted classifier to model.pkl in the current working directory.
# joblib is imported in the notebook's import cell with the other dependencies.
# NOTE(review): only `model` is saved; the fitted scaler `normalize` is not,
# and the model was trained on scaled features -- confirm whether it should be
# saved alongside.
joblib.dump(model, "model.pkl")