In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import KNeighborsClassifier
from imblearn.ensemble import RUSBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier


In [2]:
data = pd.read_excel('oscc_dataset.xlsx')

In [3]:
print(data.head(5))

   Hospital No.  Age  Sex  Sites  DOI(mm)  T-stage  ENE N-stage  \
0       3446087   50    2      2      2.0        4    0      N-   
1       3342831   76    2      2      3.0        4    1      N+   
2       2871977   64    1      2      1.0        1    0      N-   
3       3092768   49    1      2      3.0        4    1      N+   
4       3700269   35    1      1      1.0        2    0      N-   

   Platelet count(10power3/microliter)  WBC(TLC)(10power3/microliter)  ...  \
0                                  313                           11.7  ...   
1                                  468                           18.3  ...   
2                                  232                            8.5  ...   
3                                  208                            6.8  ...   
4                                  135                            5.4  ...   

   Absolute monocyte count (TLC*monocyte %)(10power3/microliter)  \
0                                             1.0998        

In [4]:
print(data["T-stage"].unique())
print(data["Sites"].unique())

[4 1 2 3]
[2 1 3]


In [5]:
clean_data = data.copy()
clean_data = clean_data.drop(columns=['Neutrophil %', 'Lymphocyte %', 'Hospital No.'])
knn_imputer = KNNImputer(n_neighbors=5, weights='distance')
imputed_values = knn_imputer.fit_transform(clean_data[['DOI(mm)']])
imputed_rounded = np.rint(imputed_values).astype(int)
clean_data['DOI(mm)'] = imputed_rounded

In [6]:
x = clean_data.drop(columns=['ENE', 'N-stage','Platelet count(10power3/microliter)', 'WBC(TLC)(10power3/microliter)', 'Monocyte %',
                       'Absolute monocyte count (TLC*monocyte %)(10power3/microliter)', 'Absolute neutrophil count(WBC*neutrophil%/100)(10power3/microliter)',
                       'Absolute lymphocyte count(TLC* lymphocytes%/100)(10power3/microliter)'])

In [7]:
print(x.head(5))

   Age  Sex  Sites  DOI(mm)  T-stage       NLR         PMR         PLR  \
0   50    2      2        2        4  1.497608  284.597199   74.880383   
1   76    2      2        3        4  5.329793  473.588342  180.096975   
2   64    1      2        1        1  6.037037  313.725490  214.814815   
3   49    1      2        3        4  3.146853  397.249809  145.454545   
4   35    1      1        1        2  2.553030  333.333333  102.272727   

        LMR  SII=P*(N/L)(10power3/microliter)  
0  3.800691                        468.751196  
1  2.629630                       2494.343108  
2  1.460446                       1400.592593  
3  2.731092                        654.545455  
4  3.259259                        344.659091  


In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, clean_data[['ENE','N-stage']], train_size=0.9, test_size=0.1, random_state=42)
y_ene_train = y_train['ENE']
y_ene_test = y_test['ENE']
normalize = StandardScaler()
x_train_norm = normalize.fit_transform(x_train)
x_test_norm = normalize.transform(x_test)

In [12]:
model = CatBoostClassifier(
        iterations=200,
        depth=6,
        learning_rate=0.1,
        l2_leaf_reg=3,
        bootstrap_type='Bernoulli',
        subsample=0.8,
        random_state=42,
        verbose=False
    )

In [13]:
model.fit(x_train_norm, y_ene_train)
y_ene_pred = model.predict(x_test_norm)
acc = accuracy_score(y_ene_test, y_ene_pred)
report = classification_report(y_ene_test, y_ene_pred)
print(f"Accuracy: {acc:.4f}")
print("Classification Report:")
print(report)

Accuracy: 0.8298
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.92      0.89        36
           1       0.67      0.55      0.60        11

    accuracy                           0.83        47
   macro avg       0.77      0.73      0.75        47
weighted avg       0.82      0.83      0.82        47



In [14]:
print(x_test_norm)

[[-0.27965028 -0.52720548  0.02109973  1.02263502  1.37692288 -0.38447801
   0.79952887 -0.63150498  1.08038914 -0.35354436]
 [-0.79885466 -0.52720548  0.02109973  1.02263502  1.37692288  0.85931394
   0.79193021  2.65666486 -0.72272785  1.09162565]
 [-1.23152499 -0.52720548  0.02109973 -1.55989999  1.37692288  0.54730605
  -0.33034909  1.18056516 -0.79742891  1.71594719]
 [ 0.93182663 -0.52720548  1.13674821 -0.26863248 -0.61092141 -0.12728859
  -0.82040711 -0.78806933 -0.1750657  -0.43982289]
 [-0.62578653 -0.52720548  1.13674821  1.02263502  0.38300073  0.68021581
  -0.03511928  0.41747391 -0.41484444  0.65187921]
 [-1.23152499 -0.52720548 -1.09454874 -0.26863248 -0.61092141 -0.91410606
  -1.20958078 -1.11421105 -0.23895594 -0.92899126]
 [-0.36618434 -0.52720548  0.02109973 -0.26863248  0.38300073  0.84480124
  -1.0790904  -0.64579647 -0.52779597  0.42713357]
 [-0.10658215 -0.52720548  0.02109973  1.02263502  1.37692288 -1.06245838
   0.23921805 -0.79547275  0.84982531 -0.96162856]


In [15]:
print(y_ene_pred)

[1 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 0]


In [16]:
import joblib
joblib.dump(model, "catboost.pkl")

['catboost.pkl']

In [None]:
model.predict([[50, 2,2,2,4,1.5,284.6,74.9,3.8,468.7]])

array([1])

In [None]:
model.predict([[64,1,2,1,1,6.037037,313.725490,214.814815,1.460446,1400.592593]])

array([1])

In [None]:
model.predict([[35,1,1,1,2,2.553030,333.333333,102.272727,3.259259,344.659091]])

array([1])

In [None]:
model.predict([[76,2,2,3,4,5.329793,473.588342,180.096975,2.629630,2494.343108]])

array([1])