In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [35]:
df = pd.read_csv("Debernardi et al 2020 data.csv")
print(df.shape)
df.head(7)

(590, 14)


Unnamed: 0,sample_id,patient_cohort,sample_origin,age,sex,diagnosis,stage,benign_sample_diagnosis,plasma_CA19_9,creatinine,LYVE1,REG1B,TFF1,REG1A
0,S1,Cohort1,BPTB,33,F,1,,,11.7,1.83222,0.893219,52.94884,654.282174,1262.0
1,S10,Cohort1,BPTB,81,F,1,,,,0.97266,2.037585,94.46703,209.48825,228.407
2,S100,Cohort2,BPTB,51,M,1,,,7.0,0.78039,0.145589,102.366,461.141,
3,S101,Cohort2,BPTB,61,M,1,,,8.0,0.70122,0.002805,60.579,142.95,
4,S102,Cohort2,BPTB,62,M,1,,,9.0,0.21489,0.00086,65.54,41.088,
5,S103,Cohort2,BPTB,53,M,1,,,,0.84825,0.003393,62.126,59.793,
6,S104,Cohort2,BPTB,70,M,1,,,,0.62205,0.174381,152.277,117.516,


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590 entries, 0 to 589
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   sample_id                590 non-null    object 
 1   patient_cohort           590 non-null    object 
 2   sample_origin            590 non-null    object 
 3   age                      590 non-null    int64  
 4   sex                      590 non-null    object 
 5   diagnosis                590 non-null    int64  
 6   stage                    199 non-null    object 
 7   benign_sample_diagnosis  208 non-null    object 
 8   plasma_CA19_9            350 non-null    float64
 9   creatinine               590 non-null    float64
 10  LYVE1                    590 non-null    float64
 11  REG1B                    590 non-null    float64
 12  TFF1                     590 non-null    float64
 13  REG1A                    306 non-null    float64
dtypes: float64(6), int64(2), o

In [37]:
df.isna().sum()


sample_id                    0
patient_cohort               0
sample_origin                0
age                          0
sex                          0
diagnosis                    0
stage                      391
benign_sample_diagnosis    382
plasma_CA19_9              240
creatinine                   0
LYVE1                        0
REG1B                        0
TFF1                         0
REG1A                      284
dtype: int64

In [38]:
# pre processing

from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

df['stage'] = df['stage'].fillna('Unknown')
df['benign_sample_diagnosis'] = df['benign_sample_diagnosis'].fillna('Unknown')
df['REG1A'] = df['REG1A'].fillna(df['REG1A'].mean())

imputer = KNNImputer(n_neighbors=3)
df['plasma_CA19_9'] = imputer.fit_transform(df[['plasma_CA19_9']])


encoder = LabelEncoder()
df['stage'] = encoder.fit_transform(df['stage'])
df['sample_origin'] = encoder.fit_transform(df['sample_origin'])


df['benign_sample_diagnosis'] = df['benign_sample_diagnosis'].str.strip()
df['benign_sample_diagnosis'] = encoder.fit_transform(df['benign_sample_diagnosis'])

df = pd.get_dummies(df, columns=['sex', 'patient_cohort'], drop_first=True)

columns = ['age', 'plasma_CA19_9', 'creatinine', 'LYVE1', 'REG1B', 'TFF1']
scaler = StandardScaler()
df[columns] = scaler.fit_transform(df[columns])

df.drop(columns=['sample_id'], inplace=True)

df

Unnamed: 0,sample_origin,age,diagnosis,stage,benign_sample_diagnosis,plasma_CA19_9,creatinine,LYVE1,REG1B,TFF1,REG1A,sex_M,patient_cohort_Cohort2
0,0,-1.991056,1,8,47,-3.436293e-01,1.529927,-0.631661,-0.299975,0.055876,1262.000000,False,False
1,0,1.673512,1,8,47,-6.082196e-17,0.183680,-0.298597,-0.088256,-0.384680,228.407000,False,False
2,0,-0.616843,1,8,47,-3.461438e-01,-0.117454,-0.849256,-0.047976,-0.135425,735.281222,True,True
3,0,0.146609,1,8,47,-3.456088e-01,-0.241451,-0.890812,-0.261065,-0.450584,735.281222,True,True
4,0,0.222954,1,8,47,-3.450738e-01,-1.003143,-0.891378,-0.235767,-0.551475,735.281222,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,0,0.681025,3,7,47,-6.082196e-17,-0.524871,1.162636,0.226755,-0.071998,735.281222,True,True
586,0,0.910060,3,7,47,-6.082196e-17,0.006542,1.536048,-0.483726,-0.348568,735.281222,False,True
587,0,0.299299,3,7,47,-6.082196e-17,0.803662,1.342066,0.907324,-0.060005,735.281222,True,True
588,0,1.215441,3,7,47,-6.082196e-17,0.750521,1.496923,0.480141,0.123466,735.281222,False,True


In [39]:
df.isna().sum()

sample_origin              0
age                        0
diagnosis                  0
stage                      0
benign_sample_diagnosis    0
plasma_CA19_9              0
creatinine                 0
LYVE1                      0
REG1B                      0
TFF1                       0
REG1A                      0
sex_M                      0
patient_cohort_Cohort2     0
dtype: int64

In [40]:
X = df.drop(columns=['diagnosis'])
y = df['diagnosis'] - 1
print(np.unique(y))

[0 1 2]


In [41]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y, random_state=42)

In [42]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(472, 12)
(118, 12)
(472,)
(118,)


In [None]:
# grid search for XGB

param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 2, 3, 4],
    'learning_rate': [0.01, 0.1, 0.15]
}

xgb_clf = XGBClassifier()

grid_xgb = GridSearchCV(xgb_clf, param_grid=param_grid_xgb, cv=5, n_jobs=-1)
grid_xgb.fit(x_train, y_train)

print(grid_xgb.best_params_)

In [None]:
# XGB accuracy with best parameter

best_xgb = XGBClassifier(**grid_xgb.best_params_)
best_xgb.fit(x_train, y_train)

y_pred = best_xgb.predict(x_test)
print('XGB accuracy =>', accuracy_score(y_test, y_pred))

In [None]:
# grid search for RandomForest

param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
}

rf_clf = RandomForestClassifier()

grid_rf = GridSearchCV(rf_clf, param_grid=param_grid_rf, cv=5)
grid_rf.fit(x_train, y_train)
print(grid_rf.best_params_)

In [None]:
best_rf = RandomForestClassifier(**grid_rf.best_params_)
best_rf.fit(x_train, y_train)

y_pred = best_rf.predict(x_test)
print('Random forest accuracy =>', accuracy_score(y_test, y_pred))

In [None]:
# Grid search for KNN

param_knn = {
    'n_neighbors': [3, 5, 7]
}

clf_knn = KNeighborsClassifier()

grid_knn = GridSearchCV(clf_knn, param_grid=param_knn, cv=5)
grid_knn.fit(x_train, y_train)

best_knn_param = grid_knn.best_params_
print(best_knn_param)

In [None]:
# KNN accuracy with best parameter

best_knn = KNeighborsClassifier(**best_knn_param)
best_knn.fit(x_train, y_train)

y_pred = best_knn.predict(x_test)
print('KNN accuracy =>',accuracy_score(y_test, y_pred))

In [None]:

# Grid search for SVC

param_svc = {'C': [0.1, 1, 10, 30, 50, 70, 100, 130]}

clf_svc = SVC()

grid_svc = GridSearchCV(SVC(), param_grid=param_svc, cv=5)
grid_svc.fit(x_train, y_train)

best_svc_param = grid_svc.best_params_
print(best_svc_param)

In [None]:
# SVC accuracy with best parameter

best_svc = SVC(**best_svc_param)

best_svc.fit(x_train, y_train)
y_pred = best_svc.predict(x_test)
print('SVC accuracy =>', accuracy_score(y_test, y_pred))

In [None]:
# Stacking one

stacking_1 = StackingClassifier(
    estimators= [
        ('rf', best_rf),
        ('xgb', best_xgb),
        ('svm', best_svc),
        ('knn', best_knn)
    ],
    final_estimator = LogisticRegression()
)

stacking_1.fit(x_train, y_train)
y_pred_1 = stacking_1.predict(x_test)
print("Accuracy (Stacking 1) =>", accuracy_score(y_test, y_pred_1))
print(classification_report(y_test, y_pred_1))

In [None]:
#Stacking two

stacking_2 = StackingClassifier(
    estimators= [
        ('svm', best_svc),
        ('knn', best_knn)
    ],
    final_estimator = GradientBoostingClassifier()
)

stacking_2.fit(x_train, y_train)
y_pred_2 = stacking_2.predict(x_test)
print("Accuracy (Stacking 2) =>", accuracy_score(y_test, y_pred_2))
print(classification_report(y_test, y_pred_2))
