In [1]:
# import necessary packages, including pandas, train_test_split, KNNImputer, catboost, xgboost,lightgbm, adaboost, randomforest and decisiontree classifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from scipy import stats
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

Xian_cohort = pd.read_csv('/Users/gengzhi/Desktop/Xian_cohort.csv')
nonXian_cohort = pd.read_csv('/Users/gengzhi/Desktop/nonXian_cohort.csv')
Xian_cohort = pd.concat([Xian_cohort, nonXian_cohort])
imputer = KNNImputer(n_neighbors=2)
# only 69th column have missing value
Xian_cohort['CRP'] = imputer.fit_transform(Xian_cohort)[:, 69]
adata = Xian_cohort[Xian_cohort['surgical_classify']!=1]

In [2]:
# drop the columns that are not needed,include '生存时间‘，’death(死亡)'
adata.drop(['survival_time','death'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.drop(['survival_time','death'],axis=1,inplace=True)


In [3]:
# do the t-test to find the significant features on column 'SSUM(400domestic_4000onbroad)'
columns = adata.columns.drop(['reappear','surgical_classify','sex','age','stroke_if','TIA_if','hypertension','diabete','SSUM400'])
t_test = {}
# cycle through the columns to do the t-test on column 'SSUM(400domestic_4000onbroad)'
for i in columns:
    t_test[i] = stats.ttest_ind(adata[adata['SSUM400']==1][i],adata[adata['SSUM400']==0][i])

In [4]:
# do chi_2 test on column 'SSUM(400domestic_4000onbroad)'
columns2 = ['reappear','surgical_classify','sex','age','stroke_if','TIA_if','hypertension','diabete']
chi_2 = {}
for i in columns2:
    chi_2[i] = stats.chi2_contingency(pd.crosstab(adata['SSUM400'],adata[i]))

In [5]:
# write the results of t-test and chi_2 test to a csv file
t_test_df = pd.DataFrame(t_test).T
t_test_df.columns = ['t-statistic','p-value']
t_test_df.to_csv('/Users/gengzhi/Desktop/t_test.csv')
chi_2_df = pd.DataFrame(chi_2).T
chi_2_df.columns = ['chi_2','p-value','dof','expected']
chi_2_df.to_csv('/Users/gengzhi/Desktop/chi2_test.csv')

In [72]:
# retain the significant features from the t-test and chi_2 test
sig_features = []
for i in t_test:
    if t_test[i][1] < 0.05:
        sig_features.append(i)
for i in chi_2:
    if chi_2[i][1] < 0.05:
        sig_features.append(i)
sig_features.append('SSUM400')

In [73]:
# drop the columns that are not in the significant features
adata.drop(columns.difference(sig_features),axis=1,inplace=True)

In [74]:
# split the data into training and testing data
X = adata.drop('SSUM(400domestic_4000onbroad)',axis=1)
y = adata['SSUM(400domestic_4000onbroad)']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=5959+89+15)

In [75]:
# initialize the classifiers
cat = CatBoostClassifier()
xgb = XGBClassifier()
lgb = LGBMClassifier()
ada = AdaBoostClassifier()
rf = RandomForestClassifier()
dt = DecisionTreeClassifier()

In [76]:
# fit the classifiers
cat.fit(X_train,y_train)
xgb.fit(X_train,y_train)
lgb.fit(X_train,y_train)
ada.fit(X_train,y_train)
rf.fit(X_train,y_train)
dt.fit(X_train,y_train)

Learning rate set to 0.00983
0:	learn: 0.6846332	total: 2.22ms	remaining: 2.22s
1:	learn: 0.6765616	total: 3.7ms	remaining: 1.84s
2:	learn: 0.6692555	total: 5.09ms	remaining: 1.69s
3:	learn: 0.6622066	total: 6.66ms	remaining: 1.66s
4:	learn: 0.6545849	total: 8.3ms	remaining: 1.65s
5:	learn: 0.6475645	total: 9.78ms	remaining: 1.62s
6:	learn: 0.6403110	total: 11.4ms	remaining: 1.61s
7:	learn: 0.6328919	total: 12.8ms	remaining: 1.58s
8:	learn: 0.6262941	total: 13.9ms	remaining: 1.53s
9:	learn: 0.6197662	total: 15.1ms	remaining: 1.49s
10:	learn: 0.6140689	total: 16.2ms	remaining: 1.46s
11:	learn: 0.6082694	total: 17.4ms	remaining: 1.43s
12:	learn: 0.6019382	total: 18.7ms	remaining: 1.42s
13:	learn: 0.5948300	total: 20.3ms	remaining: 1.43s
14:	learn: 0.5879832	total: 21.6ms	remaining: 1.42s
15:	learn: 0.5813941	total: 22.7ms	remaining: 1.4s
16:	learn: 0.5753510	total: 23.9ms	remaining: 1.38s
17:	learn: 0.5701928	total: 24.9ms	remaining: 1.36s
18:	learn: 0.5652215	total: 26ms	remaining: 1.34



In [77]:
# calculate the roc_auc score for each classifier
cat_pred = cat.predict_proba(X_test)[:,-1]
xgb_pred = xgb.predict_proba(X_test)[:,-1]
lgb_pred = lgb.predict_proba(X_test)[:,-1]
ada_pred = ada.predict_proba(X_test)[:,-1]
rf_pred = rf.predict_proba(X_test)[:,-1]
dt_pred = dt.predict_proba(X_test)[:,-1]
print(roc_auc_score(y_test,cat_pred))
print(roc_auc_score(y_test,xgb_pred))
print(roc_auc_score(y_test,lgb_pred))
print(roc_auc_score(y_test,ada_pred))
print(roc_auc_score(y_test,rf_pred))
print(roc_auc_score(y_test,dt_pred))

0.8738919857443114
0.8471168783697341
0.8758110207438545
0.7741021657680709
0.8755825641962898
0.5842547747418441


In [78]:
print(f1_score(y_test,cat.predict(X_test)))
print(f1_score(y_test,xgb.predict(X_test)))
print(f1_score(y_test,lgb.predict(X_test)))
print(f1_score(y_test,ada.predict(X_test)))
print(f1_score(y_test,rf.predict(X_test)))
print(f1_score(y_test,dt.predict(X_test)))

0.21621621621621623
0.24390243902439024
0.25
0.3793103448275862
0.11428571428571428
0.21686746987951808
