In [1]:
from operator import index
from statistics import median

import pandas as pd
from anaconda_navigator.static.images import QTCONSOLE_ICON_1024_PATH
from sklearn.impute import KNNImputer
from scipy import stats
import numpy as np
# Load the two cohort files and stack them into one table.
Xian_cohort = pd.read_csv('/Users/gengzhi/Desktop/Xian_cohort.csv')
nonXian_cohort = pd.read_csv('/Users/gengzhi/Desktop/nonXian_cohort.csv')
# ignore_index=True gives the combined table unique row labels; otherwise the
# labels of the second file would repeat those of the first.
Xian_cohort = pd.concat([Xian_cohort, nonXian_cohort], ignore_index=True)

# Fill the missing CRP values with a 2-nearest-neighbour imputation computed on
# the whole table. The imputer returns a plain array, so the CRP column is
# looked up by name rather than by a hard-coded position: a position silently
# picks the wrong column as soon as the CSV layout changes.
# NOTE(review): KNNImputer needs every column to be numeric - confirm for new data.
imputer = KNNImputer(n_neighbors=2)
crp_position = Xian_cohort.columns.get_loc('CRP')
Xian_cohort['CRP'] = imputer.fit_transform(Xian_cohort)[:, crp_position]

# Keep only the rows with surgical_classify == 1.
adata = Xian_cohort[Xian_cohort['surgical_classify'] == 1]
# Drop the columns that are not analysed: survival time, death, and
# surgical_classify (constant within this subset).
adata = adata.drop(['survival_time', 'death', 'surgical_classify'], axis=1)
# No recoding of surgical_classify (3 -> 1, 2 -> 0) is done here: the column has
# just been dropped, so reading it again raises KeyError, and within this subset
# it only ever held the value 1.

In [2]:
# Replace a set of measurements by their base-10 logarithm. For every listed
# column a new '<name>_log10' column is appended and the raw column is removed.
# Columns transformed: NEUT, SII, BASO, TT, FDP, D_D, AST, ALT, AST_ALT, GGT_Y,
# CK, TG, HCY, HbA1c, GLU. (EO and CRP are NOT transformed by this cell.)
log10_features = ['NEUT', 'SII', 'BASO', 'TT', 'FDP', 'D_D', 'AST', 'ALT', 'AST_ALT', 'GGT_Y', 'CK', 'TG', 'HCY',
                  'HbA1c', 'GLU']

# X is an alias of adata: the table is modified in place.
X = adata
for feature in log10_features:
    raw_values = X[feature]
    if feature == 'BASO':
        # BASO alone gets a small offset before the log
        # (presumably so that zero values stay finite - TODO confirm).
        raw_values = raw_values + 0.005
    X[feature + '_log10'] = np.log10(raw_values)

# Remove the untransformed columns in one call.
X.drop(log10_features, axis=1, inplace=True)
adata = X

In [3]:
# Compare every remaining feature between the SSUM400 == 1 ("event") and
# SSUM400 == 0 ("no event") rows: an independent two-sample t-test plus the
# quartiles (Q1, median, Q3) of each group. The binary variables and SSUM400
# itself are excluded; they are handled by the chi-square cell.
columns = adata.columns.drop(
    ['reappear', 'sex', 'stroke_if', 'TIA_if', 'hypertension', 'diabete', 'SSUM400'])

# Split the table once; both subsets are reused for every feature.
event_rows = adata[adata['SSUM400'] == 1]
no_event_rows = adata[adata['SSUM400'] == 0]

t_test = {}
Q3_event, median_event, Q1_event = [], [], []
Q3_no_event, median_no_event, Q1_no_event = [], [], []

for feature in columns:
    event_values = event_rows[feature]
    no_event_values = no_event_rows[feature]

    # Test result keyed by feature name.
    t_test[feature] = stats.ttest_ind(event_values, no_event_values)

    # Quartiles of the event group ...
    Q3_event.append(event_values.quantile(0.75))
    median_event.append(event_values.median())
    Q1_event.append(event_values.quantile(0.25))
    # ... and of the no-event group, in the same feature order.
    Q3_no_event.append(no_event_values.quantile(0.75))
    median_no_event.append(no_event_values.median())
    Q1_no_event.append(no_event_values.quantile(0.25))
    

In [4]:
# Chi-square test of independence between SSUM400 and each binary variable,
# plus the per-category counts and proportions inside the SSUM400 == 1
# ("event") and SSUM400 == 0 ("no event") groups.
columns2 = ['reappear', 'sex', 'stroke_if', 'TIA_if', 'hypertension', 'diabete']

# Split the table once; both subsets are reused for every variable.
event_group = adata[adata['SSUM400'] == 1]
no_event_group = adata[adata['SSUM400'] == 0]

chi_2 = {}
count_event, percentage_event = [], []
count_no_event, percentage_no_event = [], []

for factor in columns2:
    # The contingency table is built on the full (unsplit) data.
    chi_2[factor] = stats.chi2_contingency(pd.crosstab(adata['SSUM400'], adata[factor]))

    event_levels = event_group[factor]
    no_event_levels = no_event_group[factor]

    # sort=False: categories are NOT ordered by frequency in the results.
    percentage_event.append(event_levels.value_counts(normalize=True, sort=False))
    count_event.append(event_levels.value_counts(sort=False))
    percentage_no_event.append(no_event_levels.value_counts(normalize=True, sort=False))
    count_no_event.append(no_event_levels.value_counts(sort=False))

In [5]:
# Turn the lists of per-variable value_counts Series (event group) into tables:
# one row per Series, one column per observed category value.
percentage_event, count_event = (
    pd.DataFrame(series_list) for series_list in (percentage_event, count_event)
)

In [6]:
# The columns of both tables are the observed category values. Their order is
# whatever order value_counts(sort=False) produced, which is not guaranteed to
# be ascending, so put them in ascending order BEFORE relabelling by position.
# Without this, '0' and '1' can end up attached to the wrong category.
percentage_event = percentage_event.sort_index(axis=1)
count_event = count_event.sort_index(axis=1)

# Relabel the two category columns as strings; this raises if the tables do not
# have exactly two columns.
percentage_event.columns = ['0', '1']
count_event.columns = ['0', '1']

# One row per tested variable, in the order of columns2.
percentage_event.index = columns2
count_event.index = columns2

In [7]:
# Turn the lists of per-variable value_counts Series (no-event group) into
# tables: one row per Series, one column per observed category value.
percentage_no_event, count_no_event = (
    pd.DataFrame(series_list) for series_list in (percentage_no_event, count_no_event)
)

In [8]:
# The columns of both tables are the observed category values. Their order is
# whatever order value_counts(sort=False) produced, which is not guaranteed to
# be ascending, so put them in ascending order BEFORE relabelling by position.
# Without this, '0' and '1' can end up attached to the wrong category.
percentage_no_event = percentage_no_event.sort_index(axis=1)
count_no_event = count_no_event.sort_index(axis=1)

# Relabel the two category columns as strings; this raises if the tables do not
# have exactly two columns.
percentage_no_event.columns = ['0', '1']
count_no_event.columns = ['0', '1']

# One row per tested variable, in the order of columns2.
percentage_no_event.index = columns2
count_no_event.index = columns2

In [9]:
# Assemble the t-test summary: one row per feature, holding the test statistic,
# the p-value and the quartiles of both groups.
t_test_df = pd.DataFrame(t_test).T
t_test_df.columns = ['t_statistic', 'p_value']

# The quartile lists were filled in the same feature order as the t_test dict,
# so they can be attached positionally.
quartile_columns = {
    'Q3_event': Q3_event,
    'median_event': median_event,
    'Q1_event': Q1_event,
    'Q3_no_event': Q3_no_event,
    'median_no_event': median_no_event,
    'Q1_no_event': Q1_no_event,
}
for column_name, values in quartile_columns.items():
    t_test_df[column_name] = values

In [10]:
# Assemble the chi-square summary: one row per tested variable, holding the
# four values returned by chi2_contingency plus the per-category proportions
# and counts of both groups.
chi_2_df = pd.DataFrame(chi_2).T
chi_2_df.columns = ['chi_2_statistic', 'p_value','dof','expected']

# (output column, source table, category column). "1" and "0" are the string
# column labels of the count/percentage tables. Values are aligned on the row
# labels (the variable names).
category_columns = [
    ('percentage_event_yes_or_classify3', percentage_event, "1"),
    ('count_event_yes_or_classify3', count_event, "1"),
    ('percentage_event_no_or_classify2', percentage_event, "0"),
    ('count_event_no_or_classify2', count_event, "0"),
    ('percentage_no_event_yes_or_classify3', percentage_no_event, "1"),
    ('count_no_event_yes_or_classify3', count_no_event, "1"),
    ('percentage_no_event_no_or_classify2', percentage_no_event, "0"),
    ('count_no_event_no_or_classify2', count_no_event, "0"),
]
for column_name, source_table, category in category_columns:
    chi_2_df[column_name] = source_table[category]

In [11]:
# Write both summary tables to CSV files (row labels are kept as the first column).
for result_table, csv_path in (
        (t_test_df, '/Users/gengzhi/Desktop/t_test.csv'),
        (chi_2_df, '/Users/gengzhi/Desktop/chi2_test.csv'),
):
    result_table.to_csv(csv_path)