In [6]:
# Created by: Chen Da
# Created on: 201912

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly as py
import plotly.graph_objs as go
import plotly.express as px
from sklearn.preprocessing import StandardScaler

In [8]:
# Logistic-regression scoring function
def functionLR(id_data):
    """Return the logistic-regression probability for a single sample row.

    The first and last entries of ``id_data`` are skipped (in this notebook
    they hold 'id' and 'class'); the entries in between are the features.
    They are weighted by the module-level ``coefs`` and shifted by the
    module-level ``intercept``, then passed through the sigmoid.

    Parameters
    ----------
    id_data : pandas.Series
        One row of the model frame.

    Returns
    -------
    float
        Sigmoid of the linear score, i.e. a value between 0 and 1.
    """
    features = id_data.values[1:-1]
    linear_score = np.dot(coefs, features) + intercept
    return 1 / (np.exp(-linear_score) + 1)

In [9]:
# Lung cancer prediction
def lungCancerPrediction(raw_df, file_name, thres=0.5):
    """Score every sample, plot the probabilities and print accuracy figures.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must contain every column named in the module-level
        ``selected_columns``. Each row is scored with ``functionLR``, which
        skips the first and last selected column and treats the ones in
        between as features.
    file_name : str
        Base name for an exported figure. Currently unused because the
        static image export is disabled.
    thres : float, default 0.5
        A sample is predicted as class 1 when its probability is >= thres.

    Returns
    -------
    pandas.DataFrame
        One row per sample, sorted by class, with columns 'id' (string with
        a trailing '_'), 'class' (int), 'prob' and 'pre_label'.

    Side effects
    ------------
    Shows a plotly scatter figure and prints the overall accuracy and the
    counts [class-0 right, class-0 wrong, class-1 right, class-1 wrong].
    """
    df = raw_df.loc[:, selected_columns]

    # Score each row exactly once, in row order. Looking rows up by id
    # re-scanned the frame for every sample and always scored the first
    # match whenever an id occurred more than once.
    prob_list = [functionLR(row) for _, row in df.iterrows()]

    # .copy() so the column writes below target an independent frame.
    df_pro = df.loc[:, ['id', 'class']].copy()
    # Assign the list positionally: wrapping it in a Series would align on
    # index labels and misplace values unless raw_df has a 0..n-1 index.
    df_pro['prob'] = prob_list
    # String class labels match the keys of color_discrete_map below.
    df_pro['class'] = df_pro['class'].apply(lambda x: str(x))
    # Presumably the '_' suffix keeps plotly from reading ids as numbers.
    df_pro['id'] = df_pro['id'].apply(lambda x: str(x) + '_')
    df_pro = df_pro.sort_values(by='class')

    fig = px.scatter(df_pro, x='id', y='prob',
                     color='class', template='ggplot2', color_discrete_sequence=['green', '#EE0000'],
                     color_discrete_map={'0': 'green', '1': '#EE0000'})
    if not df_pro.empty:
        # Dashed horizontal line at the decision threshold, spanning the
        # first to the last plotted id.
        fig.add_trace(go.Scatter(x=[df_pro['id'].iloc[0], df_pro['id'].iloc[-1]], y=[thres, thres],
                                 mode='lines',
                                 line=dict(color='grey', dash='dash'),
                                 showlegend=False))
    fig.update_layout(width=1200, height=800)
    # Static image export is disabled; file_name was intended for it
    # (e.g. py.io.write_image(fig, file_name + '.png')).
    fig.show()

    df_pro['pre_label'] = np.where(df_pro['prob'] >= thres, 1, 0)
    df_pro['class'] = df_pro['class'].apply(lambda x: int(x))

    actual = df_pro['class']
    predicted = df_pro['pre_label']
    # int() keeps the printed list as plain Python integers.
    pre_0_right = int(((actual == 0) & (predicted == 0)).sum())
    pre_0_wrong = int(((actual == 0) & (predicted == 1)).sum())
    pre_1_right = int(((actual == 1) & (predicted == 1)).sum())
    pre_1_wrong = int(((actual == 1) & (predicted == 0)).sum())

    total = df_pro.shape[0]
    # Guard against an empty input instead of raising ZeroDivisionError.
    accuracy = (pre_0_right + pre_1_right) / total if total else float('nan')
    print('总体准确率为%s' % accuracy)
    print('混淆矩阵为%s' % [pre_0_right, pre_0_wrong, pre_1_right, pre_1_wrong])
    return df_pro

In [11]:
# Project data folder and the name of the training workbook (no extension).
path = 'C:/Users/pc/OneDrive/PLTTECH/Project/02_Disease_early_screening/lung_cancer/'
df_name = 'subsets_all'
df = pd.read_excel(path + 'rawdata/' + df_name + '.xlsx')

df['id'] = df['id'].map(str)
# Positional indices of the workbook columns that are kept.
# NOTE(review): the slices [1:-1] below treat the first kept column as the id
# and the last kept column as the class label - confirm against the workbook.
pnasSubsetsIndex = [0,1,2,8,9,15,16,20,22,23,30,35,39,45,46,47,48,51,53,54,57,61,62,63,64,65,66,
                    67,70,71,72,74,76,77,82,83,84,85,86,89,90,91,93,94,96,97,99]
# .copy() makes the subset an independent frame, so the in-place writes below
# do not trigger SettingWithCopyWarning.
df = df.iloc[:, pnasSubsetsIndex].copy()
# Multiply every feature column by 100, then standardise them.
df.iloc[:, 1:-1] = df.iloc[:, 1:-1].multiply(100)
stdsc = StandardScaler()
# Assign the scaled array directly: it is written positionally, whereas a
# DataFrame value can be aligned on labels depending on the pandas version.
df.iloc[:, 1:-1] = stdsc.fit_transform(df.iloc[:, 1:-1].values)

# Columns used by the model: id, the feature columns, then the class label.
# An earlier variant also used the feature
# 'Lymphocytes/CD3+/CD8+/Q8: 176Yb_HLA_DR- , 172Yb_CD38-' (third feature)
# with coefficient -0.16866938308919024 (third coefficient).
selected_columns = ['id', 'Lymphocytes', 'Lymphocytes/CD3+',
                    'Lymphocytes/CD3-', 'Lymphocytes/CD3-/NK cells', 'Lymphocytes/NKT', 'Myeloid cells/CD56-CD14-/DC cells/mDC',
                    'Myeloid cells/HLA-DR-/MDSC', 'class']

# Model coefficients, one per feature column, in the order of selected_columns[1:-1].
coefs = [-0.22245426758483827, -0.020730427625031522, 0.1687315981698945, -0.31750040295546617,
         -0.01889006712989188, 0.041521475709957505, 0.0017547035254121604]

intercept = 0

plot_df = lungCancerPrediction(df, df_name)

总体准确率为0.8035087719298246
混淆矩阵为[137, 49, 92, 7]


In [12]:
# Test workbook: same column subset as the training data, transformed with the
# scaler (stdsc) that was fitted on the training workbook.
test_all = pd.read_excel(path + 'rawdata/lung_subsets_test.xlsx')
# .copy() so the in-place writes below act on an independent frame
# (avoids SettingWithCopyWarning).
test_all = test_all.iloc[:, pnasSubsetsIndex].copy()
test_all.iloc[:, 1:-1] = test_all.iloc[:, 1:-1].multiply(100)
# Assign the scaled array directly so the values are written positionally
# rather than depending on label alignment of a DataFrame value.
test_all.iloc[:, 1:-1] = stdsc.transform(test_all.iloc[:, 1:-1].values)
test_all = test_all.loc[:, selected_columns]
new_data_df = lungCancerPrediction(test_all, 'new_lung')

总体准确率为0.8153846153846154
混淆矩阵为[0, 0, 53, 12]


In [8]:
# Requires stdsc, selected_columns and lungCancerPrediction from the cells
# above; run those first on a fresh kernel.
# NOTE(review): unlike the lung_subsets_test cell, no pnasSubsetsIndex subset
# is applied here. stdsc.transform needs the same feature columns, in the same
# order, that the scaler was fitted on - presumably this workbook is already
# subset; confirm.
benign_new = pd.read_excel('C:/Users/pc/OneDrive/PLTTECH/Project/20191205_lung_cancer/rawdata/test_benign.xlsx')
benign_new.iloc[:, 1:-1] = benign_new.iloc[:, 1:-1].multiply(100)
# Assign the scaled array directly so the values are written positionally
# rather than depending on label alignment of a DataFrame value.
benign_new.iloc[:, 1:-1] = stdsc.transform(benign_new.iloc[:, 1:-1].values)
benign_new = benign_new.loc[:, selected_columns]
benign_new_df = lungCancerPrediction(benign_new, 'benign_new')

总体准确率为1.0
混淆矩阵为[0, 0, 8, 0]


In [9]:
# Requires stdsc, selected_columns and lungCancerPrediction from the cells
# above; run those first on a fresh kernel.
# NOTE(review): no pnasSubsetsIndex subset is applied here. stdsc.transform
# needs the same feature columns, in the same order, that the scaler was
# fitted on - presumably this workbook is already subset; confirm.
PLT_healthy_new = pd.read_excel('C:/Users/pc/OneDrive/PLTTECH/Project/20191205_lung_cancer/rawdata/test_PLT_healthy.xlsx')
PLT_healthy_new.iloc[:, 1:-1] = PLT_healthy_new.iloc[:, 1:-1].multiply(100)
# Assign the scaled array directly so the values are written positionally
# rather than depending on label alignment of a DataFrame value.
PLT_healthy_new.iloc[:, 1:-1] = stdsc.transform(PLT_healthy_new.iloc[:, 1:-1].values)
PLT_healthy_new = PLT_healthy_new.loc[:, selected_columns]
PLT_healthy_new_df = lungCancerPrediction(PLT_healthy_new, 'PLT_healthy_new')

总体准确率为0.375
混淆矩阵为[3, 5, 0, 0]


In [10]:
# Requires stdsc, selected_columns and lungCancerPrediction from the cells
# above; run those first on a fresh kernel.
# NOTE(review): no pnasSubsetsIndex subset is applied here. stdsc.transform
# needs the same feature columns, in the same order, that the scaler was
# fitted on - presumably this workbook is already subset; confirm.
pulmonary_nodule = pd.read_excel('C:/Users/pc/OneDrive/PLTTECH/Project/20191205_lung_cancer/rawdata/test_pulmonary_nodule.xlsx')
pulmonary_nodule.iloc[:, 1:-1] = pulmonary_nodule.iloc[:, 1:-1].multiply(100)
# Assign the scaled array directly so the values are written positionally
# rather than depending on label alignment of a DataFrame value.
pulmonary_nodule.iloc[:, 1:-1] = stdsc.transform(pulmonary_nodule.iloc[:, 1:-1].values)
pulmonary_nodule = pulmonary_nodule.loc[:, selected_columns]
pulmonary_nodule_df = lungCancerPrediction(pulmonary_nodule, 'pulmonary_nodule')

总体准确率为0.21428571428571427
混淆矩阵为[3, 11, 0, 0]
