In [1]:
# Created by: Chen Da
# Created on: 20200115

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly as py
import plotly.graph_objs as go
import plotly.express as px
from sklearn.preprocessing import StandardScaler

In [3]:
# Logistic-regression scoring function
def functionLR(id_data):
    """Return the logistic-regression probability for a single sample row.

    Parameters
    ----------
    id_data : pandas.Series
        One row of the feature table. The first and last entries are skipped
        (``values[1:-1]``); the remaining entries are the model inputs and
        must line up, in order, with the notebook-level ``coefs``.

    Returns
    -------
    float
        Sigmoid of ``intercept + coefs . features``.

    Notes
    -----
    Reads the notebook-level names ``coefs`` and ``intercept``; both must be
    defined before this function is called.
    """
    features = id_data.values[1:-1]
    logit = np.dot(coefs, features) + intercept
    return 1 / (1 + np.exp(-logit))

In [4]:
# Lung-cancer prediction: score samples, plot probabilities, report accuracy
def lungCancerPrediction(raw_df, file_name, thres=0.5):
    """Score every sample, plot the probabilities and print accuracy figures.

    Relies on notebook-level names: ``selected_columns`` (columns to keep,
    with 'id' first and 'class' last), ``functionLR`` (per-row scorer) and
    the plotly aliases ``px`` / ``go``.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must contain every column listed in ``selected_columns``. The
        'class' column holds the true label (0 or 1).
    file_name : str
        Currently unused; it is only referenced by the disabled image-export
        lines below.
    thres : float, default 0.5
        Samples with probability >= ``thres`` are predicted as class 1.

    Returns
    -------
    pandas.DataFrame
        Columns 'id' (string with a trailing '_'), 'class' (int), 'prob' and
        'pre_label', sorted by 'class'.

    Raises
    ------
    ValueError
        If ``raw_df`` has no rows.
    """
    if raw_df.shape[0] == 0:
        # Previously an empty frame failed later with an opaque IndexError.
        raise ValueError('raw_df contains no rows to score')

    df = raw_df.loc[:, selected_columns]

    # Score each row directly. Looking rows up by id was quadratic and gave
    # every duplicate id the score of the first matching row.
    prob_list = [functionLR(row) for _, row in df.iterrows()]

    # Work on an explicit copy so the added columns never touch raw_df.
    df_pro = df.loc[:, ['id', 'class']].copy()
    # Assign the plain list (positional). Wrapping it in pd.Series would align
    # on index and yield NaN/misplaced scores when raw_df's index is not 0..n-1.
    df_pro['prob'] = prob_list
    # String labels are needed to match the keys of color_discrete_map.
    df_pro['class'] = df_pro['class'].map(str)
    # Presumably the '_' suffix forces a categorical x axis for numeric ids -- confirm.
    df_pro['id'] = df_pro['id'].map(lambda sample_id: str(sample_id) + '_')
    df_pro = df_pro.sort_values(by='class')

    fig = px.scatter(df_pro, x='id', y='prob',
                     color='class', template='ggplot2', color_discrete_sequence=['green', '#EE0000'],
                     color_discrete_map={'0': 'green', '1': '#EE0000'})
    # Dashed horizontal line marking the decision threshold.
    fig.add_trace(go.Scatter(x=[df_pro['id'].iloc[0], df_pro['id'].iloc[-1]], y=[thres, thres],
                             mode='lines',
                             line=dict(color='grey', dash='dash'),
                             showlegend=False))
    fig.update_layout(width=1200, height=800)
    # Static image export is disabled; file_name was used to build the output path:
    #     fig.to_image('C:/Users/pc/Desktop/output.png')
    #     py.io.write_image(fig, '/Users/chenda/Desktop/%s' % file_name + '.png')
    fig.show()

    df_pro['pre_label'] = np.where(df_pro['prob'] >= thres, 1, 0)
    df_pro['class'] = df_pro['class'].map(int)

    truth = df_pro['class']
    predicted = df_pro['pre_label']
    # int(...) keeps plain Python ints so the printed list looks the same on every numpy version.
    pre_0_right = int(((truth == 0) & (predicted == 0)).sum())  # true negatives
    pre_0_wrong = int(((truth == 0) & (predicted == 1)).sum())  # false positives
    pre_1_right = int(((truth == 1) & (predicted == 1)).sum())  # true positives
    pre_1_wrong = int(((truth == 1) & (predicted == 0)).sum())  # false negatives

    # First line: overall accuracy. Second line: confusion counts in the
    # order [TN, FP, TP, FN].
    print('总体准确率为%s' % ((pre_0_right+pre_1_right)/df_pro.shape[0]))
    print('混淆矩阵为%s' % [pre_0_right, pre_0_wrong, pre_1_right, pre_1_wrong])
    return df_pro

In [8]:
# Base name (no extension) of the raw-data workbook. An interactive input()
# prompt was used here previously; the name is now fixed.
file_name = 'marker_all'
data_path = ('C:/Users/pc/OneDrive/PLTTECH/Project/02_Disease_early_screening/lung_cancer/rawdata/%s' % file_name) + '.xlsx'
# Alternative location on another machine:
# data_path = '/Users/chenda/OneDrive/PLTTECH/Project/20191205_lung_cancer/rawdata/%s' % file_name + '.xlsx'

# Columns kept from the workbook: 'id' first, marker columns, 'class' last.
# The positional slices [1:-1] below depend on this ordering.
selected_marker = [
    'id',
    'CD3', 'CD4', 'CD57', 'CD56', 'gdTCR', 'CD8', 'CD14', 'Igd',
    'CD123', 'CD19', 'CD25', 'CD39', 'CD27', 'CD24', 'CD45RA',
    'CD86', 'CD28', 'CD11c', 'CD33', 'CD152', 'CD161', 'CXCR5',
    'CD183', 'CD94', 'CD127', 'PD1', 'CD20', 'HLA_DR', 'CD11b',
    'CCR6', 'CD38', 'CD274', 'CD278',
    'class',
]

df = pd.read_excel(data_path).loc[:, selected_marker]
# Scale the marker columns by 100 (presumably fractions -> percentages; confirm).
df.iloc[:, 1:-1] = df.iloc[:, 1:-1] * 100
# Fit the scaler on this data set; the later cells reuse `stdsc` to transform new data.
stdsc = StandardScaler()
df.iloc[:, 1:-1] = stdsc.fit_transform(df.iloc[:, 1:-1].values)

# Variables used by the model ('id' first, 'class' last)
selected_columns = [
    'id',
    'CD4', 'CD57', 'gdTCR', 'CD8', 'CD14', 'CD25', 'CD11c', 'CD152', 'CXCR5',
    'CD183', 'CD94', 'CD127', 'PD1', 'CD11b', 'CCR6', 'CD274', 'CD278',
    'class',
]

# Model coefficients, one per marker in selected_columns (same order)
coefs = [
    -0.8459815168219227,   # CD4
    -0.36555767856494337,  # CD57
    0.14164486230833692,   # gdTCR
    0.7707156368877089,    # CD8
    -1.1315770767373436,   # CD14
    -0.8529934591964614,   # CD25
    0.7270000458980042,    # CD11c
    0.16606039823022928,   # CD152
    -1.124911435229016,    # CXCR5
    1.5018399654147898,    # CD183
    0.6464951495205371,    # CD94
    1.0347172545909542,    # CD127
    -0.47989625430389077,  # PD1
    -0.35030070023958215,  # CD11b
    0.5276478176403758,    # CCR6
    -1.1135477519942079,   # CD274
    0.991783930303216,     # CD278
]

intercept = -1.0975572222455543

plot_df = lungCancerPrediction(df, file_name)

总体准确率为0.9276729559748428
混淆矩阵为[191, 15, 104, 8]


In [9]:
# plot_df.to_excel('C:/Users/pc/Desktop/%s.xlsx'%file_name, index=False)

In [10]:
# Score the lung_marker_113_141 workbook with the scaler fitted in the
# earlier cell (`stdsc`) and the same column layout ('id' first, 'class' last).
new_data = pd.read_excel(
    'C:/Users/pc/OneDrive/PLTTECH/Project/02_Disease_early_screening/lung_cancer/rawdata/lung_marker_113_141.xlsx'
).loc[:, selected_marker]
# Same x100 scaling that was applied before the scaler was fitted.
new_data.iloc[:, 1:-1] = new_data.iloc[:, 1:-1] * 100
# transform only: reuse the fitted statistics, do not refit on the new samples.
new_data.iloc[:, 1:-1] = stdsc.transform(new_data.iloc[:, 1:-1].values)
new_data = new_data.loc[:, selected_columns]
new_data_df = lungCancerPrediction(new_data, 'new_data')

总体准确率为0.8695652173913043
混淆矩阵为[0, 0, 20, 3]


In [11]:
# benign_new = pd.read_excel('C:/Users/pc/OneDrive/PLTTECH/Project/20191205_lung_cancer/rawdata/test_benign.xlsx')
# benign_new.iloc[:, 1:-1] = benign_new.iloc[:, 1:-1].multiply(100)
# benign_new.iloc[:, 1:-1] = pd.DataFrame(stdsc.transform(benign_new.iloc[:, 1:-1].values), columns=benign_new.columns[1:-1])
# benign_new = benign_new.loc[:, selected_columns]
# benign_new_df = lungCancerPrediction(benign_new, 'benign_new')

In [13]:
# Prepare the PLT_healthy_marker workbook the same way as the other cohorts:
# select columns, scale by 100, standardise with the already-fitted `stdsc`.
PLT_healthy_new = pd.read_excel(
    'C:/Users/pc/OneDrive/PLTTECH/Project/02_Disease_early_screening/lung_cancer/rawdata/PLT_healthy_marker.xlsx'
).loc[:, selected_marker]
PLT_healthy_new.iloc[:, 1:-1] = PLT_healthy_new.iloc[:, 1:-1] * 100
# transform only: reuse the fitted statistics, do not refit on these samples.
PLT_healthy_new.iloc[:, 1:-1] = stdsc.transform(PLT_healthy_new.iloc[:, 1:-1].values)
PLT_healthy_new = PLT_healthy_new.loc[:, selected_columns]
# Scoring of this cohort is currently disabled:
# PLT_healthy_new_df = lungCancerPrediction(PLT_healthy_new, 'PLT_healthy_new')

In [14]:
# Score the test_pulmonary_nodule workbook with the already-fitted `stdsc`.
# NOTE(review): unlike the other cells, this path has no 'rawdata/' segment -- confirm it is intended.
pulmonary_nodule = pd.read_excel(
    'C:/Users/pc/OneDrive/PLTTECH/Project/02_Disease_early_screening/lung_cancer/test_pulmonary_nodule.xlsx'
).loc[:, selected_marker]
# Same x100 scaling that was applied before the scaler was fitted.
pulmonary_nodule.iloc[:, 1:-1] = pulmonary_nodule.iloc[:, 1:-1] * 100
# transform only: reuse the fitted statistics, do not refit on these samples.
pulmonary_nodule.iloc[:, 1:-1] = stdsc.transform(pulmonary_nodule.iloc[:, 1:-1].values)
pulmonary_nodule = pulmonary_nodule.loc[:, selected_columns]
pulmonary_nodule_df = lungCancerPrediction(pulmonary_nodule, 'pulmonary_nodule')