In [9]:
# Created by: Chen Da
# Created on: 20191223

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly as py
import plotly.graph_objs as go
import plotly.express as px
from sklearn.preprocessing import StandardScaler

In [11]:
# Logistic-regression scoring function
def functionLR(id_data):
    """Return the logistic-regression probability for one sample row.

    Parameters
    ----------
    id_data : pandas.Series
        One row of the model table. The first and last entries are skipped
        (``values[1:-1]``); the entries in between are the feature values and
        must line up one-to-one with the module-level ``coefs``.

    Returns
    -------
    float
        ``1 / (1 + exp(-z))`` where ``z = intercept + coefs . features``.

    Notes
    -----
    Reads the module-level names ``coefs`` and ``intercept``; both must be
    defined before this function is called.
    """
    # Drop the leading and trailing entries, keeping only the feature values.
    features = id_data.values[1:-1]
    linear_score = intercept + np.dot(coefs, features)
    # Sigmoid maps the linear score onto (0, 1).
    return 1 / (1 + np.exp(-linear_score))

In [12]:
# Lung cancer prediction
def lungCancerPrediction(raw_df, file_name, thres=0.5):
    """Score every sample with ``functionLR``, plot the scores and report accuracy.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Input table; must contain every column listed in the module-level
        ``selected_columns`` (which starts with ``'id'`` and ends with
        ``'class'``).
    file_name : str
        Label for the data set. Only referenced by the commented-out image
        export below; kept for interface compatibility.
    thres : float, default 0.5
        Probability cut-off; samples with ``prob >= thres`` get ``pre_label`` 1.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` (string with a trailing ``'_'``), ``class`` (int),
        ``prob`` and ``pre_label``, sorted by ``class``.

    Side effects
    ------------
    Shows a plotly scatter of probability per sample and prints the overall
    accuracy and the counts
    ``[class 0 -> 0, class 0 -> 1, class 1 -> 1, class 1 -> 0]``.

    Notes
    -----
    Depends on the module-level ``selected_columns`` and ``functionLR`` (which
    itself reads ``coefs`` and ``intercept``).
    """
    df = raw_df.loc[:, selected_columns]

    # Score each row directly instead of re-filtering the frame by id: this is
    # linear rather than quadratic and stays correct when ids are duplicated.
    prob_list = [functionLR(row) for _, row in df.iterrows()]

    # .copy() so the column assignments below never write into a view of `df`.
    df_pro = df.loc[:, ['id', 'class']].copy()
    # Assign the plain list (positional) rather than a pd.Series: a Series
    # would be aligned on index labels and yield NaN / misplaced values
    # whenever the input frame does not carry a default 0..n-1 index.
    df_pro['prob'] = prob_list
    # String classes match the keys of color_discrete_map below.
    df_pro['class'] = df_pro['class'].map(str)
    # Ids are turned into strings with a '_' suffix before plotting
    # (presumably so plotly treats the x axis as categorical -- TODO confirm).
    df_pro['id'] = df_pro['id'].map(lambda x: str(x) + '_')
    df_pro = df_pro.sort_values(by='class')

    fig = px.scatter(df_pro, x='id', y='prob',
                     color='class', template='ggplot2', color_discrete_sequence=['green', '#EE0000'],
                     color_discrete_map={'0': 'green', '1': '#EE0000'})
    if not df_pro.empty:
        # Dashed horizontal line at the decision threshold, spanning the first
        # to the last plotted sample.
        fig.add_trace(go.Scatter(x=[df_pro['id'].iloc[0], df_pro['id'].iloc[-1]], y=[thres, thres],
                                 mode='lines',
                                 line=dict(color='grey', dash='dash'),
                                 showlegend=False))
    fig.update_layout(width=1200, height=800)
#     fig.to_image('C:/Users/pc/Desktop/output.png')
#     py.io.write_image(fig, '/Users/chenda/Desktop/%s' % file_name + '.png')
    fig.show()

    df_pro['pre_label'] = np.where(df_pro['prob'] >= thres, 1, 0)
    df_pro['class'] = df_pro['class'].map(int)

    actual = df_pro['class']
    predicted = df_pro['pre_label']
    # int(...) keeps the printed counts as plain Python ints.
    pre_0_right = int(((actual == 0) & (predicted == 0)).sum())
    pre_0_wrong = int(((actual == 0) & (predicted == 1)).sum())
    pre_1_right = int(((actual == 1) & (predicted == 1)).sum())
    pre_1_wrong = int(((actual == 1) & (predicted == 0)).sum())

    total = df_pro.shape[0]
    # Accuracy is undefined for an empty input; report NaN instead of raising.
    accuracy = (pre_0_right + pre_1_right) / total if total else float('nan')
    print('总体准确率为%s' % accuracy)
    print('混淆矩阵为%s' % [pre_0_right, pre_0_wrong, pre_1_right, pre_1_wrong])
    return df_pro

In [13]:
# Load the model data set, scale it, define the fixed logistic-regression
# parameters and score the data set with lungCancerPrediction.
# file_name = input("Enter the name of the raw data file: ")
file_name = 'marker_model_data'
# NOTE(review): absolute, machine-specific path -- the cell only runs where this file exists.
data_path = 'C:/Users/pc/OneDrive/PLTTECH/Project/20191205_lung_cancer/rawdata/%s' % file_name + '.xlsx'
df = pd.read_excel(data_path)
# Multiply every column except the first and the last by 100
# (presumably the first is 'id' and the last is 'class', and the values are
# fractions being turned into percentages -- TODO confirm against the spreadsheet).
df.iloc[:, 1:-1] = df.iloc[:, 1:-1].multiply(100)
# Standardise the same columns. `stdsc` is fitted here and reused with
# .transform() by the later cells that score the other data sets.
stdsc = StandardScaler()
df.iloc[:, 1:-1] = pd.DataFrame(stdsc.fit_transform(df.iloc[:, 1:-1].values), columns=df.columns[1:-1])

# Columns used by the model: 'id', 13 marker columns, then 'class'
selected_columns = ['id', 'CD3', 'CD4', 'CD8', 'CD45', 'CD27', 'CD39', 'CXCR5', 
                    'CD16', 'CD127', 'CD24', 'PD1', 'CD183', 'CD38', 'class']

# Model coefficients: 13 values, paired positionally by functionLR with the
# marker columns of selected_columns (np.dot(coefs, row.values[1:-1])).
# NOTE(review): how these numbers were obtained is not shown in this notebook.
coefs = [0.6871717288003105, -0.22977702609083894, 0.2589973567797267, 0.8234324253434998, 0.35516130554381614, 
         -0.5490662472525447, -1.1656047229935123, 0.5777723052098703, 0.19956319505517695, -0.4032018954732623, 
         -0.1425112357109266, 0.33864177270327217, -0.42224913459863395]

# Intercept added to the linear score in functionLR
intercept = -0.5063075718622478

# Plots the scores, prints accuracy / confusion counts and returns the scored table
plot_df = lungCancerPrediction(df, file_name)

总体准确率为0.9275862068965517
混淆矩阵为[177, 10, 92, 11]


In [14]:
# plot_df.to_excel('C:/Users/pc/Desktop/%s.xlsx'%file_name, index=False)

In [15]:
# Score the 'marker_benign' data set with the same model.
# NOTE(review): absolute, machine-specific path.
benign_new = pd.read_excel('C:/Users/pc/OneDrive/PLTTECH/Project/20191205_lung_cancer/rawdata/marker_benign.xlsx')
# Same preprocessing as the model data: multiply all but the first and last column by 100 ...
benign_new.iloc[:, 1:-1] = benign_new.iloc[:, 1:-1].multiply(100)
# ... then apply the scaler fitted on the model data (transform only, no re-fit).
# Assumes this sheet has the same columns in the same order as the model data -- TODO confirm.
benign_new.iloc[:, 1:-1] = pd.DataFrame(stdsc.transform(benign_new.iloc[:, 1:-1].values), columns=benign_new.columns[1:-1])
# Keep only the model columns (lungCancerPrediction selects them again internally).
benign_new = benign_new.loc[:, selected_columns]
benign_new_df = lungCancerPrediction(benign_new, 'benign_new')

总体准确率为1.0
混淆矩阵为[0, 0, 8, 0]


In [16]:
# Score the 'marker_PLT_healthy' data set with the same model.
# NOTE(review): absolute, machine-specific path.
PLT_healthy_new = pd.read_excel('C:/Users/pc/OneDrive/PLTTECH/Project/20191205_lung_cancer/rawdata/marker_PLT_healthy.xlsx')
# Same preprocessing as the model data: multiply all but the first and last column by 100 ...
PLT_healthy_new.iloc[:, 1:-1] = PLT_healthy_new.iloc[:, 1:-1].multiply(100)
# ... then apply the scaler fitted on the model data (transform only, no re-fit).
# Assumes this sheet has the same columns in the same order as the model data -- TODO confirm.
PLT_healthy_new.iloc[:, 1:-1] = pd.DataFrame(stdsc.transform(PLT_healthy_new.iloc[:, 1:-1].values), columns=PLT_healthy_new.columns[1:-1])
# Keep only the model columns (lungCancerPrediction selects them again internally).
PLT_healthy_new = PLT_healthy_new.loc[:, selected_columns]
PLT_healthy_new_df = lungCancerPrediction(PLT_healthy_new, 'PLT_healthy_new')

总体准确率为0.875
混淆矩阵为[7, 1, 0, 0]


In [17]:
# Score the 'marker_pulmonary_nodule' data set with the same model.
# NOTE(review): absolute, machine-specific path.
pulmonary_nodule = pd.read_excel('C:/Users/pc/OneDrive/PLTTECH/Project/20191205_lung_cancer/rawdata/marker_pulmonary_nodule.xlsx')
# Same preprocessing as the model data: multiply all but the first and last column by 100 ...
pulmonary_nodule.iloc[:, 1:-1] = pulmonary_nodule.iloc[:, 1:-1].multiply(100)
# ... then apply the scaler fitted on the model data (transform only, no re-fit).
# Assumes this sheet has the same columns in the same order as the model data -- TODO confirm.
pulmonary_nodule.iloc[:, 1:-1] = pd.DataFrame(stdsc.transform(pulmonary_nodule.iloc[:, 1:-1].values), columns=pulmonary_nodule.columns[1:-1])
# Keep only the model columns (lungCancerPrediction selects them again internally).
pulmonary_nodule = pulmonary_nodule.loc[:, selected_columns]
pulmonary_nodule_df = lungCancerPrediction(pulmonary_nodule, 'pulmonary_nodule')

总体准确率为0.6153846153846154
混淆矩阵为[8, 5, 0, 0]
