In [22]:
# Created by: Chen Da
# Created on: 20191031

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly as py
import plotly.graph_objs as go
import plotly.express as px

In [24]:
# Logistic-regression scoring function
def functionLR(id_data, weights=None, bias=None):
    """Return the logistic-regression probability for a single sample row.

    Parameters
    ----------
    id_data : pandas.Series
        One row of the model table. The first and the last element are
        skipped (in this notebook they hold the sample id and the class
        label); everything in between is used as the feature vector.
    weights : sequence of float, optional
        One coefficient per feature. Defaults to the notebook-level ``coefs``.
    bias : float, optional
        Model intercept. Defaults to the notebook-level ``intercept``.

    Returns
    -------
    float
        ``1 / (1 + exp(-z))`` with ``z = bias + dot(weights, features)``.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if weights is None:
        weights = coefs
    if bias is None:
        bias = intercept
    # A row cut from a mixed-dtype frame is object-typed; cast it so the
    # dot product and the exponential run on real floats.
    features = np.asarray(id_data.values[1:-1], dtype=float)
    z = bias + np.dot(weights, features)
    prob = 1 / (1 + np.exp(-z))
    return prob

In [25]:
# Lung-cancer prediction
def lungCancerPrediction(raw_df, file_name):
    """Score every sample, plot the probabilities and print accuracy figures.

    Relies on names defined elsewhere in the notebook: ``selected_columns``
    (the columns to keep, including 'id' and 'class'), ``functionLR`` (the
    per-row scorer) and the plotly handles ``px`` / ``go``.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must contain every column listed in ``selected_columns``.
    file_name : str
        Currently unused by the active code; kept so existing callers keep
        working (it named the image file in a disabled export step).

    Returns
    -------
    pandas.DataFrame
        Columns 'id' (stringified, with a trailing '_'), 'class' (int),
        'prob' and 'pre_label' (the rounded probability), sorted by 'class'.
    """
    df = raw_df.loc[:, selected_columns]

    # Score each row directly instead of looking it up again by id: a lookup
    # is quadratic and always returns the first match when ids repeat.
    prob_list = [functionLR(row) for _, row in df.iterrows()]

    # Work on a real copy, and attach the probabilities on df's own index;
    # a bare pd.Series(prob_list) carries a 0..n-1 index and would be
    # misaligned (NaN) whenever raw_df does not use the default index.
    df_pro = df.loc[:, ['id', 'class']].copy()
    df_pro['prob'] = pd.Series(prob_list, index=df_pro.index, dtype=float)

    # Plotly needs string labels here: 'class' as a string matches the keys of
    # color_discrete_map; the '_' suffix on 'id' presumably keeps numeric-looking
    # ids from being drawn on a numeric axis — TODO confirm.
    df_pro['class'] = df_pro['class'].map(str)
    df_pro['id'] = df_pro['id'].map(lambda sample_id: str(sample_id) + '_')
    df_pro = df_pro.sort_values(by='class')

    fig = px.scatter(df_pro, x='id', y='prob',
                     color='class', template='ggplot2', color_discrete_sequence=['green', '#EE0000'],
                     color_discrete_map={'0': 'green', '1': '#EE0000'})
    # Dashed grey reference line at y = 0.5, drawn from the first to the last id.
    fig.add_trace(go.Scatter(x=[df_pro['id'].iloc[0], df_pro['id'].iloc[-1]], y=[0.5, 0.5],
                             mode='lines',
                             line=dict(color='grey', dash='dash'),
                             showlegend=False))
    fig.update_layout(width=1200, height=800)
    fig.show()

    # Predicted label is the probability rounded to the nearest integer.
    df_pro['pre_label'] = df_pro['prob'].map(lambda p: round(p))
    df_pro['class'] = df_pro['class'].map(lambda label: int(label))

    actual = df_pro['class']
    predicted = df_pro['pre_label']
    pre_0_right = int(((actual == 0) & (predicted == 0)).sum())
    pre_0_wrong = int(((actual == 0) & (predicted == 1)).sum())
    pre_1_right = int(((actual == 1) & (predicted == 1)).sum())
    pre_1_wrong = int(((actual == 1) & (predicted == 0)).sum())

    # Printed order: [class 0 correct, class 0 wrong, class 1 correct, class 1 wrong].
    print('总体准确率为%s' % ((pre_0_right+pre_1_right)/df_pro.shape[0]))
    print('混淆矩阵为%s' % [pre_0_right, pre_0_wrong, pre_1_right, pre_1_wrong])
    return df_pro

In [26]:
# Base name of the raw-data workbook (an interactive input() prompt was used here previously).
file_name = 'raw_data'
data_path = 'C:/Users/pc/OneDrive/PLTTECH/Project/20191205_lung_cancer/rawdata/%s' % file_name + '.xlsx'

# Variables used by the model: 'id' first, 'class' last, feature columns in between.
selected_columns = ['id', 'Lymphocytes/CD3+', 'Lymphocytes/CD3+/CD8+', 'Lymphocytes/CD3+/CD8+/Q2: 158Gd_CD197_CCR7+ , 155Gd_CD45RA+', 
                    'Lymphocytes/CD3-/B cells', 'Lymphocytes/CD3-/B cells/CD24+CD38+', 'Lymphocytes/CD3-/NK cells', 
                    'Monocytes', 'Myeloid cells/CD56-CD14-/DC cells/mDC', 'class']

# Model coefficients, one per feature column, in the same order as selected_columns[1:-1].
coefs = [-0.06017500752008219, -0.03648835670705105, -0.0001896681250317084, 0.023488732386436523, 
         0.03507360915289905, -0.04689464035150946, 0.05144025389377031, 0.05624861473318502]

intercept = -0.0014645371116683277

# Fail early, before any data is read, if the coefficients and the feature list drift apart.
assert len(coefs) == len(selected_columns) - 2, 'coefs must have one entry per feature column'

df = pd.read_excel(data_path)
# Multiply every column between the first and the last by 100
# (presumably fractions -> percentages, the scale the coefficients expect — TODO confirm).
df.iloc[:, 1:-1] = df.iloc[:, 1:-1].multiply(100)

plot_df = lungCancerPrediction(df, file_name)

总体准确率为0.9565217391304348
混淆矩阵为[44, 2, 44, 2]


In [27]:
# plot_df.to_excel('C:/Users/pc/Desktop/%s.xlsx'%file_name, index=False)

In [35]:
# Load the `test_healthy_new` workbook, keep only the model columns,
# scale the columns between the first and the last by 100, then score.
healthy_new = pd.read_excel(
    'C:/Users/pc/OneDrive/PLTTECH/Project/20191205_lung_cancer/rawdata/test_healthy_new.xlsx'
)
healthy_new = healthy_new.loc[:, selected_columns]
healthy_new.iloc[:, 1:-1] = healthy_new.iloc[:, 1:-1] * 100
healthy_new_df = lungCancerPrediction(healthy_new, file_name='healthy_new')

总体准确率为0.84472049689441
混淆矩阵为[136, 25, 0, 0]


In [36]:
# Load the `test_lung_cancer_new` workbook, keep only the model columns,
# scale the columns between the first and the last by 100, then score.
lung_cancer_new = pd.read_excel(
    'C:/Users/pc/OneDrive/PLTTECH/Project/20191205_lung_cancer/rawdata/test_lung_cancer_new.xlsx'
)
lung_cancer_new = lung_cancer_new.loc[:, selected_columns]
lung_cancer_new.iloc[:, 1:-1] = lung_cancer_new.iloc[:, 1:-1] * 100
lung_cancer_new_df = lungCancerPrediction(lung_cancer_new, file_name='lung_cancer_new')

总体准确率为0.8484848484848485
混淆矩阵为[0, 0, 28, 5]


In [38]:
# Load the `test_benign` workbook, keep only the model columns,
# scale the columns between the first and the last by 100, then score.
benign_new = pd.read_excel(
    'C:/Users/pc/OneDrive/PLTTECH/Project/20191205_lung_cancer/rawdata/test_benign.xlsx'
)
benign_new = benign_new.loc[:, selected_columns]
benign_new.iloc[:, 1:-1] = benign_new.iloc[:, 1:-1] * 100
benign_new_df = lungCancerPrediction(benign_new, file_name='benign_new')

总体准确率为1.0
混淆矩阵为[0, 0, 8, 0]


In [39]:
# Load the `test_PLT_healthy` workbook, keep only the model columns,
# scale the columns between the first and the last by 100, then score.
PLT_healthy_new = pd.read_excel(
    'C:/Users/pc/OneDrive/PLTTECH/Project/20191205_lung_cancer/rawdata/test_PLT_healthy.xlsx'
)
PLT_healthy_new = PLT_healthy_new.loc[:, selected_columns]
PLT_healthy_new.iloc[:, 1:-1] = PLT_healthy_new.iloc[:, 1:-1] * 100
PLT_healthy_new_df = lungCancerPrediction(PLT_healthy_new, file_name='PLT_healthy_new')

总体准确率为0.8333333333333334
混淆矩阵为[5, 1, 0, 0]


In [40]:
# Load the `test_pulmonary_nodule` workbook, keep only the model columns,
# scale the columns between the first and the last by 100, then score.
pulmonary_nodule = pd.read_excel(
    'C:/Users/pc/OneDrive/PLTTECH/Project/20191205_lung_cancer/rawdata/test_pulmonary_nodule.xlsx'
)
pulmonary_nodule = pulmonary_nodule.loc[:, selected_columns]
pulmonary_nodule.iloc[:, 1:-1] = pulmonary_nodule.iloc[:, 1:-1] * 100
pulmonary_nodule_df = lungCancerPrediction(pulmonary_nodule, file_name='pulmonary_nodule')

总体准确率为0.38461538461538464
混淆矩阵为[5, 8, 0, 0]
