In [6]:
import pandas as pd
from pprint import pprint
import math

In [2]:
path = './data/data_with_time_series.xlsx'
df = pd.read_excel(path)
df.shape

(3629, 30)

In [3]:
df_ori = df.filter(['手术ID','病区','实施手术_分词','术前诊断_分词']).copy()
df_ori.shape

(3629, 4)

In [4]:
# 返回以列表形式表示的分词结果
def word_list(s):
    return s.split('|')


def word_dictionary(df,col):   
    ans = set()
    def handle(s):
        for w in word_list(s):
            ans.add(w)
    df[col].apply(handle)
    print('获得单词:'+str(len(ans)))
    return ans

def get_departments(df):
    return list(set(df['病区']))

In [23]:
# 计算每一个单词的互信息值
def MI(df,word,department,word_col_name):
    N = df.shape[0]
    A = df[(df['病区'] == department) & (df[word_col_name].str.contains(word))].shape[0]
    B = df[~(df['病区'] == department) & (df[word_col_name].str.contains(word))].shape[0]
    C = df[(df['病区'] == department) & ~(df[word_col_name].str.contains(word))].shape[0]
    if A == 0:
        return 0
    numerator = N*A
    denominator = (A+B)*(A+C)
    return round(math.log2(numerator / denominator),4)

# 计算feature_matrix矩阵
def get_feature_matrix(df,departments,words,word_col_name):
    feature_matrix = pd.DataFrame(index=words,columns=departments,dtype='float')
    count = 1
    for w in words:
        if count % 100 == 0:
            print('进度:' + str(count))
        row = []
        for c in departments:
            row.append(MI(df,w,c,word_col_name))
        feature_matrix.loc[w] = row
        count += 1
    feature_matrix['MI_MAX'] = feature_matrix.max(axis=1)
    feature_matrix.sort_values('MI_MAX',ascending=False,inplace=True)
    return feature_matrix

In [7]:
cols = get_departments(df_ori)

In [9]:
dictionary_diagnose = word_dictionary(df,'术前诊断_分词')

获得单词:1690


In [17]:
MI(df_ori,'妊娠','妇产科病区','术前诊断_分词')

2.589

In [18]:
mi_feature_matrix_diagnose = get_feature_matrix(df_ori,cols,dictionary_diagnose,'术前诊断_分词')

进度:100
进度:200
进度:300
进度:400
进度:500
进度:600
进度:700
进度:800
进度:900
进度:1000
进度:1100
进度:1200
进度:1300
进度:1400
进度:1500
进度:1600


In [20]:
mi_feature_matrix_diagnose.tail()

Unnamed: 0,泌尿外科中心病区,耳鼻咽喉科病区,肝胆外科病区,神经内科病区,妇产科病区,整形外科病区,骨科病区,眼科病区,心血管病区,普通外科病区,普通胸外科病区,神经外科病区,口腔科病区,CHI_MAX
术后,0.6615,-0.8712,-0.6714,-1.981,-0.7645,1.0696,0.6599,0.4847,-1.8195,0.2691,-1.138,0.1565,-0.0299,1.0696
左,0.858,0.2443,-3.1998,-0.8655,-2.649,-0.1368,0.3504,0.8632,-0.7889,-0.6378,0.852,0.5781,0.986,0.986
综合征,0.118,-0.5078,0.0,0.0,0.8213,0.0,0.5804,0.0,0.3215,-1.0679,0.0,-0.9395,0.0,0.8213
征,0.0591,-0.5667,0.0,0.0,0.7624,0.0,0.6914,0.0,0.2626,-1.1268,0.0,-0.9984,0.0,0.7624
L,0.118,0.0,0.0,0.0,0.5989,0.0,0.7503,0.0,0.3215,0.517,0.0,0.0,0.0,0.7503


In [21]:
dictionary_operation = word_dictionary(df_ori,'实施手术_分词')

获得单词:1343


In [28]:
mi_feature_matrix_operation = get_feature_matrix(df_ori,cols,dictionary_operation,'实施手术_分词')

进度:100
进度:200
进度:300
进度:400
进度:500
进度:600
进度:700
进度:800
进度:900
进度:1000
进度:1100
进度:1200
进度:1300


In [29]:
mi_feature_matrix_operation.tail()

Unnamed: 0,泌尿外科中心病区,耳鼻咽喉科病区,肝胆外科病区,神经内科病区,妇产科病区,整形外科病区,骨科病区,眼科病区,心血管病区,普通外科病区,普通胸外科病区,神经外科病区,口腔科病区,MI_MAX
左,0.8317,0.1838,-1.3647,0.4551,-1.5113,0.1134,0.212,1.2833,-0.7127,-0.5547,0.9932,-0.1507,0.8718,1.2833
切,0.3631,0.3492,1.2535,0.0,-0.0073,-1.4648,-0.523,-0.4648,-3.5763,0.0115,0.4291,0.0815,0.828,1.2535
右侧,0.7454,0.8041,0.0,1.2469,-0.411,-0.7469,0.2334,0.0,0.0,0.659,0.0,0.0505,1.1536,1.2469
全,-2.1087,-4.3194,-3.4416,0.3781,1.1565,-0.9636,0.6016,0.0,-2.2677,-0.0722,-4.1712,0.6412,-3.0631,1.1565
左侧,1.0667,0.9173,0.0,0.6149,-0.391,-0.7268,0.0279,0.0,-4.8383,0.4864,-3.9344,0.2405,0.7586,1.0667


In [26]:
mi_feature_matrix_diagnose.to_excel('./feature_matrix/mi_feature_diagnose.xlsx')

In [30]:
mi_feature_matrix_operation.to_excel('./feature_matrix/mi_feature_operation.xlsx')