In [1]:
import pandas as pd
from pprint import pprint
import math

In [2]:
# Load the source workbook into a DataFrame; the trailing expression displays
# its (rows, columns) shape.
path = './data/data_with_time_series.xlsx'
df = pd.read_excel(path)
df.shape

(3629, 30)

In [3]:
# Keep only the four columns used below (surgery ID, ward, tokenized surgery
# name, tokenized pre-operative diagnosis) as an independent copy of df.
df_ori = df.filter(['手术ID','病区','实施手术_分词','术前诊断_分词']).copy()
df_ori.shape

(3629, 4)

In [4]:
# Return the segmentation result as a list of tokens.
def word_list(s):
    """Split a '|'-delimited segmentation string into its tokens.

    Args:
        s: String whose tokens are separated by '|'.

    Returns:
        list[str]: The tokens in their original order. A string without a
        separator yields a single-element list.
    """
    separator = '|'
    tokens = s.split(separator)
    return tokens


def word_dictionary(df,col):
    """Collect the set of distinct tokens found in one segmented column.

    Args:
        df: DataFrame holding the segmented text.
        col: Name of the column whose cells are '|'-delimited token strings.

    Returns:
        set: Every distinct token produced by ``word_list`` over the column.

    Side effects:
        Prints the number of distinct tokens found.
    """
    vocabulary = {token for cell in df[col] for token in word_list(cell)}
    print('获得单词:'+str(len(vocabulary)))
    return vocabulary

def get_departments(df):
    """Return the distinct wards found in the '病区' column.

    Missing values are dropped: a NaN ward never compares equal to any row, so
    it would only contribute an empty class downstream. Wards are returned in
    order of first appearance, which keeps the result reproducible across
    kernel restarts (iteration order of a set of strings is not).

    Args:
        df: DataFrame with a '病区' (ward) column.

    Returns:
        list: Unique, non-null ward values in first-appearance order.
    """
    return df['病区'].dropna().drop_duplicates().tolist()

In [50]:
# Given the list of outcome probabilities of an event, compute its entropy.
def entropy(p_list):
    """Compute the Shannon entropy (in bits) of a probability distribution.

    Args:
        p_list: Iterable of probabilities.

    Returns:
        float: ``-sum(p * log2(p))`` over the non-zero entries; 0.0 for an
        empty input.

    Raises:
        ValueError: If an entry is negative (``math.log2`` rejects it).
    """
    ans = 0.0
    for p in p_list:
        # Convention 0 * log2(0) == 0: an impossible outcome carries no
        # entropy, and math.log2(0) would otherwise raise ValueError.
        if p == 0:
            continue
        ans += (-1) * p * math.log2(p)
    return ans

# Compute the probability with which each ward occurs.
def get_department_prob(df):
    """Return the relative frequency of every ward in ``df``.

    Args:
        df: DataFrame with a '病区' (ward) column.

    Returns:
        list[float]: For each ward returned by ``get_departments(df)``, in
        that order, the number of rows in that ward divided by the total
        number of rows.
    """
    n_rows = df.shape[0]
    ward = df['病区']
    return [len(df[ward == name]) / n_rows for name in get_departments(df)]

# 
def IG(word,entropy_C,df,departments,word_col_name):
    """Compute the information gain of ``word`` with respect to the ward label.

    The result is ``entropy_C - H(C | word)``, where the conditional entropy is
    accumulated per ward from the 2x2 contingency counts:

        A: rows containing the word, in the ward
        B: rows containing the word, not in the ward
        C: rows not containing the word, in the ward
        D: rows not containing the word, not in the ward

    Args:
        word: Token to score. It is matched as a literal substring of the
            cell text, not as a regular expression, so tokens containing
            characters such as '+', '.', or '(' are handled correctly.
        entropy_C: Entropy of the ward distribution, H(C).
        df: DataFrame with a '病区' (ward) column and the text column.
        departments: Iterable of ward values to sum over.
        word_col_name: Name of the column holding the '|'-delimited text.

    Returns:
        float: The information gain of ``word``.
    """
    N = df.shape[0]
    # The word mask does not depend on the ward, so build it once.
    # regex=False makes the match literal; na=False treats missing text as
    # "word absent" so the mask stays purely boolean.
    has_word = df[word_col_name].str.contains(word, regex=False, na=False)
    lacks_word = ~has_word
    entropy_C_w = 0.0
    for department in departments:
        in_department = df['病区'] == department
        out_department = ~in_department
        # int(...) keeps the arithmetic (and the return value) in plain
        # Python numbers rather than numpy scalars.
        A = int((has_word & in_department).sum())
        B = int((has_word & out_department).sum())
        C = int((lacks_word & in_department).sum())
        D = int((lacks_word & out_department).sum())
        left = 0.0
        right = 0.0
        # Empty cells contribute nothing (0 * log 0 == 0) and are skipped to
        # avoid log2(0) and division by zero.
        if A != 0:
            left = -(A / N) * math.log2(A / (A + B))
        if C != 0:
            right = - (C / N) * math.log2(C / (C + D))
        entropy_C_w +=  left + right
    return entropy_C - entropy_C_w



def get_feature_matrix(words,entropy_C,df,departments,word_col_name):
    """Score every word by information gain and return the ranked table.

    Args:
        words: Collection of tokens; they become the index of the result.
        entropy_C: Entropy of the ward distribution, forwarded to ``IG``.
        df: DataFrame forwarded to ``IG``.
        departments: Ward values forwarded to ``IG``.
        word_col_name: Name of the text column forwarded to ``IG``.

    Returns:
        pandas.DataFrame: A single float column 'IG' indexed by word, sorted
        from highest to lowest information gain.

    Side effects:
        Prints a progress line after every 100th word.
    """
    scores = pd.DataFrame(index=words, columns=['IG'], dtype='float')
    for position, token in enumerate(words, start=1):
        if position % 100 == 0:
            print('进度:'+str(position))
        scores.loc[token] = IG(token, entropy_C, df, departments, word_col_name)
    return scores.sort_values('IG', ascending=False)

In [35]:
# Entropy of the ward distribution over df_ori; passed to IG / get_feature_matrix below.
entropy_C = entropy(get_department_prob(df_ori))

In [36]:
# Display the ward entropy computed above.
entropy_C

3.2377408948578252

In [38]:
# Distinct wards present in df_ori.
departments = get_departments(df_ori)

In [46]:
# Spot-check: information gain of the token 'L' in the tokenized pre-operative diagnosis column.
IG('L',entropy_C,df_ori,departments,'术前诊断_分词')

0.0012156303873722685

In [48]:
# Set of all distinct tokens in the tokenized pre-operative diagnosis column.
word_diagnose = word_dictionary(df_ori,'术前诊断_分词')

获得单词:1690


In [51]:
# Score every diagnosis token by information gain (prints progress every 100 tokens).
ig_feature_matrix_diagnose = get_feature_matrix(word_diagnose,entropy_C,df_ori,departments,'术前诊断_分词')

进度:100
进度:200
进度:300
进度:400
进度:500
进度:600
进度:700
进度:800
进度:900
进度:1000
进度:1100
进度:1200
进度:1300
进度:1400
进度:1500
进度:1600


In [52]:
# First rows of the diagnosis table, i.e. the tokens with the highest information gain.
ig_feature_matrix_diagnose.head()

Unnamed: 0,IG
宫,0.303034
椎,0.285789
腰,0.241416
心,0.240599
结石,0.24024


In [53]:
# Set of all distinct tokens in the tokenized surgery-name column.
word_operation = word_dictionary(df_ori,'实施手术_分词')

获得单词:1343


In [54]:
# Score every surgery-name token by information gain (prints progress every 100 tokens).
ig_feature_matrix_operation = get_feature_matrix(word_operation,entropy_C,df_ori,departments,'实施手术_分词')

进度:100
进度:200
进度:300
进度:400
进度:500
进度:600
进度:700
进度:800
进度:900
进度:1000
进度:1100
进度:1200
进度:1300


In [55]:
# Display the surgery-name table, ordered from highest to lowest information gain.
ig_feature_matrix_operation

Unnamed: 0,IG
腔,0.330629
腹,0.328331
镜,0.320010
腹腔镜,0.289260
腹腔,0.286722
...,...
管髓,0.000597
PLIF术,0.000597
网植骨,0.000597
左先,0.000597


In [56]:
# Write the diagnosis information-gain table to an Excel file under ./feature_matrix/.
ig_feature_matrix_diagnose.to_excel('./feature_matrix/ig_feature_diagnose.xlsx')

In [57]:
# Write the surgery-name information-gain table to an Excel file under ./feature_matrix/.
ig_feature_matrix_operation.to_excel('./feature_matrix/ig_feature_operation.xlsx')