In [57]:
import pandas as pd
from pprint import pprint
import re

In [3]:
# Read the source workbook into `df`; the path is relative to the working directory.
path = './data/data_with_segment.xlsx'
df = pd.read_excel(path)
# Last expression of the cell: display (rows, columns).
df.shape

(3629, 30)

In [7]:
def read_feature_matrix(method,category):
    """Load one feature-score workbook and report its shape.

    The workbook path is built from ``method`` and ``category``; the first
    column of the sheet becomes the index. The shape of the loaded frame is
    printed before the frame is returned.
    """
    template = './feature_matrix/{}_feature_{}.xlsx'
    matrix = pd.read_excel(template.format(method, category), index_col=0)
    print(matrix.shape)
    return matrix

In [10]:
# Load one feature matrix per (method, category) pair:
# methods chi / ig / mi, categories diagnose / operation.
# Each call prints the shape of the matrix it loaded.
chi_feature_diagnose = read_feature_matrix('chi','diagnose')
chi_feature_operation = read_feature_matrix('chi','operation')
ig_feature_diagnose = read_feature_matrix('ig','diagnose')
ig_feature_operation = read_feature_matrix('ig','operation')
mi_feature_diagnose = read_feature_matrix('mi','diagnose')
mi_feature_operation = read_feature_matrix('mi','operation')

(1690, 14)
(1343, 14)
(1690, 1)
(1343, 1)
(1690, 14)
(1343, 14)


In [37]:
def word_list(s):
    """Split a '|'-delimited string into its list of words."""
    return s.split('|')

def get_department_top_feature(df,department,col_name,feature_matrix,feature_matrix_col,top_k=14):
    """Rank the words used by one department by their feature score.

    Parameters
    ----------
    df : DataFrame
        Must contain a '病区' column and the column named by ``col_name``.
    department : value compared against df['病区'] to select rows.
    col_name : str
        Column whose cells are '|'-joined words.
    feature_matrix : DataFrame
        Indexed by word; scores are read from ``feature_matrix_col``.
    feature_matrix_col : str
        Column of ``feature_matrix`` holding the score to rank by.
    top_k : int or None, default 14
        Maximum number of entries to return. The default keeps the slice
        size this function always used; ``None`` returns every word.

    Returns
    -------
    list of (word, score) tuples, highest score first, at most ``top_k`` long.

    Raises
    ------
    KeyError
        If a collected word is not a label of ``feature_matrix.index``.
    """
    # Select only the needed column; missing cells are skipped instead of
    # crashing in word_list (a NaN cell has no .split).
    segmented = df.loc[df['病区'] == department, col_name].dropna()
    word_sets = {w for s in segmented for w in word_list(s)}
    print('单词数目:' + str(len(word_sets)))

    word_val = {w: feature_matrix.loc[w, feature_matrix_col] for w in word_sets}

    # Highest score first, then keep the leading top_k entries.
    ranked = sorted(word_val.items(), key=lambda x: x[1], reverse=True)
    return ranked[:top_k]

In [68]:
# Rank the words of '术前诊断_分词' for rows whose '病区' is '骨科病区' by the 'IG' score.
# Despite the `top10` in the name, the result shown below holds 14 entries.
word_val_diagnose_top10 = get_department_top_feature(df,'骨科病区','术前诊断_分词',ig_feature_diagnose,'IG')
pprint(word_val_diagnose_top10)

单词数目:439
[('椎', 0.2857892864482974),
 ('腰', 0.2414159259706277),
 ('腰椎', 0.2050822239529904),
 ('肺', 0.1754219573510967),
 ('骨', 0.1719492582150037),
 ('肾', 0.1559245102301454),
 ('病', 0.1366411161603884),
 ('动脉', 0.1361115279763081),
 ('炎', 0.1278894667390431),
 ('慢性', 0.1252601593639895),
 ('椎间盘', 0.101881649962261),
 ('症', 0.09721993524021944),
 ('性', 0.09455942723679511),
 ('盘', 0.09447569650328891)]


In [48]:
# Same ranking as for the diagnosis column, applied to '实施手术_分词' with the operation IG matrix.
# Despite the `top10` in the name, the result shown below holds 14 entries.
word_val_operation_top10 = get_department_top_feature(df,'骨科病区','实施手术_分词',ig_feature_operation,'IG')
pprint(word_val_operation_top10)

单词数目:413
[('腔', 0.3306290030923242),
 ('镜', 0.3200104826362615),
 ('胸', 0.2554618989040649),
 ('下', 0.1896141336704238),
 ('置换', 0.1890431834349737),
 ('置换术', 0.1808097155844388),
 ('关节', 0.1760090260390794),
 ('除术', 0.1714929983433637),
 ('切除术', 0.1697775165395541),
 ('切除', 0.164219936614733),
 ('根', 0.1611314917658828),
 ('膜', 0.1550412507450742),
 ('人工', 0.1507160332836333),
 ('椎', 0.1498002418132356)]


In [71]:
def word_dictionary(df,col):
    """Collect the distinct words found in a '|'-delimited column.

    Every cell of ``df[col]`` is split with ``word_list`` and the words are
    gathered into a set. The number of distinct words is printed, then the
    set is returned.
    """
    vocabulary = set()
    for cell in df[col]:
        vocabulary.update(word_list(cell))
    print('获得单词:'+str(len(vocabulary)))
    return vocabulary

# Estimate the operation duration from the BIS recording
def get_operation_time(df, seconds_per_sample=30):
    """Add a '手术时间' column (in hours) derived from the 'BIS' column, in place.

    Each 'BIS' cell is read as a whitespace-separated sequence of samples.
    The duration is ``sample count * seconds_per_sample / 3600``.

    Parameters
    ----------
    df : DataFrame
        Must contain a 'BIS' column; it gains (or overwrites) '手术时间'.
    seconds_per_sample : number, default 30
        Time represented by one sample. The default is the constant this
        function always used (presumably one BIS reading every 30 seconds;
        confirm against the recording device).

    Returns
    -------
    None. ``df`` is modified in place.

    Notes
    -----
    Only non-empty tokens are counted, so leading/trailing whitespace and
    empty cells no longer add phantom samples. A missing BIS value gives a
    NaN duration instead of being counted as one sample (the text 'nan').
    """
    def estimate_hours(s):
        if pd.isna(s):
            return float('nan')
        samples = re.findall(r'\S+', str(s))
        return len(samples) * seconds_per_sample / 3600

    df['手术时间'] = df['BIS'].apply(estimate_hours)
    
# Take the first K words of a feature-selection result (K is 1000 where this notebook calls it)
def get_topK(feature_matrix,K):
    """Return the first ``K`` index labels of ``feature_matrix`` as a set.

    The labels are taken in the order the index already has; no sorting is
    done here (assumes the matrix is stored ranked best-first; TODO confirm).
    """
    leading_labels = feature_matrix.index[:K]
    return set(leading_labels)

In [23]:
# Set of distinct words appearing in '实施手术_分词'; the call also prints the count.
dictionary = word_dictionary(df,'实施手术_分词')

获得单词:1343


In [58]:
# Adds the '手术时间' column to `df` in place (the function returns nothing).
get_operation_time(df)

In [59]:
# Columns carried over into the working frame.
cols = ['手术ID','姓名','性别','年龄','体重','身高','病区','实施手术','实施手术_分词','术前诊断','术前诊断_分词','监测ID','手术时间']
# Copy so that columns added to `df_ori` later do not touch `df`.
df_ori = df.filter(cols).copy()
df_ori.shape

(3629, 13)

In [60]:
# Preview the first rows of the working frame.
# NOTE(review): the rendered output includes the '姓名' column; consider clearing outputs before sharing.
df_ori.head()

Unnamed: 0,手术ID,姓名,性别,年龄,体重,身高,病区,实施手术,实施手术_分词,术前诊断,术前诊断_分词,监测ID,手术时间
0,408485,邓煊雅,女,36,66.0,162.0,妇产科病区,二次剖宫产术,二次|剖宫产术,1.妊娠38+2周孕6产1晚孕、 2.瘢痕子宫、 3.右侧附件囊肿；,妊娠|周孕|产|晚孕|瘢痕|子宫|右侧|附件|囊肿,252525,1.808333
1,408486,刘书,女,51,58.0,160.0,心血管病区,房颤消融改良迷宫术+人工二尖瓣置换术,房颤|消融|改良|迷宫术|人工|二尖瓣|置换术,风心病,风心病,252556,4.758333
2,408487,杨仁秀,女,66,47.0,144.5,心血管病区,再次二尖瓣置换术,二尖瓣|置换术,心脏瓣膜置换术后 瓣膜功能障碍,心脏|瓣膜|置换术|后|瓣膜|功能障碍,252599,6.65
3,408491,山永新,男,47,102.0,171.0,骨科病区,经皮椎间孔镜下腰5骶1椎间盘髓核切除术,经皮椎间|孔镜|下腰|骶|椎间盘|髓|核|切除术,腰椎管狭窄症,腰椎|狭窄|症,252538,3.958333
4,408494,李上均,男,46,55.0,173.0,骨科病区,腰4/5MIS-TLIF术,腰|MIS-TLIF术,腰椎间盘突出,腰椎间盘,252616,2.875


In [72]:
# For each feature matrix, keep the first 1000 index labels as a set.
# generate_cols reads these names as module-level globals.
chi_feature_diagnose_top1000 = get_topK(chi_feature_diagnose,1000)
chi_feature_operation_top1000 = get_topK(chi_feature_operation,1000)
mi_feature_diagnose_top1000 = get_topK(mi_feature_diagnose,1000)
mi_feature_operation_top1000 = get_topK(mi_feature_operation,1000)
ig_feature_diagnose_top1000 = get_topK(ig_feature_diagnose,1000)
ig_feature_operation_top1000 = get_topK(ig_feature_operation,1000)

In [76]:
# Build the six feature-selected columns
def generate_cols(df):
    """Add six filtered word columns to ``df`` in place.

    For each selection method (CHI, MI, IG), the '|'-joined words of
    '实施手术_分词' and '术前诊断_分词' are reduced to the words contained in the
    matching module-level ``*_top1000`` collection and re-joined with '|'.
    Word order within a cell is preserved. Nothing is returned.
    """
    def filtered(source_col, vocabulary):
        # Keep only the words of each cell that belong to `vocabulary`.
        return df[source_col].apply(
            lambda cell: '|'.join([w for w in cell.split('|') if w in vocabulary])
        )

    df['实施手术_CHI'] = filtered('实施手术_分词', chi_feature_operation_top1000)
    df['术前诊断_CHI'] = filtered('术前诊断_分词', chi_feature_diagnose_top1000)
    df['实施手术_MI'] = filtered('实施手术_分词', mi_feature_operation_top1000)
    df['术前诊断_MI'] = filtered('术前诊断_分词', mi_feature_diagnose_top1000)
    df['实施手术_IG'] = filtered('实施手术_分词', ig_feature_operation_top1000)
    df['术前诊断_IG'] = filtered('术前诊断_分词', ig_feature_diagnose_top1000)

In [77]:
# Adds the six *_CHI / *_MI / *_IG columns to `df_ori` in place.
generate_cols(df_ori)

In [81]:
# For each method, print the shape of the subset of rows whose filtered
# operation column differs from the unfiltered '实施手术_分词'.
print(df_ori[df_ori['实施手术_CHI'] != df_ori['实施手术_分词']].shape)
print(df_ori[df_ori['实施手术_IG'] != df_ori['实施手术_分词']].shape)
print(df_ori[df_ori['实施手术_MI'] != df_ori['实施手术_分词']].shape)

(431, 19)
(313, 19)
(2922, 19)


In [82]:
# Display the rows where the MI-filtered operation column differs from the unfiltered one.
# NOTE(review): the rendered output includes the '姓名' column; consider clearing outputs before sharing.
df_ori[df_ori['实施手术_MI'] != df_ori['实施手术_分词']]

Unnamed: 0,手术ID,姓名,性别,年龄,体重,身高,病区,实施手术,实施手术_分词,术前诊断,术前诊断_分词,监测ID,手术时间,实施手术_CHI,术前诊断_CHI,实施手术_MI,术前诊断_MI,实施手术_IG,术前诊断_IG
3,408491,山永新,男,47,102.0,171.0,骨科病区,经皮椎间孔镜下腰5骶1椎间盘髓核切除术,经皮椎间|孔镜|下腰|骶|椎间盘|髓|核|切除术,腰椎管狭窄症,腰椎|狭窄|症,252538,3.958333,经皮椎间|孔镜|下腰|骶|椎间盘|髓|核|切除术,腰椎|狭窄|症,,,经皮椎间|孔镜|下腰|骶|椎间盘|髓|核|切除术,腰椎|狭窄|症
4,408494,李上均,男,46,55.0,173.0,骨科病区,腰4/5MIS-TLIF术,腰|MIS-TLIF术,腰椎间盘突出,腰椎间盘,252616,2.875000,腰|MIS-TLIF术,腰椎间盘,,,腰|MIS-TLIF术,腰椎间盘
5,408495,刘清华,女,55,60.3,156.5,妇产科病区,腹腔镜下全子宫+双侧附件切除术+盆腔淋巴结清扫术,腹腔镜|下全|子宫|双侧|附件|切除术|盆腔|淋巴结|清扫术,子宫内膜腺癌,子宫|内膜|腺癌,252566,2.866667,腹腔镜|下全|子宫|双侧|附件|切除术|盆腔|淋巴结|清扫术,子宫|内膜|腺癌,下全|子宫|双侧|附件|盆腔|清扫术,,腹腔镜|下全|子宫|双侧|附件|切除术|盆腔|淋巴结|清扫术,子宫|内膜|腺癌
6,408496,童明贵,男,54,46.2,164.0,骨科病区,微创下经关节突椎间融合术（MIS-TLIF术）,微创|下经|关节|突椎间|融合术|MIS-TLIF术,腰椎管狭窄伴腰椎间盘突出,腰椎|狭窄|伴|腰椎间盘,252589,2.433333,微创|下经|关节|突椎间|融合术|MIS-TLIF术,腰椎|狭窄|伴|腰椎间盘,,,微创|下经|关节|突椎间|融合术|MIS-TLIF术,腰椎|狭窄|伴|腰椎间盘
7,408497,向家芬,女,76,62.0,150.0,肝胆外科病区,肝胆总管切开取石术+空肠RouxY吻合术,肝胆|总管|切开|取石术|空肠|ROUXY|吻合术,1、返流性胆管炎；2、胆总管十二指肠吻合术后；3、胆囊切除术后；4、高血压,返流性|胆管炎|胆总管|十二指肠|吻合术|后|胆囊|切除术|后|高血压,252598,5.158333,总管|切开|取石术|空肠|吻合术,胆管炎|胆总管|十二指肠|后|胆囊|后|高血压,肝胆|总管|取石术|空肠|ROUXY|吻合术,返流性|胆管炎|胆总管|十二指肠|吻合术|胆囊|切除术|高血压,总管|切开|取石术|空肠|吻合术,胆管炎|胆总管|十二指肠|后|胆囊|切除术|后|高血压
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3613,416161,何明睿,男,5,21.0,118.0,神经外科病区,全脑血管造影术,全|脑血管|造影术,脑出血,脑出血,257868,1.591667,全|脑血管|造影术,脑出血,脑血管|造影术,脑出血,全|脑血管|造影术,脑出血
3617,416196,朱萍,女,61,65.0,170.0,骨科病区,锁骨骨折切开复位内固定术,锁骨|骨折|切开|复位|内|固定术,左锁骨中段骨折,左|锁骨|中段|骨折,257948,1.741667,骨折|切开|复位|内|固定术,左|中段|骨折,,中段,锁骨|骨折|切开|复位|内|固定术,左|锁骨|中段|骨折
3618,416204,田茂荣,男,52,72.0,168.5,泌尿外科中心病区,阴茎海绵体损伤修补术,阴茎|海绵体|损伤|修补术,阴茎海绵体损伤,阴茎|海绵体|损伤,257949,1.608333,阴茎|损伤|修补术,阴茎|损伤,阴茎|海绵体|修补术,阴茎|海绵体,阴茎|损伤|修补术,阴茎|损伤
3624,416230,王璐,女,52,48.0,153.0,骨科病区,左侧全髋关节置换术后脱位复位,左侧|全|髋关节|置换术|后|脱位|复位,左侧全髋关节置换术后脱位,左侧|全|髋关节|置换术|后|脱位,257974,0.625000,左侧|全|髋关节|置换术|后|脱位|复位,左侧|全|髋关节|置换术|后|脱位,置换术,全,左侧|全|髋关节|置换术|后|脱位|复位,左侧|全|髋关节|置换术|后|脱位


In [83]:
# Write the working frame to disk; `index` is left at its default, so the row index is written too.
df_ori.to_excel('./data/data_regression.xlsx')