In [9]:
import math
from collections import Counter

import numpy as np
import pandas as pd

In [2]:
# Load the dataset from the Excel workbook; the path is relative to the
# notebook's working directory.
path = './data/data_regression.xlsx'
df = pd.read_excel(path)
# Display (rows, columns) as a quick sanity check on the load.
df.shape

(3629, 19)

In [10]:
# Build the corpus; returns a dict
def get_corpus(df):
    """Build a token corpus keyed by operation ID.

    Rows in which both segmented-text columns ('实施手术_分词' and
    '术前诊断_分词') are null are dropped. For every remaining row the two
    '|'-separated token strings are joined and split back into a list of
    non-empty tokens. Sample counts are printed along the way.

    Args:
        df: DataFrame with the columns '手术ID', '实施手术_分词' and
            '术前诊断_分词'. It is not modified.

    Returns:
        dict mapping each '手术ID' value to its list of tokens. If an ID
        occurs more than once, the last row wins.
    """
    print('样本个数：{}'.format(df.shape[0]))
    # Keep a row as long as at least one of the two text columns has a value.
    kept = df.dropna(how='all', subset=['实施手术_分词', '术前诊断_分词'])
    print('去除空值后的样本个数：{}'.format(kept.shape[0]))
    # Remaining nulls become empty strings so the string concatenation below
    # never produces NaN.
    kept = kept.fillna(value='')

    operation_ids = list(kept['手术ID'])
    joined_texts = list(kept['实施手术_分词'] + '|' + kept['术前诊断_分词'])

    # Splitting on '|' yields empty strings for blank cells or doubled
    # separators; those are filtered out.
    ans = {
        operation_id: [token for token in text.split('|') if token != '']
        for text, operation_id in zip(joined_texts, operation_ids)
    }
    print('语料长度：{}'.format(len(ans)))
    return ans

# Compute the TF-IDF value of every word in every document
def tf_idf(corpus):
    """Compute a TF-IDF weight for each distinct word of each document.

    The weight of word ``w`` in a document is
    ``TF * log10((D + 1) / (d + 1) + 1)`` where ``TF`` is the number of
    times ``w`` occurs in that document, ``D`` is the number of documents
    and ``d`` is the number of documents containing ``w``. Note that the
    trailing ``+ 1`` sits inside the logarithm; that formula is kept as is.

    The document count is printed first, and a progress counter is printed
    every 500 documents.

    Args:
        corpus: dict mapping a document key to its list of (hashable) tokens.

    Returns:
        dict mapping each document key to a ``{word: weight}`` dict, with
        words in order of first appearance in the document.
    """
    ans = {}
    D = len(corpus)
    print('文档总数：{}'.format(D))

    # Document frequencies are computed once in a single pass instead of
    # rescanning the whole corpus for every word occurrence.
    doc_freq = Counter()
    for words in corpus.values():
        doc_freq.update(set(words))  # set(): count each document at most once

    for count, (operation_id, words) in enumerate(corpus.items(), start=1):
        if count % 500 == 0:
            print(count)
        # Counter keeps first-occurrence order and gives the raw term count,
        # so repeated words are weighted once rather than recomputed.
        ans[operation_id] = {
            w: TF * math.log10((D + 1) / (doc_freq[w] + 1) + 1)
            for w, TF in Counter(words).items()
        }
    return ans

In [5]:
# Build the corpus dict (operation ID -> token list) from the loaded frame.
corpus = get_corpus(df)

样本个数：3629
去除空值后的样本个数：3629
语料长度：3629


In [11]:
# NOTE(review): this rebinds the name `tf_idf` from the function to the dict
# it returns, so running this cell a second time without first re-running the
# cell that defines the function fails (a dict is not callable).
tf_idf = tf_idf(corpus)

文档总数：3629
500
1000
1500
2000
2500
3000
3500


In [13]:
# Save and load the model (the TF-IDF weights).
# The value saved is a dict, so it comes back from np.load as an array
# wrapping that dict. Assumes the ./tf_idf/ directory already exists.
np.save('./tf_idf/tf_idf.npy',tf_idf)

In [15]:
# Reload the saved weights. allow_pickle=True is required because the file
# holds a pickled Python object; unpickling can execute arbitrary code, so
# only load files from a trusted source.
t = np.load('./tf_idf/tf_idf.npy',allow_pickle=True)

In [18]:
# Inspect the loaded object: an array wrapping the saved dict.
t

array({408485: {'二次': 1.5276647294115475, '剖宫产术': 1.2505555287515302, '妊娠': 1.130888069121501, '周孕': 1.2925968281271185, '产': 1.2821687783046416, '晚孕': 1.6210361717140598, '瘢痕': 1.5802498279894488, '子宫': 1.026477992226907, '右侧': 0.9696808850334337, '附件': 1.593402671795464, '囊肿': 1.5201413836310023}, 408486: {'房颤': 2.1160856697981787, '消融': 2.385606273598312, '改良': 1.8500804405331968, '迷宫术': 2.3314868419355075, '人工': 1.1537592461742663, '二尖瓣': 1.3326096104525607, '置换术': 1.0828182976057417, '风心病': 1.7748700028556206}, 408487: {'二尖瓣': 1.3326096104525607, '置换术': 2.1656365952114833, '心脏': 2.148032839090976, '瓣膜': 4.522525737584987, '后': 1.4213507677359336, '功能障碍': 2.958324931644053}, 408491: {'经皮椎间': 2.182557301304913, '孔镜': 2.0863598306747484, '下腰': 2.4154503326227226, '骶': 1.754701086087989, '椎间盘': 1.6024100876644571, '髓': 1.8260748027008264, '核': 1.8339258617018428, '切除术': 0.6449172896891979, '腰椎': 1.2821687783046416, '狭窄': 1.2449037962544727, '症': 1.2058061287440867}, 408494: {'腰': 1.40

In [22]:
# .item() unwraps the array and returns the original dict.
t.item()

{408485: {'二次': 1.5276647294115475,
  '剖宫产术': 1.2505555287515302,
  '妊娠': 1.130888069121501,
  '周孕': 1.2925968281271185,
  '产': 1.2821687783046416,
  '晚孕': 1.6210361717140598,
  '瘢痕': 1.5802498279894488,
  '子宫': 1.026477992226907,
  '右侧': 0.9696808850334337,
  '附件': 1.593402671795464,
  '囊肿': 1.5201413836310023},
 408486: {'房颤': 2.1160856697981787,
  '消融': 2.385606273598312,
  '改良': 1.8500804405331968,
  '迷宫术': 2.3314868419355075,
  '人工': 1.1537592461742663,
  '二尖瓣': 1.3326096104525607,
  '置换术': 1.0828182976057417,
  '风心病': 1.7748700028556206},
 408487: {'二尖瓣': 1.3326096104525607,
  '置换术': 2.1656365952114833,
  '心脏': 2.148032839090976,
  '瓣膜': 4.522525737584987,
  '后': 1.4213507677359336,
  '功能障碍': 2.958324931644053},
 408491: {'经皮椎间': 2.182557301304913,
  '孔镜': 2.0863598306747484,
  '下腰': 2.4154503326227226,
  '骶': 1.754701086087989,
  '椎间盘': 1.6024100876644571,
  '髓': 1.8260748027008264,
  '核': 1.8339258617018428,
  '切除术': 0.6449172896891979,
  '腰椎': 1.2821687783046416,
  '狭窄': 1.244