In [1]:
import pandas as pd
import jieba
from pprint import pprint
import re

In [7]:
# Load the stop-word list.
def load_stopwords(path='./stopwords/final_stopwords.txt'):
    """Read a stop-word file into a set.

    Args:
        path: Location of a UTF-8 text file holding one stop word per line.
            Defaults to the path this notebook has always used.

    Returns:
        set[str]: Every line of the file with surrounding whitespace
        stripped. A blank line contributes the empty string, exactly as
        before.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle even if reading fails midway.
    with open(path, encoding='utf-8') as f:
        return {line.strip() for line in f}

# Pre-process one sentence before segmentation.
def handle_text(text):
    """Normalise a raw value into '#'-delimited text.

    The value is coerced to ``str``, a fixed list of Roman-numeral and
    digit spellings is rewritten to Chinese numerals, the result is
    upper-cased, and every run of characters that is not a CJK ideograph
    in the matched range, an upper-case ASCII letter or a hyphen is
    collapsed into a single '#'.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        str: The normalised text.
    """
    # Applied strictly in this order: 'III' has to be consumed before 'II',
    # and 'II' before the single 'I' of the final rule. Chinese numerals are
    # used because digits would be turned into '#' by the last step.
    substitutions = (
        (r'Ⅲ|III', '三'),
        (r'Ⅱ|II', '二'),
        (r'Ⅳ', '四'),
        (r'3级', '三级'),
        (r'2级', '二级'),
        (r'1级', '一级'),
        (r'2型', '二型'),
        (r'1型', '一型'),
        (r'[I1]期', '一期'),
    )
    normalised = str(text)
    for pattern, replacement in substitutions:
        normalised = re.sub(pattern, replacement, normalised)
    # Lower-case letters become upper-case so they survive the filter below.
    normalised = normalised.upper()
    # Everything that is not kept becomes a '#' separator.
    return re.sub(r'[^\u4e00-\u9fa5A-Z-]+', '#', normalised)

# Extract candidate domain terms from '#'-delimited text.
def extra_dict(text):
    """Collect terms that should be added to the segmentation dictionary.

    Two kinds of substrings are collected:

    * a run of digits/ASCII letters immediately followed by one of the
      suffix characters 级, 型 or 期 (e.g. ``IB期``);
    * a run of at least two upper-case letters, optionally containing one
      hyphen between letters (e.g. ``IVF-ET``).

    Args:
        text: The string to scan.

    Returns:
        set[str]: The distinct matches (empty when nothing matches).
    """
    # Inside [...] a '|' is a literal character, not alternation, and the
    # range A-z also spans '[', '\\', ']', '^', '_' and '`'. The classes are
    # therefore spelled out explicitly so punctuation is never captured.
    graded_term = re.compile(r'[\dA-Za-z]+[级型期]')
    letter_term = re.compile(r'[A-Z]+[-]?[A-Z]+')
    return set(graded_term.findall(text)) | set(letter_term.findall(text))

# Post-process a token list by gluing an anchor token onto its predecessor.
def combine_word(w_list,anchor):
    """Merge every occurrence of ``anchor`` into the token before it.

    Args:
        w_list: List of tokens produced by segmentation.
        anchor: Token that must not stand alone.

    Returns:
        list: ``w_list`` itself when it has at most one element; otherwise
        a new list in which each ``anchor`` found after the first position
        has been appended to the preceding entry. An ``anchor`` in the
        first position is kept as is.
    """
    if len(w_list) <= 1:
        return w_list
    merged = [w_list[0]]
    for token in w_list[1:]:
        if token == anchor:
            # Fold the anchor into whatever currently ends the result.
            merged[-1] = merged[-1] + token
        else:
            merged.append(token)
    return merged

# Segment '#'-delimited text into a '|'-joined token string.
def cut(text,stopwords):
    """Tokenise each non-empty '#'-separated piece of ``text`` with jieba.

    Args:
        text: '#'-delimited string.
        stopwords: Container of tokens to drop from the result.

    Returns:
        str: The remaining tokens joined with '|'.
    """
    tokens = [
        word
        for segment in text.split('#')
        if segment != ''
        for word in jieba.cut(segment, use_paddle=False)
    ]
    # Re-attach these suffix tokens to the token before them.
    for suffix in ('术', '段'):
        tokens = combine_word(tokens, suffix)
    kept = (word for word in tokens if word not in stopwords)
    return '|'.join(kept)

# Add a set of words to jieba's dictionary.
def add_word(word_set):
    """Register every word of ``word_set`` with jieba and report the outcome.

    Args:
        word_set: Sized iterable of words. Each one is added with the
            frequency argument 1000000.

    Returns:
        None. A status message is pretty-printed in either case.
    """
    if len(word_set) == 0:
        pprint('没有词语可以添加到词典')
        return
    for term in word_set:
        jieba.add_word(term, 1000000)
    pprint('添加词语到词典成功')
            
def remove_word(word_set):
    """Delete every word of ``word_set`` from jieba and report the outcome.

    Args:
        word_set: Sized iterable of words to pass to ``jieba.del_word``.

    Returns:
        None. A status message is pretty-printed in either case.
    """
    if len(word_set) == 0:
        pprint('没有词语可以删除')
        return
    for term in word_set:
        jieba.del_word(term)
    pprint('从词典删除词语成功')
        
# Segment a whole corpus; returns a list.
def cutall(corpus):
    """Segment every text of ``corpus`` using a corpus-specific dictionary.

    The function works in two passes. First each text is normalised with
    ``handle_text`` and the terms found by ``extra_dict`` are gathered into
    the module-level set ``extracted_dict`` (which is cleared on entry).
    Those terms are then added to jieba, every normalised text is segmented
    with ``cut`` against the module-level ``stopwords``, and finally the
    terms are removed from jieba again.

    Args:
        corpus: Iterable of raw texts.

    Returns:
        list[str]: One '|'-joined token string per text. If segmentation
        fails part-way, the strings produced so far are returned.

    Side effects:
        Replaces the contents of the global ``extracted_dict``, temporarily
        modifies jieba's dictionary, and pretty-prints progress messages.
    """
    # Start from an empty term set so that results of an earlier call do
    # not leak into this one.
    extracted_dict.clear()

    # Normalise once and reuse the result for both passes.
    handled = [handle_text(text) for text in corpus]

    # Pass 1: build the term dictionary.
    for s in handled:
        extracted_dict.update(extra_dict(s))
    pprint('构建的词典长度：' + str(len(extracted_dict)))

    # Pass 2: segment with the extracted terms registered in jieba.
    ans = []
    add_word(extracted_dict)
    try:
        for s in handled:
            ans.append(cut(s, stopwords))
    except Exception as exc:
        # Best effort, as before: report and return the partial result.
        # Only Exception is caught, so Ctrl-C still interrupts the run.
        pprint('分词出错')
        pprint(repr(exc))
    finally:
        # Always undo the temporary dictionary additions.
        # NOTE(review): this also deletes a term that was already present in
        # jieba before this call (e.g. from the user dictionary) - confirm
        # whether that is intended.
        remove_word(extracted_dict)
    pprint('分词后的文本长度：' + str(len(ans)))
    return ans

# Load the hand-maintained dictionary, used for words the automatic
# extraction cannot discover.
def add_manual_dict(manual_dict):
    """Add each word of ``manual_dict`` to jieba with its default frequency.

    Args:
        manual_dict: Iterable of words.

    Returns:
        None.
    """
    for manual_word in manual_dict:
        jieba.add_word(manual_word)

In [3]:
# Use a third-party medical dictionary: load it into jieba as a user dictionary.
jieba.load_userdict('./med_dict/med_dict.txt')

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\xwd\AppData\Local\Temp\jieba.cache
Loading model cost 0.719 seconds.
Prefix dict has been built successfully.


In [5]:
# Hand-picked terms that are passed to add_manual_dict() below.
manual_dict = {'造瘘术'}

# Read the spreadsheet into a DataFrame and show its shape.
path = './data/data_without_na_01.xlsx'
df = pd.read_excel(path)
pprint(df.shape)

# Manually extend the dictionary.
add_manual_dict(manual_dict)

# Load the stop words; cutall() reads this module-level name.
stopwords = load_stopwords()
print('停用词表长度:' + str(len(stopwords)))

(3629, 28)
停用词表长度:2126


In [6]:
# Pick the two text columns that are passed to cutall() in the cells below.
corpus_diagnose_all = df['术前诊断']
corpus_opearation_all = df['实施手术']

In [8]:
# Segment the pre-operative diagnosis column.
# extracted_dict is the module-level set that cutall() clears and refills,
# so it has to exist before the first call.
extracted_dict = set()
diagnose_cut = cutall(corpus_diagnose_all)

'构建的词典长度：48'
'添加词语到词典成功'
'从词典删除词语成功'
'分词后的文本长度：3629'


In [9]:
# Inspect the terms currently held in extracted_dict (filled by the most
# recently executed cutall() call).
pprint(extracted_dict)

{'A型',
 'A期',
 'BENTALL',
 'B期',
 'CA',
 'CIN',
 'CINI',
 'CKD',
 'CM',
 'CUSHING',
 'DDH',
 'ESD',
 'FNH',
 'FREIBERG',
 'GDM',
 'GGN',
 'GGO',
 'GNRH',
 'GS',
 'HELIP',
 'HISL',
 'HPV',
 'IB',
 'IB期',
 'ICP',
 'IGA',
 'IUD',
 'IV',
 'IVF-ET',
 'IV级',
 'LISFRANCE',
 'LST',
 'MD',
 'OB',
 'PCI',
 'PCNL',
 'PDA',
 'PSA',
 'RATHKE',
 'RH',
 'RMCA',
 'RP',
 'STANFORD',
 'TACE',
 'TFCC',
 'TIA',
 'VIN',
 'WG'}


In [10]:
# Segment the performed-operation column.
# Relies on extracted_dict already existing at module level; cutall()
# clears it and refills it for this column.
opearation_cut = cutall(corpus_opearation_all)

'构建的词典长度：26'
'添加词语到词典成功'
'从词典删除词语成功'
'分词后的文本长度：3629'


In [11]:
# Inspect the terms currently held in extracted_dict (filled by the most
# recently executed cutall() call).
pprint(extracted_dict)

{'ACCF',
 'ACDF',
 'BENTALL',
 'CAGE',
 'DIXON',
 'DSA',
 'ENBLOC',
 'IABP',
 'MED',
 'MIS',
 'MIS-TLIF',
 'O-LIF',
 'PEID',
 'PELD',
 'PKP',
 'PKRP',
 'PLIF',
 'PVP',
 'ROUXY',
 'TATME',
 'TLIF',
 'TURP',
 'VSD',
 'WHIPPLE',
 'Z-TLIF',
 'ZELIF'}
