In [1]:
import time
import random
import os
import re
import pandas as pd
import numpy as np
import pickle as pkl
from datetime import datetime
from elasticsearch import Elasticsearch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
from elasticsearch import Elasticsearch

In [8]:
import jieba.posseg as psg
import pickle

In [9]:
def distance(a, b):
    """Return the squared Euclidean distance between two vectors.

    The result is the sum of the element-wise squared differences; no square
    root is taken.

    Args:
        a, b: Equal-length vectors that support element-wise ``-`` and ``**``
            (the callers in this notebook pass numpy arrays).

    Returns:
        The sum of squared differences, or ``0`` after printing ``'wrong'``
        when the two lengths differ.
    """
    if len(a) == len(b):
        return sum((a - b) ** 2)
    # A length mismatch is reported on stdout and signalled with a zero result.
    print('wrong')
    return 0
def get_dist_mat(feat):
    """Build the within-segment scatter matrix used by the ordered clustering.

    For every pair ``i < j`` the entry ``res[i, j]`` is the sum of squared
    distances between each row of ``feat[i: j + 1]`` and the mean of those
    rows. The diagonal and the lower triangle stay zero.

    The mean of each segment is computed and consumed in the same loop
    iteration, so no ``(n, n, n_features)`` intermediate array is allocated.

    Args:
        feat: Array-like with one sample per row, in the order that the
            clustering must preserve.

    Returns:
        ``numpy.ndarray`` of shape ``(n_samples, n_samples)``.
    """
    feat = np.asarray(feat, dtype=float)
    length = feat.shape[0]  # number of samples
    res = np.zeros(shape=(length, length))

    for i in range(length):
        for j in range(i + 1, length):
            segment = feat[i: j + 1]
            centre = segment.mean(axis=0)
            # Total squared deviation of the segment from its own mean.
            res[i, j] = np.sum((segment - centre) ** 2)
    return res

def get_class_devide(dist_mat, class_num):
    """Fill the dynamic-programming tables for an ordered split into classes.

    Column ``c`` of both returned tables describes the best split of samples
    ``0..row`` into ``c`` contiguous classes, built recursively from the
    ``c - 1`` column.

    Args:
        dist_mat: Square matrix where ``dist_mat[i, j]`` is the cost of putting
            samples ``i..j`` into one class (as produced by ``get_dist_mat``).
        class_num: Largest number of classes to solve for.

    Returns:
        ``(min_cost, split_at)``, both of shape ``(n_samples, class_num + 1)``.
        ``min_cost[row, c]`` is the minimal total cost and ``split_at[row, c]``
        is the index of the last sample that precedes the final class.
    """
    n_samples = len(dist_mat)
    split_at = np.zeros(shape=(n_samples, class_num + 1))   # chosen cut points
    min_cost = np.zeros(shape=(n_samples, class_num + 1))   # cost of the best split

    # Base case: two classes, the first one always starting at sample 0.
    for last in range(1, n_samples):
        costs = [dist_mat[0, cut] + dist_mat[cut + 1, last] for cut in range(last)]
        winner = np.argmin(costs)
        split_at[last, 2] = winner
        min_cost[last, 2] = costs[winner]

    # Recurrence: best (n_cls - 1)-way split of 0..cut plus one more class.
    for n_cls in range(3, class_num + 1):
        first_cut = n_cls - 2
        for last in range(n_cls - 1, n_samples):
            costs = [min_cost[cut, n_cls - 1] + dist_mat[cut + 1, last]
                     for cut in range(first_cut, last)]
            winner = np.argmin(costs)
            # argmin is relative to the start of the candidate range.
            split_at[last, n_cls] = winner + first_cut
            min_cost[last, n_cls] = costs[winner]
    return min_cost, split_at

def get_trace(trace_mat, class_num):
    """Walk the cut-point table backwards and return the chosen cut indices.

    Args:
        trace_mat: Cut-point table, i.e. the second value returned by
            ``get_class_devide``.
        class_num: Number of classes of the split to recover.

    Returns:
        List of ``class_num - 1`` integer cut points in ascending sample order.
    """
    cuts = []
    row = len(trace_mat) - 1  # start from the last sample
    n_cls = class_num
    while n_cls > 1:
        # The cut stored for (row, n_cls) is also the last row of the prefix.
        row = int(trace_mat[row, n_cls])
        cuts.insert(0, row)
        n_cls -= 1
    return cuts
def get_one_essay_trace(feature,phrase_num):
    """Run the full ordered-clustering pipeline on one feature matrix.

    Chains ``get_dist_mat`` -> ``get_class_devide`` -> ``get_trace``, appends
    ``len(feature)`` as the closing boundary, and prints the resulting split
    together with the elapsed wall-clock time.

    Args:
        feature: Feature matrix with one sample per row.
        phrase_num: Number of contiguous classes to split into.

    Returns:
        ``(lst, min_dist_mat, min_trace_mat)``: the cut points followed by
        ``len(feature)``, and the two tables from ``get_class_devide``.
    """
    started = time.time()
    min_dist_mat, min_trace_mat = get_class_devide(get_dist_mat(feature), phrase_num)

    lst = get_trace(min_trace_mat, phrase_num) + [len(feature)]
    print('划分结果为：', lst, sep='')
    print('cost time', time.time() - started)
    return lst, min_dist_mat, min_trace_mat
def observe_divided(lst, sents):
    """Group ``sents`` into consecutive chunks according to the cut points.

    Args:
        lst: Ascending cut indices; each one is the index of the last element
            of its chunk. A value equal to ``len(sents)`` is treated as the
            last valid index.
        sents: Indexable sequence of items to group.

    Returns:
        List of lists, one per entry in ``lst``.
    """
    total = len(sents)
    groups = []
    begin = 0
    for cut in lst:
        # The closing boundary is stored as len(sents); clamp it to the last index.
        last = cut - 1 if cut == total else cut
        groups.append([sents[pos] for pos in range(begin, last + 1)])
        begin = last + 1
    return groups

In [10]:
class esPaperRetrieval():
    '''
    Elasticsearch-backed paper search, adjusted for the paper-retrieval use case.

    The class keeps a single shared instance: every construction returns the
    same object, and ``__init__`` runs on it again each time.
    '''
    _instance = None
    _first_init = True

    def __new__(cls, *args, **kw):
        # Reuse the shared instance once it exists; create it on first use.
        if cls._instance:
            return cls._instance
        cls._instance = super(esPaperRetrieval, cls).__new__(cls)
        return cls._instance

    def __init__(self, host, port):
        '''
        Connect to Elasticsearch at ``host``/``port`` and select the paper index.
        '''
        super(esPaperRetrieval, self).__init__()
        self.es = Elasticsearch(hosts=host, port=port, timeout=30, max_retries=10, retry_on_timeout=True)
        self.indexName = 'paper-detail-index'

    def do_search(self, titleQuery, kwordQuery, summaryQuery ,pubQuery,fromDate, toDate, volume):
        '''
        Run the search and return the raw Elasticsearch response.

        titleQuery   term matched against P_Title
        kwordQuery   term matched against P_Keyword
        summaryQuery term matched against P_Summary
        pubQuery     term matched against P_Publication.keyword
        fromDate / toDate  exclusive bounds for P_year (expected as yyyy-mm-dd)
        volume       maximum number of hits to return

        All four term clauses are placed in the ``should`` list of the bool
        query; the date range is applied as a filter.
        NOTE(review): the original notes describe pubQuery as a mandatory
        match, but its clause sits in ``should`` with the others - confirm
        which is intended.
        '''
        term_clauses = [
            {"term": {"P_Title": titleQuery}},
            {"term": {"P_Keyword": kwordQuery}},
            {"term": {"P_Summary": summaryQuery}},
            {"term": {"P_Publication.keyword": {"value": pubQuery}}},
        ]
        year_window = {"range": {"P_year": {"gt": fromDate, "lt": toDate}}}
        queryBody = {
            "query": {
                "bool": {
                    "should": term_clauses,
                    "filter": [year_window],
                }
            },
            "from": 0,
            "size": volume,
            "sort": [],
            "aggs": {},
        }
        return self.es.search(index=self.indexName, body=queryBody)

    def format_search(self, result):
        '''
        Reduce a raw search response to a list of flat dicts.

        ``result['hits']['hits']`` is a list of hit dicts; for each hit the
        ``_source`` document is projected onto a fixed set of ``P_*`` fields,
        in a fixed order. A document lacking one of the fields raises KeyError.
        '''
        wanted = (
            'P_ID', 'P_Title', 'P_Author', 'P_Publication', 'P_Organ', 'P_year',
            'P_Keyword', 'P_Summary', 'P_Keyword_seg', 'P_Title_seg',
            'P_Summary_seg', 'P_URL', 'P_Fields', 'P_Fields_two', 'P_References',
            'P_Pagecount', 'P_Page', 'P_Language', 'P_Download_num',
            'P_Citation_num', 'P_Vector', 'P_Volume', 'P_Issue', 'P_Issn',
            'P_Isbn', 'P_Doi', 'P_Red1', 'P_Red2', 'P_Red3', 'P_Red4', 'P_Red5',
        )
        sources = [hit['_source'] for hit in result['hits']['hits']]
        return [{field: source[field] for field in wanted} for source in sources]

    def Retrieval(self, titleQuery, kwordQuery, summaryQuery ,pubQuery,fromDate, toDate, volume):
        '''
        Search and format in one call; arguments are those of ``do_search``.
        '''
        raw = self.do_search(titleQuery, kwordQuery, summaryQuery, pubQuery, fromDate, toDate, volume)
        return self.format_search(raw)



In [11]:
def tongjiau(li):
    """Flatten per-paper author collections into a list of unique authors.

    Args:
        li: Iterable whose entries are iterables of author names. Entries that
            cannot be iterated (for example ``None``) are skipped.
            NOTE(review): a plain ``str`` entry is iterable, so it contributes
            its individual characters - confirm the upstream author field is
            list-like.

    Returns:
        Unique authors in first-seen order, so repeated runs give the same
        ordering.
    """
    authors = []
    for entry in li:
        try:
            authors += entry
        except TypeError:
            # Only "not iterable" is an expected, ignorable condition here.
            continue
    # dict.fromkeys de-duplicates while keeping insertion order.
    return list(dict.fromkeys(authors))

def savepickle(name, data):
    """Pickle ``data`` to ``<name>.pkl``.

    Args:
        name: Target path without the ``.pkl`` extension.
        data: Any picklable object.
    """
    # The context manager closes the file even if pickling raises.
    with open('%s.pkl' % (name), 'wb') as output:
        pkl.dump(data, output)

def loadpickle(name):
    """Load and return the object pickled in ``<name>.pkl``.

    Args:
        name: Source path without the ``.pkl`` extension.

    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # this notebook (or another trusted source) produced.
    with open('%s.pkl' % (name), 'rb') as pkl_file:
        return pkl.load(pkl_file)

def get_data(key, from_date, to_date, volume, host='10.8.128.205', port=49200):
    """Fetch papers for ``key`` and aggregate them into one row per year.

    The same ``key`` is used as the title, keyword and summary query; the
    publication query is left empty. The aggregated frame is also pickled to
    ``year_df.pkl`` via ``savepickle`` before being returned.

    Args:
        key: Search term.
        from_date, to_date: Bounds passed to the P_year range filter.
        volume: Maximum number of hits requested.
        host, port: Elasticsearch endpoint; the defaults are the values that
            were previously hard-coded.

    Returns:
        ``pandas.DataFrame`` sorted by year with the columns ``year`` (int),
        ``seg_abstract``, ``keywords`` and ``Jounal`` (space-joined text of all
        papers of that year), ``authors`` (list of tokens) and ``paper_num``.
    """
    es4Paper = esPaperRetrieval(host=host, port=port)
    docs = es4Paper.Retrieval(titleQuery=key, kwordQuery=key, summaryQuery=key, pubQuery='',
                              fromDate=from_date, toDate=to_date, volume=volume)
    data = pd.DataFrame(docs)[["P_Author", "P_Publication", "P_Summary_seg", "P_Keyword", "P_year"]]
    data.columns = ["authors", "Jounal", "seg_abstract", "keywords", "year"]
    # Keep only the leading four characters of the date field, i.e. the year.
    data.year = [i[0:4] for i in data.year]

    # Group by the scalar column name so get_group accepts a plain year string.
    group_df = data.groupby('year')
    years = sorted(int(y) for y in set(data['year']))

    # Merge the bibliographic fields of each year, visiting each group once.
    seg = []
    keywords_by_year = []
    jou = []
    authors = []
    year_num = []
    for y in years:
        group = group_df.get_group(str(y))
        year_num.append(len(group))
        seg.append(' '.join(list(group['seg_abstract'])))
        # Papers without keywords carry None; leave them out of the join.
        keywords_by_year.append(' '.join(k for k in group['keywords'] if k is not None))
        jou.append(' '.join(list(group['Jounal'])))
        authors.append(' '.join(tongjiau(list(group['authors']))).split())

    years_df = pd.DataFrame()
    years_df['year'] = years
    years_df['seg_abstract'] = seg
    years_df['keywords'] = keywords_by_year
    years_df['authors'] = authors
    years_df['Jounal'] = jou
    years_df['paper_num'] = year_num
    savepickle('year_df', years_df)
    return years_df

def print_top_words_list(model, feature_names, n_top_words):
    """Return the highest-weighted feature names of every topic.

    Args:
        model: Fitted model exposing ``components_`` (one weight row per topic).
        feature_names: Names indexed by feature position.
        n_top_words: Number of names to keep per topic.

    Returns:
        One list per topic, ordered from the largest weight downwards.
    """
    return [
        [feature_names[idx] for idx in weights.argsort()[:-n_top_words - 1:-1]]
        for weights in model.components_
    ]

def tf_lda_list(n_topic, n_top_words, max_feature, data):
    """Fit LDA on raw term counts and return the top words of each topic.

    Args:
        n_topic: Number of LDA topics.
        n_top_words: Number of words to report per topic.
        max_feature: Vocabulary size limit for the count vectorizer.
        data: Iterable of documents (strings).

    Returns:
        List with one list of top words per topic (see
        ``print_top_words_list``).
    """
    vectorizer = CountVectorizer(max_features=max_feature)
    X = vectorizer.fit_transform(data).toarray()
    n_samples, n_features = X.shape
    print("Fitting LDA models with tf features, "
          "n_samples=%d and n_features=%d..."
          % (n_samples, n_features))
    lda = LatentDirichletAllocation(n_components=n_topic, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)

    print("\nTopics in LDA model:")
    lda.fit(X)
    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); use whichever this install provides.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    tf_feature_names = list(get_names())
    return print_top_words_list(lda, tf_feature_names, n_top_words)


# In[55]:


def zhutifenxi( key, n_topic=3, n_top_words=30, year_from=2010, year_to=2018, volume=500):
    """Topic analysis: LDA top words for the papers matching ``key``.

    Papers are fetched for the window that starts right after 31 December of
    ``year_from - 1`` and ends at 31 December of ``year_to``; the per-year
    abstracts inside ``year_from..year_to`` are fed to ``tf_lda_list``.

    Returns:
        One list per topic; each entry is ``{"name": word, "value": 100}``
        (every word gets the same constant weight).
    """
    years_df = get_data(key, str(year_from-1)+"-12-31", str(year_to)+"-12-31", volume)
    max_feature = 2000
    in_window = years_df.year.isin(range(year_from, year_to+1))
    topics = tf_lda_list(n_topic, n_top_words, max_feature, years_df.loc[in_window]['seg_abstract'])

    # Pair every top word with the fixed display weight.
    weights = [100] * n_top_words
    return [
        [{"name": word, "value": weight} for word, weight in zip(words, weights)]
        for words in topics
    ]


def build_tfidf(max_feature, data):
    """Compute a dense TF-IDF matrix for ``data``.

    Args:
        max_feature: Vocabulary size limit for the vectorizer.
        data: Iterable of documents (strings).

    Returns:
        ``(f_tfidf, word)``: the dense TF-IDF array (one row per document) and
        the list of vocabulary terms in column order.
    """
    tfidf = TfidfVectorizer(max_features=max_feature)
    f_tfidf = tfidf.fit_transform(data).toarray()
    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); use whichever this install provides.
    get_names = getattr(tfidf, 'get_feature_names_out', None)
    if get_names is None:
        get_names = tfidf.get_feature_names
    word = list(get_names())
    return f_tfidf, word

def print_top_words(model, feature_names, n_top_words):
    """Format the top words of every topic as a multi-line string.

    Each topic contributes a ``Topic #<index>:`` header line followed by a
    line with its top words separated by spaces.

    Args:
        model: Fitted model exposing ``components_`` (one weight row per topic).
        feature_names: Names indexed by feature position.
        n_top_words: Number of names to keep per topic.

    Returns:
        The formatted text (empty string when there are no topics).
    """
    chunks = []
    for topic_idx, topic in enumerate(model.components_):
        words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        chunks.append("Topic #%d:\n" % topic_idx + " ".join(words) + '\n')
    return ''.join(chunks)

def tfidf_lda(n_topic, n_top_words, max_feature, data):
    """Fit LDA on TF-IDF features and return the top words of each topic.

    Args:
        n_topic: Number of LDA topics.
        n_top_words: Number of words to report per topic.
        max_feature: Vocabulary size limit for the TF-IDF vectorizer.
        data: Iterable of documents (strings).

    Returns:
        List with one list of top words per topic (see
        ``print_top_words_list``).
    """
    tfidf = TfidfVectorizer(max_features=max_feature)
    f_tfidf = tfidf.fit_transform(data).toarray()

    lda = LatentDirichletAllocation(n_components=n_topic, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    lda.fit(f_tfidf)
    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); use whichever this install provides.
    get_names = getattr(tfidf, 'get_feature_names_out', None)
    if get_names is None:
        get_names = tfidf.get_feature_names
    tf_feature_names = list(get_names())
    return print_top_words_list(lda, tf_feature_names, n_top_words)

def youxujulei(key, n_class=4, n_topic=1, n_top_topic=30, year_from=2011, year_to=2019, volume=1000):
    """Ordered clustering of publication years, with LDA topics per cluster.

    The per-year abstracts for ``key`` are turned into TF-IDF vectors, the
    years are split into ``n_class`` contiguous groups by the ordered
    clustering routine, and topic words are extracted for every group, first
    with term-count LDA and then with TF-IDF LDA.

    Args:
        key: Search term.
        n_class: Number of contiguous year groups.
        n_topic: Number of LDA topics per group.
        n_top_topic: Number of words reported per topic.
        year_from, year_to: Query window, built the same way as in
            ``zhutifenxi``; the defaults reproduce the previously hard-coded
            "2010-12-31" .. "2019-12-31" bounds.
        volume: Maximum number of hits requested (previously fixed at 1000).

    Returns:
        List of dicts with the keys ``pub_id`` (algorithm name), ``title``
        (group label), ``publicationDate`` (topic label) and
        ``firstApplicant`` (space-joined topic words). All ``tf-lda`` records
        come first, followed by all ``tfidf-lda`` records.
    """
    years_df = get_data(key, str(year_from-1)+"-12-31", str(year_to)+"-12-31", volume)
    # Feature dimension shared by the TF-IDF table and both LDA runs.
    max_feature = 2000
    year_tfidf, _vocabulary = build_tfidf(max_feature, years_df['seg_abstract'])

    # Ordered clustering over the per-year TF-IDF rows.
    cut_points, _min_dist, _min_trace = get_one_essay_trace(year_tfidf, n_class)
    split_result = observe_divided(cut_points, years_df['year'])

    # Abstract text of every year, grouped like split_result.
    abstracts = [
        [list(years_df.loc[years_df.year == y]['seg_abstract'])[0] for y in years]
        for years in split_result
    ]

    keys = ["pub_id", "title", "publicationDate", "firstApplicant"]
    result_str = []

    # LDA on term counts; groups are labelled with their list of years.
    for i in range(n_class):
        class_label = str(split_result[i]) + "年"
        topics = tf_lda_list(n_topic, n_top_topic, max_feature, abstracts[i])
        for t, words in enumerate(topics, start=1):
            result_str.append(dict(zip(keys, ["tf-lda", class_label, "主题"+str(t), " ".join(words)])))

    # LDA on TF-IDF; groups are labelled with their ordinal.
    for i in range(n_class):
        class_label = "第"+str(i+1)+"类"
        topics = tfidf_lda(n_topic, n_top_topic, max_feature, abstracts[i])
        for t, words in enumerate(topics, start=1):
            result_str.append(dict(zip(keys, ["tfidf-lda", class_label, "主题"+str(t), " ".join(words)])))

    return result_str

# 有序聚类

In [13]:
# youxujulei emits records under generic field names; keys_map translates
# them to names that describe what each field actually holds.
keys_map = {
    "pub_id": 'algorithmName',
    "title": 'className',
    "publicationDate": 'topicName',
    "firstApplicant": 'keywords',
}

result = youxujulei('中医')

result = [{keys_map[field]: value for field, value in record.items()} for record in result]

In [16]:
result

[{'algorithmName': 'tf-lda',
  'className': '[2011, 2012]年',
  'topicName': '主题1',
  'keywords': '患者 方法 结果 结论 目的 10 治疗 进行 分别 具有 明显 探讨 12 统计学意义 000 差异 显示 研究 以及 采用 临床 其中 癫癎 临床表现 发生 对照组 出现 观察 05 13'},
 {'algorithmName': 'tf-lda',
  'className': '[2013]年',
  'topicName': '主题1',
  'keywords': '患者 研究 方法 临床 脑脊液 结果 背景 结论 10 结核性脑膜炎 具有 000 12 治疗 显示 本文 进行 临床表现 动脉瘤 颅内动脉瘤 出现 分别 诊断 探讨 发生 通过 差异 中枢神经系统 14 目前'},
 {'algorithmName': 'tf-lda',
  'className': '[2014, 2015]年',
  'topicName': '主题1',
  'keywords': '05 患者 结果 方法 结论 目的 进行 表达 显著 对照组 统计学意义 差异 采用 分别 10 影响 明显 比较 检测 探讨 12 细胞 01 高于 分析 il 水平 分为 观察 以及'},
 {'algorithmName': 'tf-lda',
  'className': '[2017]年',
  'topicName': '主题1',
  'keywords': '患者 000 10 方法 结果 结论 评价 采用 目的 进行 12 评分 可以 以及 显示 术后 探讨 17 训练 对照组 18 包括 13 16 临床 存在 治疗 23 其中 高于'},
 {'algorithmName': 'tfidf-lda',
  'className': '第1类',
  'topicName': '主题1',
  'keywords': '范围 脱位 患者 表现 敏感性 许旺细胞 st 奥卡西平 结果 影像学 少年期 之一 人民 多方面 表达水平 prom 荧光血管造影 90 具体 复位 cochrane图书馆 03 随后 普瑞巴林 脑梗死 75 具有 保加利亚乳酸杆菌 广泛 发病'},
 

# 主题聚类

In [21]:
# Run the topic analysis with seven topics, then tag each topic's word list
# with its position in the returned list.
result = zhutifenxi('中医', n_topic=7)

result = [
    {'topicID': topic_id, 'topicWords': words}
    for topic_id, words in enumerate(result)
]

Fitting LDA models with tf features, n_samples=4 and n_features=2000...

Topics in LDA model:


In [23]:
result

[{'topicID': 0,
  'topicWords': [{'name': '患者', 'value': 100},
   {'name': '000', 'value': 100},
   {'name': '方法', 'value': 100},
   {'name': '结果', 'value': 100},
   {'name': '结论', 'value': 100},
   {'name': '目的', 'value': 100},
   {'name': '研究', 'value': 100},
   {'name': '10', 'value': 100},
   {'name': '12', 'value': 100},
   {'name': '探讨', 'value': 100},
   {'name': '显示', 'value': 100},
   {'name': '治疗', 'value': 100},
   {'name': '差异', 'value': 100},
   {'name': '具有', 'value': 100},
   {'name': '统计学意义', 'value': 100},
   {'name': '临床表现', 'value': 100},
   {'name': '术后', 'value': 100},
   {'name': '评价', 'value': 100},
   {'name': '评分', 'value': 100},
   {'name': '17', 'value': 100},
   {'name': '以及', 'value': 100},
   {'name': '采用', 'value': 100},
   {'name': '临床', 'value': 100},
   {'name': '13', 'value': 100},
   {'name': '25', 'value': 100},
   {'name': '00', 'value': 100},
   {'name': '其中', 'value': 100},
   {'name': '11', 'value': 100},
   {'name': '18', 'value': 100},
   {'na

In [24]:
# Scratch check: slicing the first four characters of a yyyy-mm-dd string
# leaves the year (get_data applies the same [0:4] slice to P_year).
'2009-01-01'[:4]

'2009'