In [1]:
import pandas as pd
import numpy as np
import json
import os
import csv
import re,time

#导入PCA算法库
from textrank4zh import TextRank4Keyword, TextRank4Sentence
from sklearn import ensemble
import math
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn import neighbors
from sklearn import ensemble
from sklearn.externals import joblib
from elasticsearch import Elasticsearch 

In [2]:
def distance(a, b):
    #  计算两个vector的距离，逐项差的平方和，不开根号
    if len(a) != len(b):
        print('wrong')
        return 0
    else:
        temp = a - b
        temp = temp ** 2
        res = sum(temp)
        return res
def get_dist_mat(feat):
    #  输入为特征矩阵，返回一个相似度矩阵（行数乘行数）
    print(type(feat))
    length=feat.shape[0]#样本数量
    wide = feat.shape[1]#特征向量长度
    res = np.zeros(shape=(length, length))
    ave = np.zeros(shape=(length, length, wide))

    for i in range(0, length):
        for j in range(i+1, length):
            ave[i, j] = np.mean(feat[i: j+1], axis=0)
            # print('i = %d, j = %d' % (i, j))

    for i in range(0, length):
        for j in range(i+1, length):
            ave_one = ave[i, j]
            dist_lst = [distance(ave_one, one) for one in feat[i: j+1]]
            res[i, j] = sum(dist_lst)
            # print('i = %d, j = %d' % (i, j))
    return res

def get_class_devide(dist_mat, class_num):
    #  利用递推公式计算分成class_num类别的分割方法
    length = len(dist_mat)
    divided_point = np.zeros(shape=(length, class_num+1))  # 存切割点
    diveded_dist = np.zeros(shape=(length, class_num+1))  # 存切割最小距离

    for i in range(1, length):  # 分成两类
        dist_lst = [dist_mat[0, k] + dist_mat[k+1, i] for k in range(0, i)]
        divided_point[i, 2] = np.argmin(dist_lst)
        diveded_dist[i, 2] = np.min(dist_lst)

    for classes in range(3, class_num+1):   # 分成多类
        for i in range(classes-1, length):
            dist_lst = [diveded_dist[k, classes-1] + dist_mat[k+1, i] for k in range(classes-2, i)]  # 从n-1类到n类
            divided_point[i, classes] = np.argmin(dist_lst) + classes - 2
            diveded_dist[i, classes] = np.min(dist_lst)
    return diveded_dist, divided_point

def get_trace(trace_mat, class_num):
    #  获得切割方法list，
    #  trace_mat就是get_class_devide函数返回的divided_point，分割点记录矩阵
    lst = []
    length = len(trace_mat)  #length=38
    pre = length - 1   #pre=37
    for i in range(class_num, 1, -1):
        temp = int(trace_mat[pre, i])
        pre = temp
        lst.append(temp)
    lst.reverse()
    return lst
def get_one_essay_trace(feature,phrase_num):
    #  整合调用 有序聚类方法
    #  调用就可以获得一个文件的分割结果并存入本地

    time_start = time.time()
    dist_mat = get_dist_mat(feature)
    phrase_num = phrase_num
    min_dist_mat, min_trace_mat = get_class_devide(dist_mat, phrase_num)
    # print(min_trace_mat)

    lst = get_trace(min_trace_mat, phrase_num)
    lst.append(len(feature))
    print('划分结果为：',end='')
    print(lst)
    time_end = time.time()
    print('cost time', time_end - time_start)
    return lst,min_dist_mat,min_trace_mat
def observe_divided(lst, sents):
    # 展示分割结果
    split_result=[]
    start = 0
    for end in lst:
        cla_lst=[]
        if end==len(sents):
            end=end-1
        for i in range(start, end+1):
            cla_lst.append(sents[i])
        start = end + 1
        split_result.append(cla_lst)
    return split_result

In [3]:
class esPaperRetrieval():
    '''
    根据论文检索需求进行功能的微调
    '''
    _instance = None
    _first_init = True
    def __new__(cls, *args, **kw):
        if not cls._instance:
            cls._instance = super(esPaperRetrieval, cls).__new__(cls)
        return cls._instance

    def __init__(self, host, port):
        '''
        使用ES进行论文检索 指定host、port以及专利index之后进行检索
        '''
        super(esPaperRetrieval, self).__init__()
        self.es = Elasticsearch(hosts=host, port=port, timeout=30, max_retries=10, retry_on_timeout=True)
        self.indexName = 'paper-detail-index'

    def do_search(self, titleQuery, kwordQuery, summaryQuery ,pubQuery,fromDate, toDate, volume):
        '''
        do_search方法执行具体检索过程
        titleQuery 为对应标题检索词
        kwordQuery 为对应关键字检索词
        summaryQuery 为对应摘要检索词
        上述三个词为or模式 可以出现 可以不出现
        pubQuery 为对应文摘关键词 这个关键词必定匹配
        fromDate 为检索字段起始日期 toDate为终结日期 日期格式 yyyy-mm-dd
        volume为每次检索返回的数目
        '''
        queryBody = {
          "query": {
            "bool": {
              "should": [
                {
                  "term": {
                    "P_Title": titleQuery
                  }
                },
                {
                  "term": {
                    "P_Keyword": kwordQuery
                  }
                },
                {
                  "term": {
                    "P_Summary": summaryQuery
                  }
                },
                  {
                  "term": {
                    "P_Publication.keyword": {"value":pubQuery}
                  }
                  }
              ],
              "filter": [
                {
                  "range": {
                    "P_year": {
                      "gt": fromDate,
                      "lt": toDate
                    }
                  }
                },
              ]
            }
          },
            "from": 0,
            "size": volume,
            "sort": [],
            "aggs": {}
        }
#         print(queryBody)
        result = self.es.search(index=self.indexName, body=queryBody)
        return result

    def format_search(self, result):
        '''
        format_search方法对检索结果进行格式化 构建符合要求的字段进行返回
        输入result为检索结果 提取其中的检索结果进行后处理
        使用ES检索后得到的结果中result['hits']['hits']为数组格式数据
        其中每一个元素为一个dict 对应部分字段
        '''
        docs = result['hits']['hits']
        docs = [i['_source'] for i in docs]
        targetKeyList = 'P_ID, P_Title, P_Author, P_Publication, P_Organ, P_year, P_Keyword, \
    P_Summary, P_Keyword_seg, P_Title_seg,\
    P_Summary_seg, P_URL, P_Fields, P_Fields_two,P_References, P_Pagecount, P_Page, P_Language,\
    P_Download_num, P_Citation_num,P_Vector,P_Volume, P_Issue,P_Issn,P_Isbn, P_Doi,\
    P_Red1, P_Red2, P_Red3, P_Red4, P_Red5'
        targetKeyList = [i.strip() for i in targetKeyList.split(',')]
        dict_filter_by_keys = lambda d: {k: d[k] for k in targetKeyList}
        dict_filter_text = lambda d: {k if not k == 'text' else 'claim_text': d[k] for k in d}
        dict_filter_id = lambda d: {k if not k == '_id' else 'id': d[k] for k in d}
        docs = (dict_filter_by_keys(doc) for doc in docs)
        docs = [dict_filter_text(doc) for doc in docs]
        return docs
    def Retrieval(self, titleQuery, kwordQuery, summaryQuery ,pubQuery,fromDate, toDate, volume):
        result = self.do_search(titleQuery, kwordQuery, summaryQuery ,pubQuery,fromDate, toDate, volume)
        docs = self.format_search(result)
        return docs

In [28]:
class pre_word_count():
    def __init__(self,data_path,model_path,hotword_num):
        self.data = pd.read_json(data_path)
        self.data['authors_seg'] = self.data.apply(lambda r: r['P_Red1'] + r['P_Red3'],axis=1)
        self.data['year'] = self.data['P_year'].apply(lambda x:x[:4])
        self.model_path = model_path
        self.hotword_num = hotword_num
    # 统计出现在句子中的词频
    def tf(self,word,sentence):
        c=0
        s=sentence.split()
        for i in s:
            if word==i:
                c+=1
        return c

    # 统计非重复的所有作者
    def tongjiau(self,li):
        authors=[]
        for i in li:
            try:
                authors+=i.split()
            except:
                continue
        return list(set(authors))

    # 判断新作者是否在论文中提及热点词
    def mention(self,authors,word,sentences,senten_auth):
        i=0
        for a in authors:
            for j in range(len(sentences)):
                try:
                    if len(list(set(a).intersection(set(senten_auth[j]))))>0 and word in sentences[j]:
                        i+=1
                except:
                    continue
        return i

    # 统计某个热点词在某年的不同期刊的提及数
    def multi_journal(self,word,sentences,journals):
        men_journal=[]
        for j in range(len(sentences)):
            try:
                if word in sentences[j]:
                    men_journal.append(journals[j])
            except:
                continue
        s=set(men_journal)
        if None in s:
            s.remove(None)
        return len(s)

    # 抽取词频特征
    def feature(self,word,year_df):
        abstract_tf=[]
        keyword_tf=[]
        for y in list(year_df['seg_abstract']):
            abstract_tf.append(self.tf(word,y))
        for i in list(year_df['seg_keywords']):
            keyword_tf.append(self.tf(word,i))
        return abstract_tf,keyword_tf

    #计算list的平均数

    def pingjun(self,L):
        if len(L)==0:
            return 0
        else:
            s=0
            for i in L:
                s+=i

            return int(s/len(L))
    # 获取热点词
    def get_hotwords(self,hotword_num):
        guanjian=' '.join(list(self.data['P_Keyword_seg'])).split()
        #textrank方法获取关键词
        
        #merge=' '.join(guanjian)
        #tr4w = TextRank4Keyword()
        #tr4w.analyze(text=merge, window=4)
        #pred_words=[]
        #for item in tr4w.get_keywords(hotword_num, word_min_len=2):
            #pred_words.append(item.word)
        #根据词频排序获取关键词
        result = Counter(guanjian)
        d = sorted(result.items(), key=lambda x: x[1], reverse=True)
        pred_words=list(map(lambda x:x[0],d))[:hotword_num]
        return pred_words
    # 统计每年发表的论文数
    def year_paper_num(self,):
        year_num=[]
        years=list(set(self.data['year']))
        group_df=self.data.groupby(by=['year'])
        # years.remove(None)
        years=sorted([int(i) for i in years])
        for y in years:
            try:
                year_num.append(len(group_df.get_group(str(y))))
            except:
                continue
        return group_df,years,year_num
    # 按年份将摘要整合
    def groupby_year(self,group_df,years,year_num):
        years_df=pd.DataFrame()
        years_df['year']=years
        seg,key,tit,jou,authors=[],[],[],[],[]

        for i in years:
            seg.append(' '.join(group_df.get_group(str(i))['P_Summary_seg']))
            k=group_df.get_group(str(i))['P_Keyword_seg']
            key.append(' '.join(k))
            tit.append(' '.join(group_df.get_group(str(i))['P_Title_seg']))
            jou.append(' '.join(group_df.get_group(str(i))['P_Publication']))
            authors.append(self.tongjiau(group_df.get_group(str(i))['authors_seg']))
        years_df['seg_abstract'] = seg
        years_df['seg_keywords'] = key
        years_df['seg_title'] = tit
        years_df['seg_authors'] = authors
        years_df['Jounal'] = jou
        years_df['paper_num'] = year_num
        return years_df
    # 统计每年的新作者
    def year_new_author(self,years_df):
        authors = list(years_df['seg_authors'])
        new_author = [authors[0]]
        for i in range(1,len(years_df)):
            new=[]
            for j in authors[i]:
                if j not in authors[i-1]:
                    new.append(j)
            new_author.append(new)
        return new_author
    # 统计每年新作者提及数
    def year_new_author_mention(self,pred_words,new_author,years):
        newauthor_mens=[]
        group_df=self.data.groupby(by=['year'])
        for word in pred_words:
            for i in range(len(new_author)):
                newauthor_men=self.mention(new_author,word,list(group_df.get_group(str(years[i]))['P_Summary']),\
                                           list(map(lambda x:x.split(),list(group_df.get_group(str(years[i]))['authors_seg']))))
                newauthor_mens.append(newauthor_men)
        return newauthor_mens
    # 统计跨学科提及数
    def multi_sub(self,pred_words,years):
        multi_mens = []
        group_df = self.data.groupby(by=['year'])
        for word in pred_words:
            for i in range(len(years)):

                journals=self.multi_journal(word,list(group_df.get_group(str(years[i]))['P_Summary']),\
                                            list(group_df.get_group(str(years[i]))['P_Publication']))
                multi_mens.append(journals)
        return multi_mens
    # 统计每个词初次在关键词中出现的年份
    def first_year(self,pred_words,years_df):
        f_year=[]
        pp=[]
        for w in range(len(pred_words)):
            for y,i in zip(list(years_df['year']),list(years_df['seg_keywords'])):
                if pred_words[w] in i:
                    f_year.append(y)
                    pp.append(w)
                    break
        return f_year
    #获取每个词18年的特征
    def get_feature(self,years_df,pred_words,f_year,newauthor_mens,multi_mens):
        year_num=len(years_df)
        pp=pd.DataFrame()
        for i in range(len(pred_words)):
            word=pred_words[i]
            fyear=f_year[i]
            exist_year=[]
            for n,f in zip(list(years_df['year']),[fyear]*year_num):
                e=n-f
                if e<0:
                    exist_year.append(0)
                else:
                    exist_year.append(e)
            dd=pd.DataFrame()
            dd['word']=[word]*year_num
            dd['f1']=self.feature(word,years_df)[0]
            dd['f2']=self.feature(word,years_df)[1]
            dd['f3'] = exist_year
            dd['f4'] = newauthor_mens[i*year_num:i*year_num+(year_num-1)]+\
            [self.pingjun(newauthor_mens[i*year_num+(year_num-1)-3:i*year_num+(year_num-1)])]

            dd['f6'] = multi_mens[i*year_num:i*year_num+(year_num-1)]+\
            [self.pingjun(multi_mens[i*year_num+(year_num-1)-3:i*year_num+(year_num-1)])]

            if i==0:
                pp=dd
            else:
                pp=pp.append(dd)
        pp=pp.reset_index(drop=True)
        return pp
    #预测某个词的词频
    def zhidingword(self,search_word,test_data,model):
        pre_data=test_data.loc[test_data.word==search_word].iloc[:,1:]
        result = model.predict(pre_data)
        return int(result)
    #返回结果
    def pre_result(self):
        model_GBR=joblib.load(self.model_path)
        pred_words=self.get_hotwords(self.hotword_num)
        years=self.year_paper_num()[1]
        group_df=self.year_paper_num()[0]
        year_num=self.year_paper_num()[2]
        years_df=self.groupby_year(group_df,years,year_num)
        f_year=self.first_year(pred_words,years_df)
        new_author=self.year_new_author(years_df)
        newauthor_mens=self.year_new_author_mention(pred_words,new_author,years)
        multi_mens=self.multi_sub(pred_words,years)
        pp=self.get_feature(years_df,pred_words,f_year,newauthor_mens,multi_mens)

        num=list(range(len(years)-1,len(years)*self.hotword_num,len(years)))
        test_data=pp.iloc[num]
        all_wordtf=[]
        for word in pred_words:
            result=self.zhidingword(word,test_data,model_GBR)
            search_ke=list(pp.loc[pp.word==word]['f1'])+[result]
            all_wordtf.extend(search_ke)
        result=pd.DataFrame()
        years.append(years[-1]+1)
        result['word']=pred_words*len(years)
        result['year']=years*len(pred_words)
        result['count']=all_wordtf
        
        word_2019=result.loc[result['year']==2019]
        word_2019_new = word_2019.sort_values(by="count",ascending=False)
        Top_hot_word=list(word_2019_new.loc[result['year']==2019]['word'])

        hot_year=list(years)
        freq_word=[]
        freq=[]
        freqs=[]
        for www in Top_hot_word:
            freq_word.append(www)
            freq=list(result.loc[result['word']==www].sort_values(by="year",ascending=True)['count'])
            freqs.append(freq)
        draw=pd.DataFrame()
        draw["draw_word"]=freq_word
        draw["draw_freqs"]=freqs
        draw_dict = [{'draw_word':i[1],'draw_freqs':i[2]} for i in draw.itertuples()]
#         draw_json=draw.to_json(orient='index')
        altm = {'topnn':Top_hot_word,'year':hot_year, 'draw':draw_dict}
        return altm

In [33]:
class pre_word_count1():
    def __init__(self, data_path, model_path, hotword_num):
        self.data = pd.read_json(data_path)
        self.data['authors_seg'] = self.data.apply(lambda r: r['P_Red1'] + r['P_Red3'], axis=1)
        self.data['year'] = self.data['P_year'].apply(lambda x: x[:4])
        self.model_path = model_path
        self.hotword_num = hotword_num

    # 统计出现在句子中的词频
    def tf(self, word, sentence):
        c = 0
        s = sentence.split()
        for i in s:
            if word == i:
                c += 1
        return c

    # 统计非重复的所有作者
    def tongjiau(self, li):
        authors = []
        for i in li:
            try:
                authors += i.split()
            except:
                continue
        return list(set(authors))

    # 判断新作者是否在论文中提及热点词
    def mention(self, authors, word, sentences, senten_auth):
        i = 0
        for a in authors:
            for j in range(len(sentences)):
                try:
                    if len(list(set(a).intersection(set(senten_auth[j])))) > 0 and word in sentences[j]:
                        i += 1
                except:
                    continue
        return i

    # 统计某个热点词在某年的不同期刊的提及数
    def multi_journal(self, word, sentences, journals):
        men_journal = []
        for j in range(len(sentences)):
            try:
                if word in sentences[j]:
                    men_journal.append(journals[j])
            except:
                continue
        s = set(men_journal)
        if None in s:
            s.remove(None)
        return len(s)

    # 抽取词频特征
    def feature(self, word, year_df):
        abstract_tf = []
        keyword_tf = []
        for y in list(year_df['seg_abstract']):
            abstract_tf.append(self.tf(word, y))
        for i in list(year_df['seg_keywords']):
            keyword_tf.append(self.tf(word, i))
        return abstract_tf, keyword_tf

    # 计算list的平均数

    def pingjun(self, L):
        if len(L) == 0:
            return 0
        else:
            s = 0
            for i in L:
                s += i

            return int(s / len(L))

    # 获取热点词
    def get_hotwords(self, hotword_num):
        guanjian = ' '.join(list(self.data['P_Keyword_seg'])).split()
        # textrank方法获取关键词

        # merge=' '.join(guanjian)
        # tr4w = TextRank4Keyword()
        # tr4w.analyze(text=merge, window=4)
        # pred_words=[]
        # for item in tr4w.get_keywords(hotword_num, word_min_len=2):
        # pred_words.append(item.word)
        # 根据词频排序获取关键词
        result = Counter(guanjian)
        d = sorted(result.items(), key=lambda x: x[1], reverse=True)
        pred_words = list(map(lambda x: x[0], d))[:hotword_num]
        return pred_words

    # 统计每年发表的论文数
    def year_paper_num(self, ):
        year_num = []
        years = list(set(self.data['year']))
        group_df = self.data.groupby(by=['year'])
        # years.remove(None)
        years = sorted([int(i) for i in years])
        for y in years:
            try:
                year_num.append(len(group_df.get_group(str(y))))
            except:
                continue
        return group_df, years, year_num

    # 按年份将摘要整合
    def groupby_year(self, group_df, years, year_num):
        years_df = pd.DataFrame()
        years_df['year'] = years
        seg, key, tit, jou, authors = [], [], [], [], []

        for i in years:
            seg.append(' '.join(group_df.get_group(str(i))['P_Summary_seg']))
            k = group_df.get_group(str(i))['P_Keyword_seg']
            key.append(' '.join(k))
            tit.append(' '.join(group_df.get_group(str(i))['P_Title_seg']))
            jou.append(' '.join(group_df.get_group(str(i))['P_Publication']))
            authors.append(self.tongjiau(group_df.get_group(str(i))['authors_seg']))
        years_df['seg_abstract'] = seg
        years_df['seg_keywords'] = key
        years_df['seg_title'] = tit
        years_df['seg_authors'] = authors
        years_df['Jounal'] = jou
        years_df['paper_num'] = year_num
        return years_df

    # 统计每年的新作者
    def year_new_author(self, years_df):
        authors = list(years_df['seg_authors'])
        new_author = [authors[0]]
        for i in range(1, len(years_df)):
            new = []
            for j in authors[i]:
                if j not in authors[i - 1]:
                    new.append(j)
            new_author.append(new)
        return new_author

    # 统计每年新作者提及数
    def year_new_author_mention(self, pred_words, new_author, years):
        newauthor_mens = []
        group_df = self.data.groupby(by=['year'])
        for word in pred_words:
            for i in range(len(new_author)):
                newauthor_men = self.mention(new_author, word, list(group_df.get_group(str(years[i]))['P_Summary']), \
                                             list(map(lambda x: x.split(),
                                                      list(group_df.get_group(str(years[i]))['authors_seg']))))
                newauthor_mens.append(newauthor_men)
        return newauthor_mens

    # 统计跨学科提及数
    def multi_sub(self, pred_words, years):
        multi_mens = []
        group_df = self.data.groupby(by=['year'])
        for word in pred_words:
            for i in range(len(years)):
                journals = self.multi_journal(word, list(group_df.get_group(str(years[i]))['P_Summary']), \
                                              list(group_df.get_group(str(years[i]))['P_Publication']))
                multi_mens.append(journals)
        return multi_mens

    # 统计每个词初次在关键词中出现的年份
    def first_year(self, pred_words, years_df):
        f_year = []
        pp = []
        for w in range(len(pred_words)):
            for y, i in zip(list(years_df['year']), list(years_df['seg_keywords'])):
                if pred_words[w] in i:
                    f_year.append(y)
                    pp.append(w)
                    break
        return f_year

    # 获取每个词18年的特征
    def get_feature(self, years_df, pred_words, f_year, newauthor_mens, multi_mens):
        year_num = len(years_df)
        pp = pd.DataFrame()
        for i in range(len(pred_words)):
            word = pred_words[i]
            fyear = f_year[i]
            exist_year = []
            for n, f in zip(list(years_df['year']), [fyear] * year_num):
                e = n - f
                if e < 0:
                    exist_year.append(0)
                else:
                    exist_year.append(e)
            dd = pd.DataFrame()
            dd['word'] = [word] * year_num
            dd['f1'] = self.feature(word, years_df)[0]
            dd['f2'] = self.feature(word, years_df)[1]
            dd['f3'] = exist_year
            dd['f4'] = newauthor_mens[i * year_num:i * year_num + (year_num - 1)] + \
                       [self.pingjun(newauthor_mens[i * year_num + (year_num - 1) - 3:i * year_num + (year_num - 1)])]

            dd['f6'] = multi_mens[i * year_num:i * year_num + (year_num - 1)] + \
                       [self.pingjun(multi_mens[i * year_num + (year_num - 1) - 3:i * year_num + (year_num - 1)])]

            if i == 0:
                pp = dd
            else:
                pp = pp.append(dd)
        pp = pp.reset_index(drop=True)
        return pp

    # 预测某个词的词频
    def zhidingword(self, search_word, test_data, model):
        pre_data = test_data.loc[test_data.word == search_word].iloc[:, 1:]
        result = model.predict(pre_data)
        return int(result)

    # 返回结果
    def pre_result(self):
        model_GBR = joblib.load(self.model_path)
        pred_words = self.get_hotwords(self.hotword_num)
        years = self.year_paper_num()[1]
        group_df = self.year_paper_num()[0]
        year_num = self.year_paper_num()[2]
        years_df = self.groupby_year(group_df, years, year_num)
        f_year = self.first_year(pred_words, years_df)
        new_author = self.year_new_author(years_df)
        newauthor_mens = self.year_new_author_mention(pred_words, new_author, years)
        multi_mens = self.multi_sub(pred_words, years)
        pp = self.get_feature(years_df, pred_words, f_year, newauthor_mens, multi_mens)

        num = list(range(len(years) - 1, len(years) * self.hotword_num, len(years)))
        test_data = pp.iloc[num]
        all_wordtf = []
        for word in pred_words:
            result = self.zhidingword(word, test_data, model_GBR)
            search_ke = list(pp.loc[pp.word == word]['f1']) + [result]
            all_wordtf.extend(search_ke)
        result = pd.DataFrame()
        years.append(years[-1] + 1)
        result['word'] = pred_words * len(years)
        result['year'] = years * len(pred_words)
        result['count'] = all_wordtf

        word_2019 = result.loc[result['year'] == 2019]
        word_2019_new = word_2019.sort_values(by="count", ascending=False)
        Top_hot_word = list(word_2019_new.loc[result['year'] == 2019]['word'])

        hot_year = list(years)
        freq_word = []
        freq = []
        freqs = []
        for www in Top_hot_word:
            freq_word.append(www)
            freq = list(result.loc[result['word'] == www].sort_values(by="year", ascending=True)['count'])
            freqs.append(freq)
        draw = pd.DataFrame()
        draw["draw_word"] = freq_word
        draw["draw_freqs"] = freqs
        draw_dict = [{'draw_word': i[1], 'draw_freqs': i[2]} for i in draw.itertuples()]
        #         draw_json=draw.to_json(orient='index')
        altm = {'topnn': Top_hot_word, 'year': hot_year, 'draw': draw_dict}
        return altm

In [29]:
esPaperObj = esPaperRetrieval(host='10.8.128.205',port=49200)
docs = esPaperObj.Retrieval(titleQuery='',kwordQuery='',summaryQuery='医',pubQuery='',fromDate='2010-01-01',toDate='2019-01-01',volume=1000)

hotPointObj = pre_word_count(json.dumps(docs), '../data/paper_data/paper_hotpoint.model',100)

In [36]:
hotPointObj1 = pre_word_count1(json.dumps(docs), '../data/paper_data/paper_hotpoint.model',50)

In [37]:
result=hotPointObj.pre_result()

In [38]:
result

{'topnn': ['医患纠纷',
  '北京市',
  '文化',
  '医学生',
  '医患信任',
  '医患双方',
  '中医药法',
  '问卷调查',
  '中国医药学',
  '医务人员',
  '和谐',
  '问题',
  '医学人文教育',
  '现状',
  '培养模式',
  '循证医学',
  '县级中医医院',
  '影响因素',
  '医疗质量',
  '医疗纠纷',
  '中医药',
  '人力资源',
  '中医学',
  '医联体',
  '名医经验',
  '医患关系',
  '医德教育',
  '综合医院',
  '中西医结合',
  '医院感染',
  '全科医学',
  '中医院',
  '名老中医',
  '上海',
  '规范化培训',
  '患者',
  '中医学术史',
  '医疗',
  '中医',
  '整合医学',
  '医院建设项目',
  '中西医结合医院',
  '中医药大学',
  '省级',
  '分级诊疗',
  '民族医',
  '北京',
  '公立医院',
  '满意度',
  '实践',
  '临床医师',
  '医疗机构',
  '发展',
  '医生',
  '医师',
  '中医医院',
  '认知',
  '医疗保险',
  '对策',
  '医家',
  '医学伦理',
  '军队医院',
  '管理',
  '临床科室',
  '医学教育',
  '文化建设',
  '过度医疗',
  '建议',
  '医德医风',
  '医疗改革',
  '旴江医学',
  '中医护理',
  '医疗服务',
  'WONCA',
  '医疗费用',
  '眼科医院',
  '思考',
  '中医药特色',
  '医院文化',
  '地方流派',
  '国家中医药管理局',
  '医院',
  '新医改',
  '医籍',
  '医案',
  '医德',
  '调查',
  '医学伦理学',
  '中医医疗',
  '建设',
  '探讨',
  '医学人文',
  '医药',
  '医院管理',
  '创新',
  '社区卫生服务',
  '医患沟通',
  '地域分布',
  '住院医师',
  '中医药文化'],
 'year': [2011, 2012, 2013, 2014,

## 

In [35]:
hotPointObj1.pre_result()

{'topnn': ['医患纠纷',
  '北京市',
  '文化',
  '医学生',
  '医患信任',
  '医患双方',
  '中医药法',
  '问卷调查',
  '中国医药学',
  '医务人员',
  '和谐',
  '问题',
  '医学人文教育',
  '现状',
  '培养模式',
  '循证医学',
  '县级中医医院',
  '影响因素',
  '医疗质量',
  '医疗纠纷',
  '中医药',
  '人力资源',
  '中医学',
  '医联体',
  '名医经验',
  '医患关系',
  '医德教育',
  '综合医院',
  '中西医结合',
  '医院感染',
  '全科医学',
  '中医院',
  '名老中医',
  '上海',
  '规范化培训',
  '患者',
  '中医学术史',
  '医疗',
  '中医',
  '整合医学',
  '医院建设项目',
  '中西医结合医院',
  '中医药大学',
  '省级',
  '分级诊疗',
  '民族医',
  '北京',
  '公立医院',
  '满意度',
  '实践',
  '临床医师',
  '医疗机构',
  '发展',
  '医生',
  '医师',
  '中医医院',
  '认知',
  '医疗保险',
  '对策',
  '医家',
  '医学伦理',
  '军队医院',
  '管理',
  '临床科室',
  '医学教育',
  '文化建设',
  '过度医疗',
  '建议',
  '医德医风',
  '医疗改革',
  '旴江医学',
  '中医护理',
  '医疗服务',
  'WONCA',
  '医疗费用',
  '眼科医院',
  '思考',
  '中医药特色',
  '医院文化',
  '地方流派',
  '国家中医药管理局',
  '医院',
  '新医改',
  '医籍',
  '医案',
  '医德',
  '调查',
  '医学伦理学',
  '中医医疗',
  '建设',
  '探讨',
  '医学人文',
  '医药',
  '医院管理',
  '创新',
  '社区卫生服务',
  '医患沟通',
  '地域分布',
  '住院医师',
  '中医药文化'],
 'year': [2011, 2012, 2013, 2014,