In [None]:
import pandas as pd
import numpy as np
import json
import os
import csv
import pickle
import jieba
import re,time
from sklearn.decomposition import LatentDirichletAllocation
#导入PCA算法库
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from textrank4zh import TextRank4Keyword, TextRank4Sentence
from sklearn import ensemble
from functools import reduce
from sklearn.metrics import mean_squared_error #均方误差
from sklearn.metrics import mean_absolute_error #平方绝对误差
from sklearn.metrics import r2_score#R square
import math
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn import neighbors
from sklearn import ensemble
from sklearn.externals import joblib

In [2]:
from elasticsearch import Elasticsearch 

In [3]:
def distance(a, b):
    #  计算两个vector的距离，逐项差的平方和，不开根号
    if len(a) != len(b):
        print('wrong')
        return 0
    else:
        temp = a - b
        temp = temp ** 2
        res = sum(temp)
        return res
def get_dist_mat(feat):
    #  输入为特征矩阵，返回一个相似度矩阵（行数乘行数）
    print(type(feat))
    length=feat.shape[0]#样本数量
    wide = feat.shape[1]#特征向量长度
    res = np.zeros(shape=(length, length))
    ave = np.zeros(shape=(length, length, wide))

    for i in range(0, length):
        for j in range(i+1, length):
            ave[i, j] = np.mean(feat[i: j+1], axis=0)
            # print('i = %d, j = %d' % (i, j))

    for i in range(0, length):
        for j in range(i+1, length):
            ave_one = ave[i, j]
            dist_lst = [distance(ave_one, one) for one in feat[i: j+1]]
            res[i, j] = sum(dist_lst)
            # print('i = %d, j = %d' % (i, j))
    return res

def get_class_devide(dist_mat, class_num):
    #  利用递推公式计算分成class_num类别的分割方法
    length = len(dist_mat)
    divided_point = np.zeros(shape=(length, class_num+1))  # 存切割点
    diveded_dist = np.zeros(shape=(length, class_num+1))  # 存切割最小距离

    for i in range(1, length):  # 分成两类
        dist_lst = [dist_mat[0, k] + dist_mat[k+1, i] for k in range(0, i)]
        divided_point[i, 2] = np.argmin(dist_lst)
        diveded_dist[i, 2] = np.min(dist_lst)

    for classes in range(3, class_num+1):   # 分成多类
        for i in range(classes-1, length):
            dist_lst = [diveded_dist[k, classes-1] + dist_mat[k+1, i] for k in range(classes-2, i)]  # 从n-1类到n类
            divided_point[i, classes] = np.argmin(dist_lst) + classes - 2
            diveded_dist[i, classes] = np.min(dist_lst)
    return diveded_dist, divided_point

def get_trace(trace_mat, class_num):
    #  获得切割方法list，
    #  trace_mat就是get_class_devide函数返回的divided_point，分割点记录矩阵
    lst = []
    length = len(trace_mat)  #length=38
    pre = length - 1   #pre=37
    for i in range(class_num, 1, -1):
        temp = int(trace_mat[pre, i])
        pre = temp
        lst.append(temp)
    lst.reverse()
    return lst
def get_one_essay_trace(feature,phrase_num):
    #  整合调用 有序聚类方法
    #  调用就可以获得一个文件的分割结果并存入本地

    time_start = time.time()
    dist_mat = get_dist_mat(feature)
    phrase_num = phrase_num
    min_dist_mat, min_trace_mat = get_class_devide(dist_mat, phrase_num)
    # print(min_trace_mat)

    lst = get_trace(min_trace_mat, phrase_num)
    lst.append(len(feature))
    print('划分结果为：',end='')
    print(lst)
    time_end = time.time()
    print('cost time', time_end - time_start)
    return lst,min_dist_mat,min_trace_mat
def observe_divided(lst, sents):
    # 展示分割结果
    split_result=[]
    start = 0
    for end in lst:
        cla_lst=[]
        if end==len(sents):
            end=end-1
        for i in range(start, end+1):
            cla_lst.append(sents[i])
        start = end + 1
        split_result.append(cla_lst)
    return split_result

In [4]:
class esPaperRetrieval():
    '''
    根据论文检索需求进行功能的微调
    '''
    _instance = None
    _first_init = True
    def __new__(cls, *args, **kw):
        if not cls._instance:
            cls._instance = super(esPaperRetrieval, cls).__new__(cls)  
        return cls._instance
    
    def __init__(self, host, port):
        '''
        使用ES进行论文检索 指定host、port以及专利index之后进行检索
        '''
        super(esPaperRetrieval, self).__init__()
        self.es = Elasticsearch(hosts=host, port=port, timeout=30, max_retries=10, retry_on_timeout=True)
        self.indexName = 'paper-detail-index'

    def do_search(self, titleQuery, kwordQuery, summaryQuery ,pubQuery,fromDate, toDate, volume):
        '''
        do_search方法执行具体检索过程
        titleQuery 为对应标题检索词
        kwordQuery 为对应关键字检索词
        summaryQuery 为对应摘要检索词
        上述三个词为or模式 可以出现 可以不出现
        pubQuery 为对应文摘关键词 这个关键词必定匹配
        fromDate 为检索字段起始日期 toDate为终结日期 日期格式 yyyy-mm-dd
        volume为每次检索返回的数目
        '''
        queryBody = {
          "query": {
            "bool": {
              "should": [
                {
                  "term": {
                    "P_Title": titleQuery
                  }
                },
                {
                  "term": {
                    "P_Keyword": kwordQuery
                  }
                },
                {
                  "term": {
                    "P_Summary": summaryQuery
                  }
                },
                  {
                  "term": {
                    "P_Publication.keyword": {"value":pubQuery}
                  }
                  }
              ],
              "filter": [
                {
                  "range": {
                    "P_year": {
                      "gt": fromDate,
                      "lt": toDate
                    }
                  }
                },
              ]
            }
          },
            "from": 0,
            "size": volume,
            "sort": [],
            "aggs": {}
        }
#         print(queryBody)
        result = self.es.search(index=self.indexName, body=queryBody)
        return result

    def format_search(self, result):
        '''
        format_search方法对检索结果进行格式化 构建符合要求的字段进行返回
        输入result为检索结果 提取其中的检索结果进行后处理
        使用ES检索后得到的结果中result['hits']['hits']为数组格式数据
        其中每一个元素为一个dict 对应部分字段
        '''
        docs = result['hits']['hits']
        docs = [i['_source'] for i in docs]
        targetKeyList = 'P_ID, P_Title, P_Author, P_Publication, P_Organ, P_year, P_Keyword, \
    P_Summary, P_Keyword_seg, P_Title_seg,\
    P_Summary_seg, P_URL, P_Fields, P_Fields_two,P_References, P_Pagecount, P_Page, P_Language,\
    P_Download_num, P_Citation_num,P_Vector,P_Volume, P_Issue,P_Issn,P_Isbn, P_Doi,\
    P_Red1, P_Red2, P_Red3, P_Red4, P_Red5'
        targetKeyList = [i.strip() for i in targetKeyList.split(',')]
        dict_filter_by_keys = lambda d: {k: d[k] for k in targetKeyList}
        dict_filter_text = lambda d: {k if not k == 'text' else 'claim_text': d[k] for k in d}
        dict_filter_id = lambda d: {k if not k == '_id' else 'id': d[k] for k in d}
        docs = (dict_filter_by_keys(doc) for doc in docs)
        docs = [dict_filter_text(doc) for doc in docs]
        return docs
    def Retrieval(self, titleQuery, kwordQuery, summaryQuery ,pubQuery,fromDate, toDate, volume):
        result = self.do_search(titleQuery, kwordQuery, summaryQuery ,pubQuery,fromDate, toDate, volume)
        docs = self.format_search(result)
        return docs

In [5]:
class pre_word_count():
    def __init__(self,data_path,model_path):
        self.data = pd.read_json(data_path)
        self.data['authors_seg'] = self.data.apply(lambda r: r['P_Red1'] + r['P_Red3'],axis=1)
        self.data['year'] = self.data['P_year'].apply(lambda x:x[:4])
        self.model_path = model_path
    # 统计出现在句子中的词频
    def tf(word,sentence):
        c=0
        s=sentence.split()
        for i in s:
            if word==i:
                c+=1
        return c
        
    #建立tfidf表
    def build_tfidf(max_feature,data):
        tfidf=TfidfVectorizer(max_features=max_feature)
        f_tfidf=tfidf.fit_transform(data)
        f_tfidf=f_tfidf.toarray()
        return f_tfidf
        
    # 统计非重复的所有作者
    def tongjiau(self,li):
        authors=[]
        for i in li:
            try:
                authors+=i.split()
            except:
                continue
        return list(set(authors))
        
    # 判断新作者是否在论文中提及热点词
    def mention(self,authors,word,sentences,senten_auth):
        i=0
        for a in authors:
            for j in range(len(sentences)):
                try:
                    if len(list(set(a).intersection(set(senten_auth[j]))))>0 and word in sentences[j]:
                        i+=1
                except:
                    continue
        return i
        
    # 统计某个热点词在某年的跨学科提及数
    def multi_journal(word,sentences,journals):
        men_journal=[]
        for j in range(len(sentences)):
            try:
                if word in sentences[j]:
                    men_journal.append(journals[j])
            except:
                continue
        s=set(men_journal)
        if None in s:
            s.remove(None)
        return len(s)
        
    # 抽取词频特征
    def feature(word,year_df):
        abstract_tf=[]
        keyword_tf=[]
        for y in list(year_df['seg_abstract']):
            abstract_tf.append(tf(word,y))
        for i in list(year_df['seg_keywords']):
            keyword_tf.append(tf(word,i))
        return abstract_tf,keyword_tf
        
    #计算list的平均数

    def pingjun(L):

        a=reduce(lambda x,y:x+y,L)

        return int(a/len(L))
    # 获取热点词
    def get_hotwords(self,hotword_num):
        guanjian=list(map(lambda x:' '.join(x.split()),list(self.data['P_Keyword_seg'])))
        merge=' '.join(guanjian)
        tr4w = TextRank4Keyword()
        tr4w.analyze(text=merge, window=4)
        pred_words=[]
        for item in tr4w.get_keywords(hotword_num, word_min_len=2):
            pred_words.append(item.word)
        return pred_words
    # 统计每年发表的论文数
    def year_paper_num(self,):
        year_num=[]
        years=list(set(self.data['year']))
        group_df=self.data.groupby(by=['year'])
        # years.remove(None)
        years=sorted([int(i) for i in years])
        for y in years:
            try:
                year_num.append(len(group_df.get_group(str(y))))
            except:
                continue
        return group_df,years,year_num
    # 按年份将摘要整合
    def groupby_year(self,group_df,years,year_num):
        years_df=pd.DataFrame()
        years_df['year']=years
        seg,key,tit,jou,authors=[],[],[],[],[]

        for i in years:
            seg.append(' '.join(group_df.get_group(str(i))['P_Summary_seg']))
            k=group_df.get_group(str(i))['P_Keyword_seg']
            key.append(' '.join(k))
            tit.append(' '.join(group_df.get_group(str(i))['P_Title_seg']))
            jou.append(' '.join(group_df.get_group(str(i))['P_Publication']))
            authors.append(self.tongjiau(group_df.get_group(str(i))['authors_seg']))
        years_df['seg_abstract'] = seg
        years_df['seg_keywords'] = key
        years_df['seg_title'] = tit
        years_df['seg_authors'] = authors
        years_df['Jounal'] = jou
        years_df['paper_num'] = year_num
        return years_df
    # 统计每年的新作者    
    def year_new_author(self,years_df):
        authors = list(years_df['seg_authors'])
        new_author = [authors[0]]
        for i in range(1,len(years_df)):
            new=[]
            for j in authors[i]:
                if j not in authors[i-1]:
                    new.append(j)
            new_author.append(new)
        return new_author
    # 统计每年新作者提及数
    def year_new_author_mention(self,pred_words,new_author,years):
        newauthor_mens=[]
        group_df=self.data.groupby(by=['year'])
        for word in pred_words:
            for i in range(len(new_author)):
                newauthor_men=self.mention(new_author,word,list(group_df.get_group(str(years[i]))['P_Summary']),list(map(lambda x:x.split(),list(group_df.get_group(str(years[i]))['seg_authors']))))
                newauthor_mens.append(newauthor_men)  
        return newauthor_mens
    # 统计跨学科提及数
    def multi_sub(self,pred_words):
        multi_mens = []
        group_df = self.data.groupby(by=['year'])
        for word in pred_words:
            for i in range(len(years)):
                journals=multi_journal(word,list(group_df.get_group(str(years[i]))['P_Summary']),list(group_df.get_group(str(years[i]))['Jounal']))
                multi_mens.append(journals)
        return multi_mens
    # 统计每个词初次在关键词中出现的年份
    def first_year(self,pred_words,years_df):
        f_year=[]
        pp=[]
        for w in range(len(pred_words)):
            for y,i in zip(list(years_df['year']),list(years_df['seg_keywords'])):
                if pred_words[w] in i:
                    f_year.append(y)
                    pp.append(w)
                    break
        return f_year
    #获取每个词18年的特征
    def get_feature(self,years_df,pred_words,f_year,newauthor_mens,multi_mens):
        year_num=len(years_df)
        pp=pd.DataFrame()
        for i in range(len(pred_words)):
            word=pred_words[i]
            fyear=f_year[i]
            exist_year=[]
            for n,f in zip(list(years_df['year']),[fyear]*year_num):
                e=n-f
                if e<0:
                    exist_year.append(0)
                else:
                    exist_year.append(e)
            dd=pd.DataFrame()
            dd['word']=[word]*year_num
            dd['f1']=feature(word,years_df)[0]
            dd['f2']=feature(word,years_df)[1]
            dd['f3'] = exist_year
            dd['f4'] = newauthor_mens[i*year_num:i*year_num+(year_num-1)]+[pingjun(newauthor_mens[i*year_num+(year_num-1)-3:i*year_num+(year_num-1)])]
            dd['f6'] = multi_mens[i*year_num:i*year_num+(year_num-1)]+[pingjun(multi_mens[i*year_num+(year_num-1)-3:i*year_num+(year_num-1)])]
            if i==0:
                pp=dd
            else:
                pp=pp.append(dd)
            pp=pp.reset_index(drop=True)
            return pp
    #预测某个词的词频
    def zhidingword(search_word,test_data,model):
        pre_data=test_data.loc[test_data.word==search_word].iloc[:,1:-1]
        result = model.predict(pre_data)
        return int(result)
    #返回结果
    def pre_result(self):
        model_GBR=joblib.load(self.model_path)
        pred_words=self.get_hotwords(100)
        years=self.year_paper_num()[1]
        group_df=self.year_paper_num()[0]
        year_num=self.year_paper_num()[2]
        years_df=self.groupby_year(group_df,years,year_num)
        f_year=self.first_year(pred_words,years_df)
        new_author=self.year_new_author(years_df)
        newauthor_mens=self.year_new_author_mention(pred_words,new_author,years)
        multi_mens=self.multi_sub(pred_words)
        pp=self.get_feature(years_df,pred_words,f_year,newauthor_mens,multi_mens)
        for word in pred_words:
            result=self.zhidingword(word,test_data,model_GBR)
            search_ke=list(pp.loc[pp.word==word]['f1'])+[result]
        result=pd.DataFrame()
        result['word']=pred_words
        result['year']=years
        result['count']=search_ke
        result=result.to_json(orient='index')
        return result

In [6]:
esPaperObj = esPaperRetrieval(host='10.8.128.205',port=49200)

In [7]:
data = esPaperObj.Retrieval(titleQuery='',kwordQuery='',
                     summaryQuery='中医',pubQuery='',fromDate='2001-01-01',toDate='2018-01-01',volume=100)

In [9]:
aaa = pre_word_count(json.dumps(data), '../data/paper_data/paper_hotpoint.model')


In [10]:
aaa.pre_result() 

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.254 seconds.
Prefix dict has been built succesfully.


KeyError: 'seg_authors'