In [6]:
import pandas as pd
import numpy as np
import json
import pickle
import re
import random
import os
import time
from datetime import datetime
from elasticsearch import Elasticsearch


In [12]:
class author_search():
    '''
    根据论文检索需求进行功能的微调
    '''
    _instance = None
    _first_init = True

    def __new__(cls, *args, **kw):
        if not cls._instance:
            cls._instance = super(author_search, cls).__new__(cls)
        return cls._instance

    def __init__(self, host, port):
        '''
        使用ES进行论文检索 指定host、port以及专利index之后进行检索
        '''
        super(author_search, self).__init__()
        self.es = Elasticsearch(hosts=host, port=port, timeout=30, max_retries=10, retry_on_timeout=True)
        self.indexName = 'paper-detail-index'

    def do_search(self, nameQuery, organQuery, volume):
        '''
        do_search方法执行具体检索过程
        titleQuery 为对应标题检索词
        kwordQuery 为对应关键字检索词
        summaryQuery 为对应摘要检索词
        上述三个词为or模式 可以出现 可以不出现
        pubQuery 为对应文摘关键词 这个关键词必定匹配
        fromDate 为检索字段起始日期 toDate为终结日期 日期格式 yyyy-mm-dd
        volume为每次检索返回的数目
        '''
        queryBody = {
            "query": {
                "match_phrase": {
                    "P_Author": {
                        "query": nameQuery,
                        "slop": 2
                    }
                }},
            "from": 0,
            "sort": [],
            "size": volume,
            "aggs": {}
        }
        self.name = nameQuery
        self.organ = organQuery
        result = self.es.search(index=self.indexName, body=queryBody)
        return result

    def format_search(self, result):
        '''
        format_search方法对检索结果进行格式化 构建符合要求的字段进行返回
        输入result为检索结果 提取其中的检索结果进行后处理
        使用ES检索后得到的结果中result['hits']['hits']为数组格式数据
        其中每一个元素为一个dict 对应部分字段
        '''
        docs = result['hits']['hits']
        docs = [i['_source'] for i in docs]
        targetKeyList = 'P_Author,P_Title,  P_Publication, P_Organ, P_Keyword, P_Title_seg,P_Red3,P_Keyword,P_Red1,P_Red2'
        targetKeyList = [i.strip() for i in targetKeyList.split(',')]
        dict_filter_by_keys = lambda d: {k: d[k] for k in targetKeyList}
        dict_filter_text = lambda d: {k if not k == 'text' else 'claim_text': d[k] for k in d}
        dict_filter_id = lambda d: {k if not k == '_id' else 'id': d[k] for k in d}
        docs = (dict_filter_by_keys(doc) for doc in docs)
        docs = [dict_filter_text(doc) for doc in docs]
        return docs

    def Retrieval(self, nameQuery, organQuery, volume):
        result = self.do_search(nameQuery, organQuery, volume)
        self.docs = self.format_search(result)
        return self.result()
        
    def result(self,):
        title = []
        first_authors = []
        authors = []
        parter_authors = []
        organ = []
        keywords = []
        jounal = []
        first_organ = []
        for i in range(len(self.docs)):
            title.append(self.docs[i]['P_Title'])
            authors.append(self.docs[i]['P_Author'])
            first_authors.append(authors[i].split(';')[0])
            parter_authors.append(authors[i].split(';')[1:])
            organ.append(self.docs[i]['P_Organ'])
            keywords.append(self.docs[i]['P_Keyword'])
            jounal.append(self.docs[i]['P_Publication'])
            first_organ.append(organ[i].split(';')[0])
            df = pd.DataFrame({
            'title': title,
            'first_authors': first_authors,
            'authors': authors,
            'parter_authors': parter_authors,
            'organ': organ,
            'keywords': keywords,
            'Journal': jounal,
            'first_authors_organ':first_organ
        })
        self.papers_author = df.drop_duplicates(subset=['title'], keep='first')
        return self.search()
    def savepickle(self,name,data):
        output=open('%s.pkl'%(name),'wb')
        pickle.dump(data,output)
        output.close()
    def loadpickle(self,name):
        pkl_file = open('%s.pkl'%(name), 'rb')
        data1 = pickle.load(pkl_file)
        return data1

    def commonElement(self, a, b):
        commonEle=[val for val in a if val in b]
        return commonEle

    def all_list(self, list1):
        result = {}
        for i in set(list1):
            result[i] = list1.count(i)
        return result

    def search(self,):
        self.authors_dict = {}
        self.author_dict = {}
        quchong_list =[]
        quchong_list2 = []
        self.author_index= 0
        j = 0
        aa = "没有找到相匹配的学者"
        self.author_list = list(self.papers_author['first_authors'])
        author_organ_list = list(self.papers_author['first_authors_organ'])
        self.index_list = [idx for idx, i in enumerate(self.author_list) if i == self.name]
        if len(self.index_list) == 0:
            print("没有相匹配的作者信息")
            return aa
        for i in self.index_list:
            mm = self.author_list[i]+' 机构:'+author_organ_list[i]
            if author_organ_list[i][0:3] not in quchong_list:
                self.authors_dict[j] = mm
                j += 1
            quchong_list.append(author_organ_list[i][0:3])

        for i in self.index_list:
            mm = self.author_list[i]+' 机构:'+author_organ_list[i]
            if author_organ_list[i][0:3] not in quchong_list2:
                self.author_dict[j] = mm
                if self.organ in mm:
                    self.author_index = j
                    break
                else:
                    j += 1
            quchong_list2.append(author_organ_list[i][0:3])
        return self.author()

    def author(self):
        if self.organ == '':
            print(self.authors_dict)
            print("请输入作者的所属机构")
            return "qqqq"
        n = int(self.author_index)
        haha = str(self.author_dict[n]).split(' ')
        name = str(haha[0])
        ha = str(haha[1])
        organ = str(ha.split(':')[1])
        paper = []
        parter = []
        jounal = []
        keywords = []
        paper_list = list(self.papers_author['title'])
        author_list = list(self.papers_author['first_authors'])
        authors_list = list(self.papers_author['authors'])
        parter_list = list(self.papers_author['parter_authors'])
        authors_organ_list = list(self.papers_author['organ'])
        keywords_list = list(self.papers_author['keywords'])
        jounal_list = list(self.papers_author['Journal'])
        author_index = [idx for idx, author in enumerate(authors_list) if name in author.split(';')]
        organ_index = [idx for idx, organs in enumerate(authors_organ_list) if organ in organs.split(';')]
        res_list = self.commonElement(author_index,organ_index)
        for j in res_list:
            a = paper_list[j]
            e = jounal_list[j]
            jounal.append(e)
            paper.append(a)
        tonghang = []
        qikan = []
        for j in res_list:
            e = jounal_list[j]
            if e not in qikan:
                qikan.append(e)

        for j in res_list:
            d = re.split(';;|;', keywords_list[j])
            for ii in d:
                if ii != '' and len(ii) < 10:
                    keywords.append(ii)
    # 同行、合作者
        for j in res_list:
            b = str(parter_list[j]).split(" ")
            for ii in b:
                if ii != 'none' and ii != name and ii != '' and ii !=']':
                    ii = ii.strip("\'").strip(',').strip('[')
                    parter.append(ii.strip("'"))
                    if ii not in tonghang:
                        tonghang.append(ii.strip(',').strip("'").strip('['))
        mmm_list = []
        for i in range(len(keywords_list)):
            if keywords[0] in keywords_list[i]:
                mmm_list.append(i)
        for j in mmm_list:
            ii = author_list[j]
            if ii != '' and ii != name:
                tonghang.append(ii.strip(',').strip("'").strip('['))
        if len(tonghang) >= 6:
            tonghang = tonghang[-6:-1]
        else:
            tonghang = tonghang
        if "['']" in tonghang:
            tonghang.remove("['']")
        if ']' in tonghang:
            tonghang.remove(']')
    # 关键词
        keyword = self.all_list(keywords)
        keyword_s = []
        for j in keyword.keys():
            keyword_s.append(j)
        if len(keyword_s) > 10:
            keyword_s = keyword_s[0:10]
        else:
            keyword_s = keyword_s
    # 期刊
        jounals = self.all_list(jounal)
        a_list = []
        mm = sorted(jounals.items(), key=lambda x: x[1], reverse=True)
        for i in range(len(jounals)):
            a_list.append(mm[i][0])
        if len(a_list) > 3:
            a_list = a_list[0:3]
        else:
            a_list = a_list
        parters_index = self.all_list(parter)
        if ']' in parters_index:
            del parters_index[']']
        if "['']" in parters_index:
            del parters_index["['']"]
        mmmm = sorted(parters_index.items(), key=lambda x: x[1], reverse=True)
        result = {'Author':self.name,'Organ':organ, 'Partner':mmmm, 
                  'Journal':a_list, 'Papers': paper, 'Keywords':keyword_s, 'Peer':tonghang}
        return result

In [13]:
eee = author_search(host='10.8.128.205', port=49200)

In [14]:
result = eee.Retrieval(nameQuery='李晨光', organQuery="哈尔滨医科大学附属第四医院微创神经外科", volume=100)

In [15]:
result

{'Author': '李晨光',
 'Organ': '哈尔滨医科大学附属第四医院微创神经外科',
 'Partner': [('王智', 2),
  ('张伟光', 2),
  ('林朋', 1),
  ('王来藏', 1),
  ('赵景波', 1),
  ('王超', 1),
  ('李晨光', 1),
  ('曹绍东', 1),
  ('秦海燕', 1)],
 'Journal': ['中国微侵袭神经外科杂志'],
 'Papers': ['腰椎间盘突出症的显微手术治疗', '320排CT血管造影检测颅内动脉瘤的临床研究'],
 'Keywords': ['脑血管造影术', '椎间盘移位', '显微外科手术', '颅内动脉瘤', '诊断', '腰椎'],
 'Peer': ['张伟光', '赵景波', '秦海燕', '曹绍东', '林朋']}

In [16]:
result['Partner'] = [{'name':i[0],'value':i[1]} for i in result['Partner']]

convert_keys = ['Journal','Papers','Keywords','Peer']

for k in convert_keys:
    result[k] = [{'value':i} for i in result[k]]

In [None]:
default = {'Author':'','Organ':organ, 'Partner':mmmm, 
                  'Journal':a_list, 'Papers': paper, 'Keywords':keyword_s, 'Peer':tonghang}

In [17]:
result

{'Author': '李晨光',
 'Organ': '哈尔滨医科大学附属第四医院微创神经外科',
 'Partner': [{'name': '王智', 'value': 2},
  {'name': '张伟光', 'value': 2},
  {'name': '林朋', 'value': 1},
  {'name': '王来藏', 'value': 1},
  {'name': '赵景波', 'value': 1},
  {'name': '王超', 'value': 1},
  {'name': '李晨光', 'value': 1},
  {'name': '曹绍东', 'value': 1},
  {'name': '秦海燕', 'value': 1}],
 'Journal': [{'value': '中国微侵袭神经外科杂志'}],
 'Papers': [{'value': '腰椎间盘突出症的显微手术治疗'}, {'value': '320排CT血管造影检测颅内动脉瘤的临床研究'}],
 'Keywords': [{'value': '脑血管造影术'},
  {'value': '椎间盘移位'},
  {'value': '显微外科手术'},
  {'value': '颅内动脉瘤'},
  {'value': '诊断'},
  {'value': '腰椎'}],
 'Peer': [{'value': '张伟光'},
  {'value': '赵景波'},
  {'value': '秦海燕'},
  {'value': '曹绍东'},
  {'value': '林朋'}]}