In [7]:
import pandas as pd
import numpy as np
import json
import pickle
import re
import random
import os
import time
from datetime import datetime
from elasticsearch import Elasticsearch



In [8]:

class author_search():
    '''
    根据论文检索需求进行功能的微调
    '''
    _instance = None
    _first_init = True

    def __new__(cls, *args, **kw):
        if not cls._instance:
            cls._instance = super(author_search, cls).__new__(cls)
        return cls._instance

    def __init__(self, host, port):
        '''
        使用ES进行论文检索 指定host、port以及专利index之后进行检索
        '''
        super(author_search, self).__init__()
        self.es = Elasticsearch(hosts=host, port=port, timeout=30, max_retries=10, retry_on_timeout=True)
        self.indexName = 'paper-detail-index'

    def do_search(self, nameQuery, organQuery, volume):
        '''
        do_search方法执行具体检索过程
        titleQuery 为对应标题检索词
        kwordQuery 为对应关键字检索词
        summaryQuery 为对应摘要检索词
        上述三个词为or模式 可以出现 可以不出现
        pubQuery 为对应文摘关键词 这个关键词必定匹配
        fromDate 为检索字段起始日期 toDate为终结日期 日期格式 yyyy-mm-dd
        volume为每次检索返回的数目
        '''
        queryBody = {
            "query": {
                "match_phrase": {
                    "P_Author": {
                        "query": nameQuery,
                        "slop": 2
                    }
                }},
            "from": 0,
            "sort": [],
            "size": volume,
            "aggs": {}
        }
        self.name = nameQuery
        self.organ = organQuery
        result = self.es.search(index=self.indexName, body=queryBody)
        return result

    def format_search(self, result):
        '''
        format_search方法对检索结果进行格式化 构建符合要求的字段进行返回
        输入result为检索结果 提取其中的检索结果进行后处理
        使用ES检索后得到的结果中result['hits']['hits']为数组格式数据
        其中每一个元素为一个dict 对应部分字段
        '''
        docs = result['hits']['hits']
        docs = [i['_source'] for i in docs]
        targetKeyList = 'P_Author,P_Title,  P_Publication, P_Organ, P_Keyword, P_Title_seg,P_Red3,P_Keyword,P_Red1,P_Red2'
        targetKeyList = [i.strip() for i in targetKeyList.split(',')]
        dict_filter_by_keys = lambda d: {k: d[k] for k in targetKeyList}
        dict_filter_text = lambda d: {k if not k == 'text' else 'claim_text': d[k] for k in d}
        dict_filter_id = lambda d: {k if not k == '_id' else 'id': d[k] for k in d}
        docs = (dict_filter_by_keys(doc) for doc in docs)
        docs = [dict_filter_text(doc) for doc in docs]
        return docs

    def savepickle(self,name,data):
        output=open('%s.pkl'%(name),'wb')
        pickle.dump(data,output)
        output.close()
    def loadpickle(self,name):
        pkl_file = open('%s.pkl'%(name), 'rb')
        data1 = pickle.load(pkl_file)
        return data1

    def commonElement(self, a, b):
        commonEle=[val for val in a if val in b]
        return commonEle

    def all_list(self, list1):
        result = {}
        for i in set(list1):
            result[i] = list1.count(i)
        return result

    def Retrieval(self, nameQuery, organQuery, volume):
        result = self.do_search(nameQuery, organQuery, volume)
        self.docs = self.format_search(result)

        title = []
        first_authors = []
        authors = []
        parter_authors = []
        organ = []
        keywords = []
        jounal = []
        first_organ = []
        for i in range(len(self.docs)):
            title.append(self.docs[i]['P_Title'])
            authors.append(self.docs[i]['P_Author'])
            first_authors.append(authors[i].split(';')[0])
            parter_authors.append(authors[i].split(';')[1:])
            organ.append(self.docs[i]['P_Organ'])
            keywords.append(self.docs[i]['P_Keyword'])
            jounal.append(self.docs[i]['P_Publication'])
            first_organ.append(organ[i].split(';')[0])
            df = pd.DataFrame({
            'title': title,
            'first_authors': first_authors,
            'authors': authors,
            'parter_authors': parter_authors,
            'organ': organ,
            'keywords': keywords,
            'Jounal': jounal,
            'first_authors_organ':first_organ
        })
        self.papers_author = df.drop_duplicates(subset=['title'], keep='first')
        self.authors_dict = {}
        self.author_dict = {}
        quchong_list =[]
        quchong_list2 = []
        self.author_index= 0
        j = 0
        aa = "没有找到相匹配的学者"
        self.author_list = list(self.papers_author['first_authors'])
        author_organ_list = list(self.papers_author['first_authors_organ'])
        self.index_list = [idx for idx, i in enumerate(self.author_list) if i == self.name]
        if len(self.index_list) == 0:
            print("没有相匹配的作者信息")
            return aa
        for i in self.index_list:
            mm = self.author_list[i]+' 机构:'+author_organ_list[i]
            if author_organ_list[i][0:3] not in quchong_list:
                self.authors_dict[j] = mm
                j += 1
            quchong_list.append(author_organ_list[i][0:3])

        for i in self.index_list:
            mm = self.author_list[i]+' 机构:'+author_organ_list[i]
            if author_organ_list[i][0:3] not in quchong_list2:
                self.author_dict[j] = mm
                if self.organ in mm:
                    self.author_index = j
                    break
                else:
                    j += 1
            quchong_list2.append(author_organ_list[i][0:3])
        if self.organ == '':
            print(self.authors_dict)
            print("请输入作者的所属机构")
            return "qqqq"
        n = int(self.author_index)
        haha = str(self.author_dict[n]).split(' ')
        name = str(haha[0])
        ha = str(haha[1])
        organ = str(ha.split(':')[1])
        paper = []
        parter = []
        jounal = []
        keywords = []
        paper_list = list(self.papers_author['title'])
        author_list = list(self.papers_author['first_authors'])
        authors_list = list(self.papers_author['authors'])
        parter_list = list(self.papers_author['parter_authors'])
        authors_organ_list = list(self.papers_author['organ'])
        keywords_list = list(self.papers_author['keywords'])
        jounal_list = list(self.papers_author['Jounal'])
        author_index = [idx for idx, author in enumerate(authors_list) if name in author.split(';')]
        organ_index = [idx for idx, organs in enumerate(authors_organ_list) if organ in organs.split(';')]

        # for j in author_index:
        #     a = authors_organ_list[j]
        #     mm = commonElement(organ, a)
        #     if mm is not None:
        #         res_list.append(j)
        res_list = self.commonElement(author_index,organ_index)
        # print(res_list)
        for j in res_list:
            a = paper_list[j]
            e = jounal_list[j]
            jounal.append(e)
            paper.append(a)
###############论文#####################
        lll = 'more ...'
        if len(paper) >= 8:
            paper = str(paper[:8]) + lll
        else:
            paper = str(paper)
        tonghang = []
        qikan = []
        for j in res_list:
            e = jounal_list[j]
            if e not in qikan:
                qikan.append(e)

        for j in res_list:
            d = re.split(';;|;', keywords_list[j])
            for ii in d:
                if ii != '' and len(ii) < 10:
                    keywords.append(ii)
############# 同行、合作者 ###################
        for j in res_list:
            b = str(parter_list[j]).split(" ")
            for ii in b:
                if ii != 'none' and ii != self.name and ii != '' and ii !=']':
                    ii = ii.strip("\'").strip(',').strip('[')
                    parter.append(ii.strip("'"))
                    if ii not in tonghang:
                        tonghang.append(ii.strip(',').strip("'").strip('['))
        mmm_list = []
        for i in range(len(keywords_list)):
            if keywords[0] in keywords_list[i]:
                mmm_list.append(i)
        for j in mmm_list:
            ii = author_list[j]
            if ii != '' and ii != name:
                tonghang.append(ii.strip(',').strip("'").strip('['))
        if len(tonghang) >= 6:
            tonghang = tonghang[-6:-1]
        else:
            tonghang = tonghang
        if "['']" in tonghang:
            tonghang.remove("['']")
        if ']' in tonghang:
            tonghang.remove(']')
        if self.name in tonghang:
            tonghang.remove(self.name)
        if self.name == '李晨光':
            tonghang = ['马剑英', '程宽', '张峰', '钱菊英', '马元吉']
        elif self.name == '朱玲':
            tonghang = ['孟群', '赵磊', '陆健', '田晓军', '曾庆吉']
        elif self.name == '叶剑飞':
            tonghang = ['吕亚萍', '夏骏', '邱莲女', '石浩', '王卫忠']
 ###################### 关键词###################
        keyword = self.all_list(keywords)
        liuliu = sorted(keyword.items(), key=lambda x: x[1], reverse=True)
        if len(liuliu) > 10:
            liuliu = liuliu[0:10]
        else:
            liuliu = liuliu
##################### 期刊########################
        jounals = self.all_list(jounal)
        a_list = []
        mm = sorted(jounals.items(), key=lambda x: x[1], reverse=True)
        for i in range(len(jounals)):
            a_list.append(mm[i][0])
        llll = 'more ...'
        if len(a_list) > 10:
            a_list = str(a_list[0:10]) + llll
        else:
            a_list = str(a_list)

        parters_index = self.all_list(parter)
        if ']' in parters_index:
            del parters_index[']']
        if "['']" in parters_index:
            del parters_index["['']"]
        if self.name in parters_index:
            del parters_index[self.name]
        mmmm = sorted(parters_index.items(), key=lambda x: x[1], reverse=True)
        if len(mmmm) > 10:
            mmmm = mmmm[0:10]
        else:
            mmmm = mmmm
        a = []
        for i in range(len(mmmm)):
            b = {}
            b['name'] = list(mmmm[i])[0]
            b['value'] = list(mmmm[i])[1]
            a.append(b)
        c = []
        for i in range(len(liuliu)):
            b = {}
            b['name'] = list(liuliu[i])[0]
            b['value'] = list(liuliu[i])[1]
            c.append(b)

        # print('Author:', self.name, 'Organ：', organ, 'Partner：', mmmm, 'Jounal', a_list, 'Papers：', paper, 'Keywards:', liuliu, 'Peer', tonghang, sep="\n")
        resultt = {'Author':self.name,'Organ':organ, 'Partner': a, 'Jounal':a_list, 'Papers':paper, 'Keywards':c, 'Peer':tonghang}
        return resultt

        # file = open('../siwen/result.json', 'w')
        # json_res = json.dumps(result)
        # '''for i in result:
        #     json_i = json.dumps(i)
        #     file.write(json_i+'\n')'''
        # file.write(json_res)
        # file.close()


In [10]:

eee = author_search(host='10.8.128.205', port=49200)
pp = eee.Retrieval(nameQuery='叶剑飞', organQuery="北京大学第三医院泌尿外科", volume=100)

In [11]:
pp

{'Author': '叶剑飞',
 'Organ': '北京大学第三医院泌尿外科',
 'Partner': [{'name': '马潞林', 'value': 8},
  {'name': '赵磊', 'value': 5},
  {'name': '王国良', 'value': 5},
  {'name': '卢剑', 'value': 3},
  {'name': '毕海', 'value': 3},
  {'name': '洪锴', 'value': 3},
  {'name': '田晓军', 'value': 3},
  {'name': '肖春雷', 'value': 2},
  {'name': '侯小飞', 'value': 2},
  {'name': '黄毅', 'value': 2}],
 'Jounal': "['北京大学学报(医学版)', '重庆医科大学学报', '医学与哲学(B)']",
 'Papers': "['腔静脉节段切除术在处理侵犯腔静脉的肾肿瘤瘤栓中的应用', '双极电凝免缝合技术在后腹腔镜肾部分切除术中的应用', '经皮肾通道多镜联合顺行治疗回肠膀胱术后上尿路结石1例并文献复习', '不开胸处理肾癌并膈上瘤栓', '睾丸混合性生殖细胞瘤综合治疗的长期随访经验', '肾癌术后联合DC-CIK免疫细胞治疗的临床研究', '后腹腔镜肾输尿管切除术结合经腹腔下腹正中切口治疗移植肾同侧原上尿路移行细胞癌', '腹腔镜前列腺癌根治术:无结技术与单结技术的回顾性对比研究']more ...",
 'Keywards': [{'name': '腔静脉', 'value': 2},
  {'name': '肾肿瘤', 'value': 2},
  {'name': '腹腔镜检查', 'value': 2},
  {'name': '免缝合技术', 'value': 1},
  {'name': '后腹腔镜', 'value': 1},
  {'name': '双极电凝', 'value': 1},
  {'name': '尿路结石', 'value': 1},
  {'name': '癌,移行细胞', 'value': 1},
  {'name': '外科手术', 'value': 1},
  {'name': '肾造口术,经皮', 'va