# 将检索模块和文本聚类模块进行封装

In [1]:
import time
import random
import os
import re
import pandas as pd
import numpy as np
from datetime import datetime
from elasticsearch import Elasticsearch

# ES检索模块重构为一个对象

In [2]:
class esPaperRetrieval():
    '''
    根据论文检索需求进行功能的微调
    '''
    _instance = None
    _first_init = True
    def __new__(cls, *args, **kw):
        if not cls._instance:
            cls._instance = super(esPatentRetrieval, cls).__new__(cls)  
        return cls._instance
    
    def __init__(self, host, port):
        '''
        使用ES进行论文检索 指定host、port以及专利index之后进行检索
        '''
        super(esPatentRetrieval, self).__init__()
        self.es = Elasticsearch(hosts=host, port=port, timeout=30, max_retries=10, retry_on_timeout=True)
        self.indexName = 'patent-index'

    def do_search(self, query, fromDate, toDate, volume):
        '''
        do_search方法执行具体检索过程
        query传入检索词 formDate和toDate分别为检索的前后时间
        volume为返回文档数目
        '''
        queryKeywordDict = {"match": {"text": query}
                            }
        queryDateRangeDict = {"range": {
            "publicationDate": {
                "gt": fromDate,
                "lt": toDate
            }
        }
        }
        queryBody = {"query": {"bool": {
            "must": [queryDateRangeDict, queryKeywordDict],
            "must_not": [],
            "should": []
        }},
            "from": 0,
            "size": volume,
            "sort": [],
            "aggs": {}
        }
        result = self.es.search(index=self.indexName, body=queryBody)
        return result

    def format_search(self, result):
        '''
        format_search方法对检索结果进行格式化 构建符合要求的字段进行返回
        输入result为检索结果 提取其中的检索结果进行后处理
        使用ES检索后得到的结果中result['hits']['hits']为数组格式数据
        其中每一个元素为一个dict 对应部分字段
        '''
        docs = result['hits']['hits']
        docs = [i['_source'] for i in docs]
        targetKeyList = 'id pub_id abstractDesc text examiner priorityDocNum agentName title applicationDocNum agentPersonName applicant ipcMain priorityDate inventor assignee publicationDate ipcList applicationDate firstApplicant'
        targetKeyList = targetKeyList.split(' ')
        dict_filter_by_keys = lambda d: {k: d[k] for k in targetKeyList}
        dict_filter_text = lambda d: {k if not k == 'text' else 'claim_text': d[k] for k in d}
        dict_filter_id = lambda d: {k if not k == '_id' else 'id': d[k] for k in d}
        docs = (dict_filter_by_keys(doc) for doc in docs)
        docs = [dict_filter_text(doc) for doc in docs]
        return docs
    def Retrieval(self, query, fromDate, toDate, volume):
        result = self.do_search(query, fromDate, toDate, volume)
        docs = self.format_search(result)
        return docs

In [3]:
es_obj = esPatentRetrieval(host='10.8.128.205',port=29200)

In [4]:
docs = es_obj.Retrieval('机器人','2000-01-06','2010-01-06',volume=5000)

# 客户端对于后端数据接口的访问

In [6]:
import requests

body = {'searchQuery':'机器人',
        'fromDate':'2000-06-02',
        'toDate':'2016-01-01',
        'volume':1000}

baseUrl = 'http://10.8.128.205:29280/Lawbda/dataWare/1.0.0/patent/search'

docs = requests.get(baseUrl,params=body)

# 文本聚类过程封装

In [8]:
import jieba

import pickle

from sklearn.cluster import KMeans

In [9]:
class topicCluster():
    _instance = None
    _first_init = True
    def __new__(cls, *args, **kw):
        if not cls._instance:
            cls._instance = super(topicCluster, cls).__new__(cls)
        return cls._instance
    def __init__(self,):
        # 初始化分词器
        if self._first_init:
            print('init jieba')
            self.tokenizer = lambda x:jieba.lcut(x)
            self.tokenizer('我爱北京天安门')
            # 读取向量化对象
            print('init vectirization')
            self.tfidf_obj = self.load_pickle('../data/patent_data/model/patent_tfidf_model.pkl')
            self.lda_obj = self.load_pickle('../data/patent_data/model/patent_lda_topic9_model.pkl')
            # 读取关键词列表
            print('init kword')
            self.keyword_df = pd.read_pickle('../data/patent_data/processed/claim_kword_df_tr4w.pkl')
            kword_map = {i[2]:i[4] for i in self.keyword_df.itertuples()}
            self.kword_map = {k:kword_map[k].split(' ') for k in kword_map}
            print('done!')
            self._first_init = False
    def data_wash(self,_str):
        '''
        数据清洗对象
        '''
        pattern_num = re.compile('\d{4}\.')
        pattern_num1 = re.compile('\d{1}\.')
        _str = re.sub(pattern_num,'',_str)
        _str = re.sub(pattern_num1,'',_str)
        return _str
    
    def do_cluster(self,texts,n_clusters):
        '''
        文本聚类对象
        '''
        km_obj = KMeans(n_clusters=n_clusters)
        texts = [self.data_wash(i) for i in texts]
        texts = [self.tokenizer(i) for i in texts]
        texts = [' '.join(i) for i in texts]
        tfidf_vec = self.tfidf_obj.transform(texts)
        lda_vec = self.lda_obj.transform(tfidf_vec)
        cluster_result = km_obj.fit_predict(lda_vec)
        return cluster_result
    
    def do_kword(self,ids):
        '''
        由于预先对文本进行关键词抽取 因此只要传入对应文本id即可查表找到关键词
        '''
        kword_list = [self.kword_map[k] for k in ids]
        return kword_list
    
    def do_format(self,cluster_wordtable):
        '''
        用于将输出结果格式化为API要求格式
        '''
        api_result = []
        for topicID in cluster_wordtable:
            temp_dict = {}
            temp_dict['topicID'] = topicID
            temp_dict['topicWords'] = []
            for k in cluster_wordtable[topicID]:
                temp_temp_dict = {}
                temp_temp_dict['word'] = k
                temp_temp_dict['freq'] = float(cluster_wordtable[topicID][k])
                temp_dict['topicWords'].append(temp_temp_dict)
            api_result.append(temp_dict)
        return api_result
    def Cluster(self,ids,texts,n_clusters):
        '''
        主方法 传入文本以及文本对应id 
        输出聚类后文本类别和对应关键词
        如要求聚类数目为10 则输出
        '''
        cluster = self.do_cluster(texts,n_clusters)
        kword = self.do_kword(ids)
        topk = 20
        cluster_wordtable = {}
        for num,i in enumerate(cluster):
            if not i in cluster_wordtable:
                cluster_wordtable[i] = {}
            kword_list = kword[num]
            for word in kword_list:
                if not word in cluster_wordtable[i]:
                    cluster_wordtable[i][word] = 0
                cluster_wordtable[i][word]+=1
            cluster_wordtable[i] = {k:cluster_wordtable[i][k]for k in 
                                    sorted(cluster_wordtable[i],key=lambda x:cluster_wordtable[i][x],reverse=True)[:topk]}
        final_result = self.do_format(cluster_wordtable)
        return final_result
    @staticmethod
    def dump_pickle(obj,fname):
        with open(fname,'wb') as file:
            pickle.dump(obj,file)
    @staticmethod
    def load_pickle(fname):
        return pickle.loads(open(fname,'rb').read())

In [10]:
ppp = topicCluster()

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache


init jieba


Loading model cost 0.793 seconds.
Prefix dict has been built succesfully.


init vectirization
init kword
done!


In [11]:
qqq = topicCluster()

In [12]:
ppp == qqq

True

In [55]:
texts = [i['claim_text'] for i in docs.json()]
ids = [i['pub_id'] for i in docs.json()]

In [65]:
cluster = ppp.Cluster(ids,texts,n_clusters=5)

In [25]:
qqq == ppp

True

# 时间测试

In [9]:
fromDate = '2010-01-01'
toDate = '2016-01-01'

In [24]:
ppp = RetrievalMeta()

In [20]:
esPatentSearchObject = esPatentRetrieval(host='10.8.128.205', port=29200)
docs = esPatentSearchObject.Retrieval(query=searchQuery, fromDate=fromDate, toDate=toDate, volume=5000)
texts = [i['claim_text'] for i in docs]
ids = [i['pub_id'] for i in docs]
cluster_obj = topicCluster()
result = cluster_obj.Cluster(ids=ids, texts=texts, n_clusters=topicNum)

AttributeError: 'NoneType' object has no attribute 'Retrieval'

In [None]:
esPatentSearchObject = esPatentRetrieval(host='10.8.128.205', port=29200)
docs = esPatentSearchObject.Retrieval(query=searchQuery, fromDate=fromDate, toDate=toDate, volume=5000)
texts = [i['claim_text'] for i in docs]
ids = [i['pub_id'] for i in docs]
cluster_obj = topicCluster()
result = cluster_obj.Cluster(ids=ids, texts=texts, n_clusters=topicNum)

# 时序聚类过程的封装

In [75]:
pd.__version__

'0.24.2'

In [76]:
import sklearn

In [77]:
sklearn.__version__

'0.19.1'