# Encapsulate the retrieval module and the text-clustering module

In [3]:
import time
import random
import os
import re
import pandas as pd
import numpy as np
from datetime import datetime
from elasticsearch import Elasticsearch

In [4]:
import mysql.connector

# Refactor the ES retrieval module into a single object

In [41]:
class esNewsRetrieval():
    '''
    Retrieve GDELT news/event documents from an Elasticsearch index.

    The class behaves as a singleton: ``__new__`` always returns the same
    instance. ``__init__`` still runs on every construction, so constructing
    the class again replaces the client held by that shared instance.
    '''
    _instance = None
    # NOTE(review): this flag is never read inside the class; it is kept
    # because code outside this view may reference it.
    _first_init = True

    # Fields copied from each hit's ``_source`` by ``format_search``, in
    # output order.
    _SOURCE_FIELDS = (
        'GLOBALEVENTID', 'SQLDATE', 'MonthYear', 'Year', 'FractionDate',
        'Actor1Code', 'Actor1Name', 'Actor1CountryCode',
        'Actor1KnownGroupCode', 'Actor1EthnicCode', 'Actor1Religion1Code',
        'Actor1Religion2Code', 'Actor1Type1Code', 'Actor1Type2Code',
        'Actor1Type3Code',
        'Actor2Code', 'Actor2Name', 'Actor2CountryCode',
        'Actor2KnownGroupCode', 'Actor2EthnicCode', 'Actor2Religion1Code',
        'Actor2Religion2Code', 'Actor2Type1Code', 'Actor2Type2Code',
        'Actor2Type3Code',
        'IsRootEvent', 'EventCode', 'EventBaseCode', 'EventRootCode',
        'QuadClass', 'GoldsteinScale', 'NumMentions', 'NumSources',
        'NumArticles', 'AvgTone',
        'Actor1Geo_Type', 'Actor1Geo_FullName', 'Actor1Geo_CountryCode',
        'Actor1Geo_ADM1Code', 'Actor1Geo_ADM2Code', 'Actor1Geo_Lat',
        'Actor1Geo_Long', 'Actor1Geo_FeatureID',
        'Actor2Geo_Type', 'Actor2Geo_FullName', 'Actor2Geo_CountryCode',
        'Actor2Geo_ADM1Code', 'Actor2Geo_ADM2Code', 'Actor2Geo_Lat',
        'Actor2Geo_Long', 'Actor2Geo_FeatureID',
        'ActionGeo_Type', 'ActionGeo_FullName', 'ActionGeo_CountryCode',
        'ActionGeo_ADM1Code', 'ActionGeo_ADM2Code', 'ActionGeo_Lat',
        'ActionGeo_Long', 'ActionGeo_FeatureID',
        'DATEADDED', 'SOURCEURL', 'MediaGeo', 'Title', 'Content',
    )

    def __new__(cls, *args, **kw):
        # Create the shared instance on first use, then keep returning it.
        if cls._instance is None:
            cls._instance = super(esNewsRetrieval, cls).__new__(cls)
        return cls._instance

    def __init__(self, host, port):
        '''
        Create the Elasticsearch client and select the index to query.

        host, port: forwarded to ``Elasticsearch(hosts=host, port=port, ...)``.
        The client is configured with a 30 second timeout and up to 10
        retries, retrying on timeout as well.
        '''
        super(esNewsRetrieval, self).__init__()
        self.es = Elasticsearch(hosts=host, port=port, timeout=30, max_retries=10, retry_on_timeout=True)
        self.indexName = 'news-gdelt-index'

    @staticmethod
    def _build_query(filters, volume):
        '''
        Build the search request body.

        filters: iterable of ``(field_name, value)`` pairs. Each pair whose
            value is neither ``None`` nor the empty string becomes a ``term``
            clause on ``<field_name>.keyword`` inside ``bool.should``.
        volume: number of hits requested (the ``size`` of the request).

        Returns the request body as a dict.
        '''
        # An unset filter must not turn into a clause: a term query on ""
        # would match documents whose field is an empty string, and a None
        # value is not a valid term.
        should = [
            {"term": {field + ".keyword": value}}
            for field, value in filters
            if value is not None and value != ''
        ]
        return {
            "query": {
                "bool": {
                    "must": [],
                    "must_not": [],
                    "should": should,
                }
            },
            "from": 0,
            "size": volume,
            "sort": [],
            "aggs": {},
        }

    def do_search(self, Actor1Geo_FullName, Actor2Geo_FullName, ActionGeo_FullName,MediaGeo,Title,Content,volume):
        '''
        Run the search against ``self.indexName`` and return the raw response.

        Every non-empty argument is matched exactly against the ``.keyword``
        sub-field of the field with the same name; the clauses are combined
        under ``bool.should``. Arguments given as ``''`` or ``None`` are
        treated as "no filter" and left out of the query.

        volume: number of hits to return per search.
        '''
        filters = (
            ("Actor1Geo_FullName", Actor1Geo_FullName),
            ("Actor2Geo_FullName", Actor2Geo_FullName),
            ("ActionGeo_FullName", ActionGeo_FullName),
            ("MediaGeo", MediaGeo),
            ("Title", Title),
            ("Content", Content),
        )
        queryBody = self._build_query(filters, volume)
        return self.es.search(index=self.indexName, body=queryBody)

    def format_search(self, result):
        '''
        Reduce a raw search response to a list of plain field dicts.

        result: search response; ``result['hits']['hits']`` is read as a list
            of hits, each carrying its document under ``'_source'``.

        Returns one dict per hit containing exactly the keys listed in
        ``_SOURCE_FIELDS``, in that order. A field absent from a document is
        returned as ``None`` instead of raising ``KeyError``.
        '''
        fields = self._SOURCE_FIELDS
        return [
            {key: hit['_source'].get(key) for key in fields}
            for hit in result['hits']['hits']
        ]

    def Retrieval(self, Actor1Geo_FullName, Actor2Geo_FullName, ActionGeo_FullName,MediaGeo,Title,Content,volume):
        '''
        Search and format in one step.

        Takes the same arguments as ``do_search`` and returns the list of
        dicts produced by ``format_search``.
        '''
        result = self.do_search(Actor1Geo_FullName, Actor2Geo_FullName, ActionGeo_FullName,MediaGeo,Title,Content,volume)
        return self.format_search(result)

In [42]:
# Build the retrieval object against the Elasticsearch node used in this notebook.
eee = esNewsRetrieval(host='10.8.128.205', port=49200)

In [43]:
# Example query: only Title carries a value, every other filter is passed as
# an empty string, and up to 100 hits are requested.
result = eee.Retrieval(
    Actor1Geo_FullName='',
    Actor2Geo_FullName='',
    ActionGeo_FullName='',
    MediaGeo='',
    Title='china',
    Content='',
    volume=100,
)

In [None]:
# Bare expression so the notebook cell displays the retrieved documents.
result