# 将检索模块和文本聚类模块进行封装

In [5]:
import time
import random
import os
import re
import pandas as pd
import numpy as np
from datetime import datetime
from elasticsearch import Elasticsearch

In [6]:
import mysql.connector

# ES检索模块重构为一个对象

In [21]:
class esWeiboTweetRetrieval():
    '''
    Elasticsearch-backed retrieval of Weibo tweets.

    The class is a singleton: every construction returns the same object.
    The Elasticsearch client is built on the first construction and is only
    rebuilt when a later construction passes a different host/port.
    '''
    _instance = None
    # True until __init__ has built the client once on the shared instance.
    _first_init = True

    # Fields copied from each hit's `_source` into the formatted result,
    # in this order.
    _TARGET_KEYS = (
        'W_ID',
        'W_weibo_id',
        'W_search_query',
        'W_nick_name',
        'W_weibo_url',
        'W_created_at',
        'W_like_num',
        'W_repost_num',
        'W_comment_num',
        'W_content',
        'W_user_id',
        'W_image_url',
        'W_video_url',
        'W_tool',
        'W_location',
        'W_location_map_info',
        'W_origin_weibo_url',
        'W_origin_weibo_content',
        'W_crawl_time',
    )

    def __new__(cls, *args, **kw):
        # Constructor arguments are consumed by __init__; __new__ only
        # guarantees that a single instance exists.
        if cls._instance is None:
            cls._instance = super(esWeiboTweetRetrieval, cls).__new__(cls)
        return cls._instance

    def __init__(self, host, port):
        '''
        Connect to Elasticsearch and select the tweet index.

        host: value passed as `hosts` to the Elasticsearch client.
        port: value passed as `port` to the Elasticsearch client.
        '''
        super(esWeiboTweetRetrieval, self).__init__()
        conn_params = (host, port)
        # __init__ runs on the shared instance for every construction; skip
        # rebuilding the client when it already targets the same host/port.
        if not self._first_init and self._conn_params == conn_params:
            return
        self.es = Elasticsearch(hosts=host, port=port, timeout=30, max_retries=10, retry_on_timeout=True)
        self.indexName = 'weibo-tweet-index'
        self._conn_params = conn_params
        self._first_init = False

    def do_search(self, W_search_query, W_content, volume):
        '''
        Run the search against the tweet index and return the raw response.

        W_search_query: query string matched against the `W_search_query` field.
        W_content: query string matched against the `W_content` field.
        volume: value sent as the request's `size`.

        Both clauses are `should` clauses of a single bool query.
        '''
        queryBody = {
            "query": {
                "bool": {
                    "must": [],
                    "must_not": [],
                    "should": [
                        {
                            "query_string": {
                                "default_field": "W_content",
                                "query": W_content,
                            }
                        },
                        {
                            "query_string": {
                                "default_field": "W_search_query",
                                "query": W_search_query,
                            }
                        },
                    ],
                }
            },
            "from": 0,
            "size": volume,
            "sort": [],
            "aggs": {},
        }
        return self.es.search(index=self.indexName, body=queryBody)

    def format_search(self, result):
        '''
        Reduce a raw search response to a list of flat dicts.

        result: search response; `result['hits']['hits']` must be a list of
            hits, each carrying a `_source` dict.

        Returns one dict per hit containing exactly the keys in
        `_TARGET_KEYS`. A field absent from a hit's `_source` is returned
        as None so that one incomplete document does not abort the whole
        result set.
        '''
        sources = (hit['_source'] for hit in result['hits']['hits'])
        return [
            {key: source.get(key) for key in self._TARGET_KEYS}
            for source in sources
        ]

    def Retrieval(self, W_search_query, W_content, volume):
        '''
        Search the tweet index and return the formatted documents.

        Runs do_search with the given arguments and passes the response
        through format_search.
        '''
        result = self.do_search(W_search_query, W_content, volume)
        return self.format_search(result)

In [22]:
# Build the retrieval client against the Elasticsearch node at 10.8.128.205:49200.
eee = esWeiboTweetRetrieval(
    host='10.8.128.205',
    port=49200,
)

In [23]:
# Search the tweet index; `volume` is sent as the request size, so at most
# 100 formatted documents come back.
result = eee.Retrieval(
    W_search_query='#垃圾分类#',
    W_content='北京',
    volume=100,
)

In [24]:
# Show the formatted documents as the cell output.
result

[{'W_ID': 521,
  'W_weibo_id': '1606087470_AtvCJ6J0Y',
  'W_search_query': '#垃圾分类#',
  'W_nick_name': '毛达1977',
  'W_weibo_url': 'https://weibo.com/1606087470/AtvCJ6J0Y',
  'W_created_at': '2014-01-24T18:15:10',
  'W_like_num': 5,
  'W_repost_num': 400,
  'W_comment_num': 165,
  'W_content': '#垃圾分类# 北京市法制宣传教育领导小组办公室、北京市司法局：“不分类 环境恶化、资源浪费”，“厨余垃圾→堆肥”。很想问问：现在北京厨余垃圾多少堆肥了？效率、效果、去向如何？能不能将混合垃圾和较纯厨余分开堆肥，对比一下？@北京市市政市容委@卫潘明@张红樱@立雯nu@垃圾战斗机FON@冯永锋',
  'W_user_id': '1606087470',
  'W_image_url': 'None',
  'W_video_url': 'None',
  'W_tool': '微博 weibo.com',
  'W_location': 'None',
  'W_location_map_info': 'None',
  'W_origin_weibo_url': 'None',
  'W_origin_weibo_content': 'None',
  'W_crawl_time': '1970-01-01T00:00:02'},
 {'W_ID': 3922,
  'W_weibo_id': '6602407876_HzFqUCQqL',
  'W_search_query': '#垃圾分类#',
  'W_nick_name': '王希希ss',
  'W_weibo_url': 'https://weibo.com/6602407876/HzFqUCQqL',
  'W_created_at': '2019-06-19T21:25:00',
  'W_like_num': 13,
  'W_repost_num': 11,
  'W_comment_num': 2,
  'W_con

# 客户端对于后端数据接口的访问

In [48]:
import requests

# Parameters for the backend search endpoint; they are sent as URL query
# parameters (see `params=` below), not as a request body.
body = {
    'wordQuery': '我',
    'textQuery': '垃圾分类',
    'volume': 1000,
}

baseUrl = 'http://10.8.128.205:29280/Lawbda/dataWare/1.0.0/weibo/search'

# requests applies no timeout by default, so an unresponsive backend would
# block this cell forever; bound the wait explicitly.
docs = requests.get(baseUrl, params=body, timeout=30)
# Fail here on a 4xx/5xx answer instead of later while decoding the body.
docs.raise_for_status()

In [49]:
# Decode the response body as JSON and show its first element.
docs.json()[0]

{'C_ID': 70125,
 'C_comment_id': 'C_4391845553021738',
 'C_comment_user_id': '6079113604',
 'C_comment_user_nick_name': '安心的温柔866',
 'C_content': '#杨洋 垃圾分类# #垃圾分类挑战#  杨洋 垃圾分类  #杨洋[超话]# @杨洋',
 'C_crawl_time': '1970-01-01T00:00:02',
 'C_created_at': '2019-07-08T18:17:00',
 'C_like_num': 1,
 'C_weibo_url': 'https://weibo.com/1625035922/HCwSG4b6Q'}