# 将检索模块和文本聚类模块进行封装

In [1]:
import time
import random
import os
import re
import pandas as pd
import numpy as np
from datetime import datetime
from elasticsearch import Elasticsearch

In [2]:
import mysql.connector

# ES检索模块重构为一个对象

In [3]:
class esWeiboUserRetrieval():
    '''
    Look up Weibo user profiles in Elasticsearch by nickname.

    The class is a singleton: ``__new__`` always returns the same instance.
    ``__init__`` still runs on every construction, so constructing the class
    again replaces the Elasticsearch client held by that shared instance.
    '''
    _instance = None
    # Kept for backward compatibility; nothing in this class reads it.
    _first_init = True

    # Fields copied from each hit's ``_source`` into the formatted result,
    # in output order. The spellings are the stored field names and must not
    # be "corrected" here.
    _TARGET_KEYS = (
        'U_ID', 'U_user_id', 'U_name', 'U_nick_name', 'U_location', 'U_sex',
        'U_sexual_orientation', 'U_Emotional_status', 'U_Birthday',
        'U_constellation', 'U_crawl_time',
        'U_Blood', 'U_own_bolg_address_name', 'U_brief_introduction',
        'U_Registration_time', 'U_QQ', 'U_email', 'U_MSN',
        'U_Work', 'U_educational', 'U_fans_Number', 'U_blog_Number',
        'U_credit', 'U_Head_Photo', 'U_senti', 'U_big5', 'U_big5_comment',
        'U_concerns_Number', 'U_tag',
    )

    def __new__(cls, *args, **kw):
        # Create the shared instance on first use, then keep handing it back.
        if cls._instance is None:
            cls._instance = super(esWeiboUserRetrieval, cls).__new__(cls)
        return cls._instance

    def __init__(self, host, port):
        '''
        Connect to Elasticsearch and select the Weibo user index.

        host: value passed to the client as ``hosts``.
        port: value passed to the client as ``port``.
        '''
        super(esWeiboUserRetrieval, self).__init__()
        self.es = Elasticsearch(hosts=host, port=port, timeout=30, max_retries=10, retry_on_timeout=True)
        self.indexName = 'weibo-user-index'

    def do_search(self, U_nick_name, volume):
        '''
        Run the nickname query and return the raw search response.

        U_nick_name: query text, matched against the ``U_nick_name`` field
            through a ``query_string`` clause.
        volume: number of hits requested (the ``size`` of the query).
        '''
        queryBody = {
          "query": {
            "bool": {
              "must": [
                {
                  "query_string": {
                    "default_field": "U_nick_name",
                    "query": U_nick_name
                  }
                }
              ],
              "must_not": [],
              "should": []
            }
          },
          "from": 0,
          "size": volume,
          "sort": [],
          "aggs": {}
        }
        return self.es.search(index=self.indexName, body=queryBody)

    def format_search(self, result):
        '''
        Reduce a raw search response to a list of plain profile dicts.

        result: search response; ``result['hits']['hits']`` is read as a list
            of hits, each carrying its document under ``'_source'``.

        Returns one dict per hit containing exactly the fields listed in
        ``_TARGET_KEYS``, in that order. A field absent from a document is
        returned as ``None`` so that one incomplete profile does not abort
        the whole result set.
        '''
        sources = (hit['_source'] for hit in result['hits']['hits'])
        return [
            {key: source.get(key) for key in self._TARGET_KEYS}
            for source in sources
        ]

    def Retrieval(self, U_nick_name, volume):
        '''
        Search by nickname and return the formatted profile dicts.

        Combines ``do_search`` and ``format_search``; the arguments are
        passed through to ``do_search`` unchanged.
        '''
        result = self.do_search(U_nick_name, volume)
        return self.format_search(result)

In [4]:
# Build the retrieval client against the Elasticsearch node used in this notebook.
eee = esWeiboUserRetrieval(host='10.8.128.205', port=49200)

In [5]:
# Request up to 100 user records matching the nickname query '鸿飞'.
result = eee.Retrieval(U_nick_name='鸿飞', volume=100)

In [9]:
# Scratch value: the result list repeated twice.
pp = result * 2

In [11]:
# Print the nickname of every user record returned by the search.
for i in result:
    print(i['U_nick_name'])

-林鸿飞-


# 客户端对于后端数据接口的访问

In [48]:
import requests

# Query parameters sent to the Weibo search endpoint.
body = {'wordQuery': '我',
        'textQuery': '垃圾分类',
        'volume': 1000}

baseUrl = 'http://10.8.128.205:29280/Lawbda/dataWare/1.0.0/weibo/search'

# requests applies no timeout by default, so an unresponsive server would
# block this cell forever; fail after 30 seconds instead.
docs = requests.get(baseUrl, params=body, timeout=30)

In [49]:
# Decode the response body as JSON and display its first element.
docs.json()[0]

{'C_ID': 70125,
 'C_comment_id': 'C_4391845553021738',
 'C_comment_user_id': '6079113604',
 'C_comment_user_nick_name': '安心的温柔866',
 'C_content': '#杨洋 垃圾分类# #垃圾分类挑战#  杨洋 垃圾分类  #杨洋[超话]# @杨洋',
 'C_crawl_time': '1970-01-01T00:00:02',
 'C_created_at': '2019-07-08T18:17:00',
 'C_like_num': 1,
 'C_weibo_url': 'https://weibo.com/1625035922/HCwSG4b6Q'}