In [1]:
import time
import random
import os
import re
import pandas as pd
import numpy as np
from datetime import datetime
from elasticsearch import Elasticsearch
import mysql.connector

In [2]:
class esWeiboUserRetrieval():
    '''
    Elasticsearch-backed lookup of Weibo user records by nickname.

    The class behaves as a singleton: ``__new__`` always returns the same
    object. ``__init__`` still runs on every construction, so constructing it
    again with a different host/port replaces the client held by the shared
    instance.
    '''
    _instance = None
    # Not read anywhere in this class; retained for backward compatibility.
    _first_init = True

    # Fields copied from each hit's ``_source`` into the formatted result,
    # in output order.
    _TARGET_KEYS = (
        'U_ID', 'U_user_id', 'U_name', 'U_nick_name', 'U_location', 'U_sex',
        'U_sexual_orientation', 'U_Emotional_status', 'U_Birthday',
        'U_constellation', 'U_crawl_time',
        'U_Blood', 'U_own_bolg_address_name', 'U_brief_introduction',
        'U_Registration_time', 'U_QQ', 'U_email', 'U_MSN',
        'U_Work', 'U_educational', 'U_fans_Number', 'U_blog_Number',
        'U_credit', 'U_Head_Photo', 'U_senti', 'U_big5', 'U_big5_comment',
        'U_concerns_Number', 'U_tag',
    )

    def __new__(cls, *args, **kw):
        # Create the shared instance once and hand it back on every later call.
        if cls._instance is None:
            cls._instance = super(esWeiboUserRetrieval, cls).__new__(cls)
        return cls._instance

    def __init__(self, host, port):
        '''
        Create the Elasticsearch client and select the index to query.

        host: Elasticsearch host, passed through as ``hosts``.
        port: Elasticsearch port.
        '''
        super(esWeiboUserRetrieval, self).__init__()
        self.es = Elasticsearch(hosts=host, port=port, timeout=30, max_retries=10, retry_on_timeout=True)
        self.indexName = 'weibo-user-index'

    def do_search(self, U_nick_name, volume):
        '''
        Run a ``query_string`` search against the ``U_nick_name`` field.

        U_nick_name: query text; it is handed to ``query_string`` unchanged.
        volume: maximum number of hits to return (the request ``size``).

        Returns the raw response of ``self.es.search``.
        '''
        queryBody = {
          "query": {
            "bool": {
              "must": [
                {
                  "query_string": {
                    "default_field": "U_nick_name",
                    "query": U_nick_name
                  }
                }
              ],
              "must_not": [],
              "should": []
            }
          },
          "from": 0,
          "size": volume,
          "sort": [],
          "aggs": {}
        }
        return self.es.search(index=self.indexName, body=queryBody)

    def format_search(self, result):
        '''
        Reduce a raw search response to a list of flat user dicts.

        result: search response; ``result['hits']['hits']`` must be a list of
            hits, each carrying a ``_source`` dict.

        Returns one dict per hit holding exactly the fields in
        ``_TARGET_KEYS``. A field absent from a hit's ``_source`` is returned
        as ``None`` so that one incomplete record does not abort the whole
        result set.
        '''
        return [
            {key: hit['_source'].get(key) for key in self._TARGET_KEYS}
            for hit in result['hits']['hits']
        ]

    def Retrieval(self, U_nick_name, volume):
        '''
        Search by nickname and return the formatted user dicts.

        Chains ``do_search`` and ``format_search``; see those methods for the
        meaning of the arguments and the shape of the result.
        '''
        return self.format_search(self.do_search(U_nick_name, volume))

In [3]:
# Build the retriever, then fetch up to 100 user records whose nickname
# matches the query '飞'.
# NOTE(review): host and port are hard-coded; consider moving them to a config cell.
eee = esWeiboUserRetrieval(host='10.8.128.205',port=49200,)
result= eee.Retrieval(U_nick_name='飞',volume=100)

In [4]:
def Retrieve_page(user_list, fields=("U_nick_name", "U_blog_Number", "U_fans_Number",
                                     "U_concerns_Number", "U_tag")):
    """Project each user record onto the fields shown on a results page.

    user_list: iterable of user dicts; each must contain every name in
        ``fields``.
    fields: names to keep, in output order. The default is the five summary
        fields this function has always returned.

    Returns a new list with one dict per input record; the input records are
    not modified. Raises ``KeyError`` if a record lacks a requested field.
    """
    return [{field: user[field] for field in fields} for user in user_list]

In [5]:
# Reduce the full user records to the summary fields returned by Retrieve_page.
a = Retrieve_page(result)

In [6]:
# Display the summarised user records.
a

[{'U_nick_name': '-林鸿飞-',
  'U_blog_Number': 7751,
  'U_fans_Number': 11146,
  'U_concerns_Number': 601,
  'U_tag': '                                     邓丽君                                                                         西藏旅游                                                                         社会计算                                                                         社交网络                                                                         瑞士                                                                         爱琴海                                                                         加德满都                                                                         古典诗歌                                                                         信息检索                                                                         意见挖掘                                    '}]

In [7]:
# `result*2` repeats the record list, so every user appears twice in `a`.
# NOTE(review): looks like a scratch experiment; the intent is not evident from this cell.
a = Retrieve_page(result*2)