In [7]:
from snownlp import SnowNLP
import jieba
import pymysql
from jieba import analyse
import jieba.analyse
import numpy as np

In [8]:
from elasticsearch import Elasticsearch

In [9]:
import requests

In [11]:
import os

In [10]:
class esWeiboCommentRetrieval():
    '''
    Elasticsearch-backed full-text retrieval of Weibo comments.

    The class is a singleton: ``__new__`` always hands back the same object,
    so every part of the program shares one retriever (and one ES client).
    Queries run against the ``weibo-comment-index`` index and match on the
    ``C_content`` field.
    '''
    _instance = None
    # True until the first Elasticsearch client has been created.
    _first_init = True

    # Fields copied from each hit's ``_source`` into the formatted result,
    # in the order they appear in the returned dicts.
    _TARGET_KEYS = (
        'C_ID', 'C_comment_id', 'C_comment_user_id', 'C_comment_user_nick_name',
        'C_content', 'C_weibo_url', 'C_like_num', 'C_created_at', 'C_crawl_time',
    )

    def __new__(cls, *args, **kw):
        '''Return the shared instance, creating it on first use.'''
        if cls._instance is None:
            cls._instance = super(esWeiboCommentRetrieval, cls).__new__(cls)
        return cls._instance

    def __init__(self, host, port):
        '''
        Connect the shared retriever to an Elasticsearch node.

        host -- value passed to ``Elasticsearch(hosts=...)``
        port -- value passed to ``Elasticsearch(port=...)``

        ``__init__`` runs on every ``esWeiboCommentRetrieval(...)`` call even
        though the object is a singleton. The existing client is reused when
        the requested endpoint is unchanged, so repeated construction does not
        keep allocating new clients; a different endpoint still replaces the
        client, as before.
        '''
        super(esWeiboCommentRetrieval, self).__init__()
        self.indexName = 'weibo-comment-index'
        endpoint = (host, port)
        if getattr(self, 'es', None) is not None and getattr(self, '_endpoint', None) == endpoint:
            return
        self.es = Elasticsearch(hosts=host, port=port, timeout=30, max_retries=10, retry_on_timeout=True)
        self._endpoint = endpoint
        type(self)._first_init = False

    def do_search(self, wordQuery, textQuery, volume):
        '''
        Run the raw Elasticsearch query and return the unprocessed response.

        wordQuery -- currently unused; kept in the signature for callers. It
                     was meant to carry the term used to find the parent
                     Weibo post, but only comment data is searched for now.
        textQuery -- text matched against the ``C_content`` field
        volume    -- maximum number of hits to return (ES ``size``)
        '''
        queryBody = {
          "query": {
            "match": {
              "C_content": textQuery
            }
          },
          "from": 0,
          "size": volume,
          "sort": [],
          "aggs": {}
        }
        return self.es.search(index=self.indexName, body=queryBody)

    def format_search(self, result):
        '''
        Reduce a raw ES response to a list of plain comment dicts.

        result -- response whose ``result['hits']['hits']`` is a list of hits,
                  each carrying its document under ``'_source'``

        Returns one dict per hit containing exactly the ``_TARGET_KEYS``
        fields. A field absent from a document is reported as ``None``
        instead of raising ``KeyError``, so one incomplete document cannot
        abort the whole result page. Any other ``_source`` field is dropped.
        '''
        hits = result['hits']['hits']
        return [
            {key: hit['_source'].get(key) for key in self._TARGET_KEYS}
            for hit in hits
        ]

    def Retrieval(self, wordQuery, textQuery, volume):
        '''Search and format in one step; see do_search and format_search.'''
        result = self.do_search(wordQuery, textQuery, volume)
        return self.format_search(result)

In [1]:
# Query the deployed Weibo search HTTP endpoint with the same parameters the
# retrieval class accepts (wordQuery / textQuery / volume).
body = {'wordQuery':'',
        'textQuery':'垃圾分类',
        'volume':1000}
baseUrl = 'http://10.8.128.205:29280/Lawbda/dataWare/1.0.0/weibo/search'
# NOTE: `docs` holds the raw requests.Response, not the decoded documents.
# The timeout keeps the cell from hanging forever if the service is unreachable.
docs = requests.get(baseUrl, params=body, timeout=30)

In [32]:
class WeiboEventAnalysis:
    '''
    Keyword and sentiment summary for Weibo comments matching a search term.

    Pipeline (see SearchNAnalysis): retrieve comments from Elasticsearch,
    segment them with jieba, keep tokens that pass a part-of-speech filter,
    rank the result with TextRank, and classify each kept keyword with
    SnowNLP to obtain positive/negative shares.
    '''
    # jieba user dictionaries, resolved relative to DICT_DIR.
    DICT_DIR = './'
    DICT_FILES = ("SogouLabDic.txt", "dict_baidu_utf8.txt", "dict_pangu.txt",
                  "dict_sougou_utf8.txt", "dict_tencent_utf8.txt", "my_dict.txt")
    # One stop word per line, UTF-8.
    STOPWORD_FILE = 'Stopword.txt'
    # Part-of-speech tags accepted by the keyword extraction step.
    ALLOWED_POS = ('ns', 'nr', 'nt', 'nz', 'nl', 'n', 'vn', 'vd', 'vg', 'v', 'vf', 'a', 'an', 'i')

    # Process-wide caches: the dictionaries and stop words are loaded once
    # rather than on every get_event_words call.
    _dicts_loaded = False
    _stopwords = None

    def __init__(self, host='10.8.128.205', port=49200):
        '''
        host, port -- Elasticsearch endpoint handed to esWeiboCommentRetrieval;
                      the defaults are the values that used to be hard-coded.
        '''
        self.es = esWeiboCommentRetrieval(host=host, port=port)

    def do_search(self, kword, volume=1000):
        '''Return up to `volume` formatted comment dicts whose content matches `kword`.'''
        return self.es.Retrieval(wordQuery=kword, textQuery=kword, volume=volume)

    def _load_resources(self):
        '''Load the jieba user dictionaries and the stop-word set once; return the stop words.'''
        cls = type(self)
        if not cls._dicts_loaded:
            for fname in self.DICT_FILES:
                jieba.load_userdict(os.path.join(self.DICT_DIR, fname))
            cls._dicts_loaded = True
        if cls._stopwords is None:
            # `with` closes the file handle, which the previous bare open() leaked.
            with open(self.STOPWORD_FILE, 'r', encoding='utf-8') as handle:
                cls._stopwords = {line.rstrip() for line in handle}
        return cls._stopwords

    def get_event_words(self, docs):
        '''
        Turn retrieved comments into a flat list of candidate keywords.

        docs -- iterable of dicts; the text is read from the 'C_content' key

        Each comment is segmented with jieba and stop words are removed.
        Every remaining token is then passed on its own through
        ``analyse.extract_tags`` with ALLOWED_POS, which acts as a
        part-of-speech filter. Token order and repetitions are kept, so
        frequent words appear many times in the result.

        Comments whose content is missing, None or empty are skipped instead
        of being handed to jieba.
        '''
        stopwords = self._load_resources()

        tokens = []
        for doc in docs:
            content = doc.get('C_content')
            if not content:
                continue
            tokens.extend(tok for tok in jieba.cut(content) if tok not in stopwords)

        keywords_result = []
        for token in tokens:
            keywords_result.extend(analyse.extract_tags(token, allowPOS=self.ALLOWED_POS))
        return keywords_result

    def keywords_TextRank(self, keywords):
        '''
        Rank keywords with jieba's TextRank.

        keywords -- list of keyword strings; they are concatenated without a
                    separator before ranking

        Returns (top50, top10): two dicts mapping word -> weight, the second
        holding the first ten entries of the first.
        '''
        if not keywords:
            # Nothing to rank; avoid calling TextRank on an empty string.
            return {}, {}
        merged = ''.join(keywords)
        ranked = jieba.analyse.textrank(merged, topK=50, withWeight=True)
        w_keywords_50 = {word: weight for word, weight in ranked}
        w_keywords_10 = {word: weight for word, weight in ranked[:10]}
        return w_keywords_50, w_keywords_10

    def get_sentiment_percentage(self, keywords):
        '''
        Share of keywords SnowNLP scores as positive vs. negative.

        keywords -- iterable of strings, each scored individually

        A score >= 0.5 counts as positive, a score < 0.5 as negative.
        Returns {'积极': positive_share, '消极': negative_share}. When nothing
        was classified (for example, an empty keyword list) both shares are
        0.0 instead of raising ZeroDivisionError.
        '''
        pos_count = 0
        neg_count = 0
        for word in keywords:
            score = SnowNLP(word).sentiments
            if score >= 0.5:
                pos_count += 1
            elif score < 0.5:
                neg_count += 1
        total = pos_count + neg_count
        if total == 0:
            return {'积极': 0.0, '消极': 0.0}
        return {'积极': pos_count / total, '消极': neg_count / total}

    def post_process(self, word_dict):
        '''Convert {name: value} into a list of {'name': ..., 'value': ...} records.'''
        return [{'name': name, 'value': value} for name, value in word_dict.items()]

    def SearchNAnalysis(self, kword):
        '''
        Run the whole pipeline for one search term.

        Returns a dict with three lists of name/value records:
        'kwordTop50Cloud', 'kwordTop10Bar' and 'sentimentPie'.
        '''
        docs = self.do_search(kword)
        keywords_result = self.get_event_words(docs)
        w_keywords_50, w_keywords_10 = self.keywords_TextRank(keywords_result)
        fracs = self.get_sentiment_percentage(keywords_result)
        response = {
            'kwordTop50Cloud': w_keywords_50,
            'kwordTop10Bar': w_keywords_10,
            'sentimentPie': fracs
        }
        return {section: self.post_process(values) for section, values in response.items()}

In [33]:
# Instantiate the analysis pipeline; its constructor creates the
# Elasticsearch-backed comment retriever.
ppp = WeiboEventAnalysis()

In [34]:
# Run search + keyword ranking + sentiment shares for the query term; the
# returned dict of name/value records is shown as the cell output.
ppp.SearchNAnalysis('垃圾分类')

{'kwordTop50Cloud': [{'name': '垃圾', 'value': 1.0},
  {'name': '分类', 'value': 0.7886273276514632},
  {'name': '洋垃圾', 'value': 0.18072583730500644},
  {'name': '超话', 'value': 0.04908631579908009},
  {'name': '垃圾桶', 'value': 0.04870221337287133},
  {'name': '哥哥', 'value': 0.03379394192200975},
  {'name': '回收', 'value': 0.025470424038297725},
  {'name': '跟着', 'value': 0.024755504905345397},
  {'name': '环境', 'value': 0.023755389264296333},
  {'name': '小心', 'value': 0.02371932582171862},
  {'name': '评论', 'value': 0.022764172476962156},
  {'name': '同款', 'value': 0.02242245495460441},
  {'name': '微笑', 'value': 0.021680423024325872},
  {'name': '老师', 'value': 0.021526182222226076},
  {'name': '中国', 'value': 0.020987584078350194},
  {'name': '鼓掌', 'value': 0.018942967718809357},
  {'name': '南京', 'value': 0.018919587660372456},
  {'name': '投放', 'value': 0.018676956886822773},
  {'name': '口红', 'value': 0.018146923935315768},
  {'name': '配图', 'value': 0.017700255821726477},
  {'name': '上海', 'value'