### 文本特征工程
0. 数据源：舆情事件（新闻、微信）、词向量（搜狗新闻）
1. 获取样本事件ID、事件名、情感分数
2. 对事件名进行分词、清洗
3. 建立词向量特征（n * maxlen * k ）
4. 建立词性特征 （n * maxlen * 1）
5. 建立词长特征 （n * maxlen * 1）
6. 建立负向词位置、数量特征 （n * 2）
7. 建立正向词位置、数量特征 （n * 2）

In [1]:
import pandas as pd
import numpy as np
import jieba
import bz2
import pymysql
import jieba.posseg as pseg
import collections
from functools import *

In [2]:
# dev
MYSQL_IP = '10.248.224.3'
MYSQL_PORT = 11202
MYSQL_USER = 'bigdata'
MYSQL_PSW = '3jHj8qid0ZxXn18'
MYSQL_DB = 'dmall_public_opinion'

In [3]:
def get_data_by_sql(sql):
    """
    通过sql获取数据
    :param sql:
    :return:
    """
    db = pymysql.connect(host=MYSQL_IP, port=MYSQL_PORT, user=MYSQL_USER, passwd=MYSQL_PSW,
                         db=MYSQL_DB, charset='utf8')
    return pd.read_sql(sql, con=db)


def execute_sqls(sqls):
    """
    执行sqls中的sql
    :param sqls:sql list
    :return:
    """
    db = pymysql.connect(host=MYSQL_IP, port=MYSQL_PORT, user=MYSQL_USER, passwd=MYSQL_PSW,
                         db=MYSQL_DB, charset='utf8mb4')
    cursor = db.cursor()
    count = 0
    try:
        for sql in sqls:
            # print(sql)
            cursor.execute(sql)
            count += 1
            if count == 50:
                db.commit()
                count = 0
        db.commit()
    except:
        raise Exception("SQL Execute Error %s" % sqls)
    finally:
        if not db._closed:
            db.close()


def add_user_words(user_dict):
    """
    添加用户词典
    :param user_dict:
    :return:
    """
    for word, tag in user_dict.items():
        jieba.add_word(word, freq=1000000, tag=tag)

def cut_word(text):
    """
    文本分词
    :param text:
    :return:
    """
    words = []
    words_flag=[]
    words_length=[]
    for item in pseg.cut(text,HMM=False):  
        word=item.word
        flag=item.flag
        if word in stop_dict or flag in ['x','eng']:
            continue
        words.append(word)
        words_flag.append(flag)
        words_length.append(len(word))
    return words,words_flag,words_length,len(words)

def get_dict(data,col_name):
    """
    获取数据集词典
    :param sql:
    :return:
    """
    all_value = reduce(lambda x, y: x + y,data[col_name])
    freq_sorted = sorted(collections.Counter(all_value).items(),
                               key=lambda item: item[1],
                               reverse=True)
    res_list = list(map(lambda item: item[0], freq_sorted))
    
    return res_list




In [4]:
# 全局变量
## 分词分析停用词、用户词典
stop_dict = []
user_dict = {"肿么了": "user"}
add_user_words(user_dict)

Building prefix dict from D:\Miniconda3\envs\python37\lib\site-packages\jieba\dict.txt ...
Dumping model to file cache C:\Users\hanyu\AppData\Local\Temp\jieba.cache
Loading model cost 1.4770698547363281 seconds.
Prefix dict has been built succesfully.


In [29]:
# 0. 数据源：舆情事件（新闻、微信）
sql="select event_id,event_name,sentiment,positive_score,other_score,negative_score,modified from event_sentiment where media_type=1 and sentiment is not null"

# 1. 获取样本事件ID、事件名、情感分数
data=get_data_by_sql(sql)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69412 entries, 0 to 69411
Data columns (total 7 columns):
event_id          69412 non-null object
event_name        69412 non-null object
sentiment         69412 non-null int64
positive_score    69412 non-null float64
other_score       69412 non-null float64
negative_score    69412 non-null float64
modified          69412 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(3), int64(1), object(2)
memory usage: 3.7+ MB


In [30]:
data['is_fixed']=0

In [31]:
data.iloc[idx].is_fixed=1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [32]:
data.is_fixed[(data.modified<'2019-11-21') & (data.sentiment==-1)]=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [33]:
data[data.is_fixed==1]

Unnamed: 0,event_id,event_name,sentiment,positive_score,other_score,negative_score,modified,is_fixed
6,B45F05B4DE4BEFCE6A1374E06406ED67,今天股市给散户上了生动的一课，后市区块链还能玩吗！,-1,0.000130,0.001573,0.998297,2019-11-19 17:20:21,1
7,581CADC35E3C486B1CD785FCA6D8AE2C,区块链为何无法拯救股市？,-1,0.000045,0.000158,0.999798,2019-11-19 17:20:21,1
9,556F926885B5A1A221EA1DAE4D1F8C98,英国股市：英国石油公司利润大降40% 拟出售100亿美元非核心资产,-1,0.001197,0.994967,0.003836,2019-11-19 17:20:21,1
10,EAF3D5A29B84571F5871CBF1BAB14E0F,美股新高，A股逆势下跌，区块链尾盘炸板！这是在演股市惊魂？,-1,0.391428,0.511699,0.096872,2019-11-19 17:20:21,1
11,8E58C14EA5F0E3494A1754C14CD00E59,10.30股市要闻：又1股爆雷！3家新股可申购，13股披露重要公告,-1,0.001308,0.021709,0.976982,2019-11-19 17:20:21,1
...,...,...,...,...,...,...,...,...
14849,1DD0AB14C2C12AD3D0B796413891E779,这个人比李斌更惨？！,-1,0.000044,0.000173,0.999784,2019-11-20 13:59:46,1
14860,AA29A576C06A479F3DA9AED3F457A8D3,NBA官方承认，漏判詹皇湖人2犯规，勒布朗赢球靠裁判？对手抗议,-1,0.036327,0.395167,0.568505,2019-11-20 13:59:46,1
14875,7E31D6D1C5F53E626048F089850B6CA7,中国球员真是没有人了，CBA本土球星排名出炉，35岁老将还是第一,-1,0.003896,0.066965,0.929139,2019-11-20 13:59:46,1
14876,BCE2B986AC7B3FDBE43CD2F4FAA7BFBA,联盟球员不满莫雷：他引起了事应该由他结束,-1,0.000631,0.009332,0.990037,2019-11-20 13:59:46,1


In [34]:
data.to_csv("data.csv", header=True, encoding='utf_8_sig', index=None)

In [7]:
# 2. 对事件名进行分词、清洗
# 3. 建立词向量特征（n * maxlen * k ）
# 4. 建立词性特征 （n * maxlen * 1）
# 5. 建立词长特征 （n * maxlen * 1）
words_with_flag=data.event_name.apply(lambda item:cut_word(item))
data['words']=words_with_flag.apply(lambda item:item[0])
data['words_flag']=words_with_flag.apply(lambda item:item[1])
data['words_length']=words_with_flag.apply(lambda item:item[2])
data['length']=words_with_flag.apply(lambda item:item[3])

In [None]:
# 6. 建立负向词位置、数量特征 （n * 2）
# 7. 建立正向词位置、数量特征 （n * 2）

In [8]:
# 获取词典
print("date length : %s"%len(data))
words_dict_list = get_dict(data,'words')
print("total words count : %s"%len(words_dict_list))
# 保存词典
np.save('words_dict',words_dict_list)

# 获取词频字典
words_flag_dict_list = get_dict(data,'words_flag')
print("total words_flag count : %s"%len(words_flag_dict_list))
# 保存词频字典
np.save('words_flag_dict',words_flag_dict_list)

date length : 69412


KeyboardInterrupt: 

In [25]:
# 数据集中词映射为词典编号
data['words_index'] = data.words.apply(
    lambda item:list(map( lambda word: words_dict_list.index(word),item)))

In [29]:
# 数据集中词映射为词典编号
data['words_flag_index'] = data.words_flag.apply(
    lambda item:list(map( lambda flag: words_flag_dict_list.index(flag),item)))

In [30]:
# 保存样本
data[['event_id','words','words_index','words_flag','words_flag_index','words_length','length','sentiment']].to_csv("data_featured.csv", header=True, encoding='utf_8_sig', index=None)

In [None]:
# 3. 根据样本词集合载入词向量
# 文件格式（词+向量）： word 1 2 3 4
# 词向量词典
embedding_matrix = np.zeros((len(words_dict_list), 300))
word_vector_count = 0
is_first_line = True
file = bz2.open('sgns.sogounews.bigram-char.bz2', mode='r')
for line in file:
    values = line.split()
    if is_first_line:
        print("word count : %s" % values[0])
        print("vector size : %s" % values[1])
        is_first_line = False
        continue
    word = values[0].decode('utf-8')
    if word in words_dict_list:
        index = words_dict_list.index(word)
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_matrix[index] = coefs
        word_vector_count = word_vector_count + 1

file.close()
print('Found %s word vectors.' % word_vector_count)

In [None]:
np.save('embedding_matrix',embedding_matrix)