### 文本特征工程
0. 数据源：舆情事件（新闻、微信）、词向量（搜狗新闻）
1. 获取样本事件ID、事件名、情感分数
2. 对事件名进行分词、清洗
3. 建立词向量特征（n * maxlen * k ）
4. 建立词性特征 （n * maxlen * 1）
5. 建立词长特征 （n * maxlen * 1）
6. 建立负向词位置、数量特征 （n * 2）
7. 建立正向词位置、数量特征 （n * 2）

In [21]:
import pandas as pd
import numpy as np
import jieba
import bz2
import pymysql
import jieba.posseg as pseg
import collections
from functools import *

In [15]:
# dev
# Connection settings for the MySQL instance queried by get_data_by_sql().
# Every value can be overridden through an environment variable of the same
# name; the literals below are only the fallback defaults.
# NOTE(review): the fallback password is still stored in the source file --
# prefer supplying MYSQL_PSW via the environment and rotating this credential.
import os

MYSQL_IP = os.environ.get('MYSQL_IP', '10.248.224.3')
MYSQL_PORT = int(os.environ.get('MYSQL_PORT', 11202))  # the port must be an int
MYSQL_USER = os.environ.get('MYSQL_USER', 'bigdata')
MYSQL_PSW = os.environ.get('MYSQL_PSW', '3jHj8qid0ZxXn18')
MYSQL_DB = os.environ.get('MYSQL_DB', 'dmall_public_opinion')

In [27]:
def get_data_by_sql(sql):
    """
    Run a SQL query against the configured MySQL database.

    A new connection is opened from the MYSQL_* module settings for every
    call and is closed again before the function returns.

    :param sql: SQL statement to execute.
    :return: pandas DataFrame produced by ``pd.read_sql`` for the query.
    """
    db = pymysql.connect(host=MYSQL_IP, port=MYSQL_PORT, user=MYSQL_USER, passwd=MYSQL_PSW,
                         db=MYSQL_DB, charset='utf8')
    try:
        return pd.read_sql(sql, con=db)
    finally:
        # Release the connection even when the query raises; the original
        # code never closed it.
        db.close()

def add_user_words(user_dict):
    """
    Register custom words with jieba.

    :param user_dict: mapping of word -> tag; each pair is passed to
        ``jieba.add_word`` with a fixed frequency of 1000000.
    :return: None
    """
    for entry in user_dict.items():
        term, pos_tag = entry
        jieba.add_word(term, freq=1000000, tag=pos_tag)

def cut_word(text):
    """
    Segment a text into words and collect per-word features.

    Tokens are skipped when the word is in the module-level ``stop_dict`` or
    when the flag reported by the segmenter is ``'x'`` or ``'eng'``.

    :param text: text to segment.
    :return: tuple ``(words, flags, lengths, count)`` -- the kept words, their
        flags, their character lengths, and the number of kept words.
    """
    skipped_flags = ('x', 'eng')
    kept = [
        (token.word, token.flag)
        for token in pseg.cut(text, HMM=False)
        if not (token.word in stop_dict or token.flag in skipped_flags)
    ]
    kept_words = [term for term, _ in kept]
    kept_flags = [tag for _, tag in kept]
    kept_lengths = [len(term) for term in kept_words]
    return kept_words, kept_flags, kept_lengths, len(kept_words)

def get_sentiment(row):
    """
    Derive a sentiment label from the three score fields of a record.

    :param row: mapping-like record providing ``negative_score``,
        ``other_score`` and ``positive_score``.
    :return: index of the highest score shifted by one, i.e. -1 when the
        negative score is highest, 0 for the "other" score, 1 for the
        positive score (the first maximum wins on ties).
    """
    score_fields = ("negative_score", "other_score", "positive_score")
    ordered_scores = np.array([row[field] for field in score_fields])
    best_position = np.argmax(ordered_scores)
    return best_position - 1

def get_dict(data,col_name):
    """
    Build the vocabulary of one column, most frequent entries first.

    :param data: container indexable by ``col_name`` (e.g. a DataFrame) whose
        column holds one sequence of tokens per sample.
    :param col_name: name of the column to read.
    :return: list of distinct tokens sorted by descending frequency; tokens
        with equal frequency keep their order of first appearance. An empty
        column yields an empty list.
    """
    # Count incrementally instead of concatenating every sample into one big
    # list with reduce(), which copied the growing list on each step and
    # raised TypeError when the column was empty.
    counts = collections.Counter()
    for tokens in data[col_name]:
        counts.update(tokens)

    # sorted() is stable, so ties stay in Counter insertion order.
    freq_sorted = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return [token for token, _ in freq_sorted]




In [13]:
# Global variables
## Stop words and user dictionary used during word segmentation
# Words that cut_word() drops from its output; currently empty.
stop_dict = []
# Custom word -> tag pairs handed to add_user_words().
user_dict = {"肿么了": "user"}
# Register the custom words (add_user_words calls jieba.add_word for each pair).
add_user_words(user_dict)

In [9]:
# 0. Data source: public-opinion events (news, WeChat).
# The query keeps only rows with media_type = 1 and a non-null sentiment.
sql="select event_id,event_name,sentiment from event_sentiment where media_type=1 and sentiment is not null"

# 1. Fetch the sample event IDs, event names and sentiment values.
data=get_data_by_sql(sql)

# Print a summary of the loaded DataFrame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24700 entries, 0 to 24699
Data columns (total 3 columns):
event_id      24700 non-null object
event_name    24700 non-null object
sentiment     24700 non-null int64
dtypes: int64(1), object(2)
memory usage: 579.0+ KB


In [18]:
# 2. Segment and clean every event name.
# The columns created here hold the raw material for the later feature steps:
# the word lists (step 3, word vectors), the POS flags (step 4) and the word
# lengths (step 5).
words_with_flag = data.event_name.apply(cut_word)
# cut_word returns (words, flags, lengths, count); store each component in
# its own column.
for position, column in enumerate(('words', 'words_flag', 'words_length', 'length')):
    data[column] = words_with_flag.apply(lambda parts, position=position: parts[position])

In [None]:
# 6. 建立负向词位置、数量特征 （n * 2）
# 7. 建立正向词位置、数量特征 （n * 2）

In [23]:
# Build the word vocabulary (most frequent words first, see get_dict).
# NOTE(review): "date" in the message below looks like a typo for "data";
# left unchanged because it is runtime output.
print("date length : %s"%len(data))
words_dict_list = get_dict(data,'words')
print("total words count : %s"%len(words_dict_list))
# Save the word vocabulary with np.save under the name 'words_dict'.
np.save('words_dict',words_dict_list)

# Build the vocabulary of word flags found in the 'words_flag' column
# (this is a flag vocabulary, not a word-frequency dictionary).
words_flag_dict_list = get_dict(data,'words_flag')
print("total words_flag count : %s"%len(words_flag_dict_list))
# Save the flag vocabulary with np.save under the name 'words_flag_dict'.
np.save('words_flag_dict',words_flag_dict_list)

date length : 24700
total words count : 28533


In [25]:
# Map every word of each sample to its position in the word vocabulary.
# The lookup table is built once so each word costs a dict access; the
# previous list.index() call rescanned the whole vocabulary for every word.
# Vocabulary entries are unique (get_dict derives them from Counter keys), so
# each word maps to the same position list.index() returned.
word_to_index = {word: position for position, word in enumerate(words_dict_list)}
data['words_index'] = data.words.apply(
    lambda item: [word_to_index[word] for word in item])

In [29]:
# Map every word flag of each sample to its position in the flag vocabulary.
# The lookup table is built once so each flag costs a dict access instead of
# a list.index() scan. Vocabulary entries are unique (get_dict derives them
# from Counter keys), so the positions match what list.index() returned.
flag_to_index = {flag: position for position, flag in enumerate(words_flag_dict_list)}
data['words_flag_index'] = data.words_flag.apply(
    lambda item: [flag_to_index[flag] for flag in item])

In [30]:
# Save the featured samples: select the exported columns, then write them to
# CSV with a header row and without the index column.
export_columns = [
    'event_id',
    'words',
    'words_index',
    'words_flag',
    'words_flag_index',
    'words_length',
    'length',
    'sentiment',
]
featured_samples = data[export_columns]
featured_samples.to_csv("data_featured.csv", header=True, encoding='utf_8_sig', index=None)

In [None]:
# 3. Load pre-trained word vectors for the words that occur in the samples.
# File format: the first line is a header whose first two fields are printed
# as word count and vector size; every following line is "word v1 v2 v3 ...".
# Row i of embedding_matrix receives the vector of words_dict_list[i]; words
# that never appear in the vector file keep their all-zero row.
embedding_matrix = np.zeros((len(words_dict_list), 300))
word_vector_count = 0
# Word -> row lookup built once. The previous code ran `word in list` plus
# list.index() for every line of the vector file, i.e. two full vocabulary
# scans per line. Vocabulary entries are unique (get_dict derives them from
# Counter keys), so the rows match what list.index() returned.
embedding_row_of = {word: row for row, word in enumerate(words_dict_list)}
# The context manager closes the archive even if parsing fails midway.
with bz2.open('sgns.sogounews.bigram-char.bz2', mode='r') as file:
    for line_number, line in enumerate(file):
        values = line.split()
        if line_number == 0:
            # Header line: report its two fields and move on to the vectors.
            print("word count : %s" % values[0])
            print("vector size : %s" % values[1])
            continue
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        # The file is read in binary mode, so the word must be decoded.
        word = values[0].decode('utf-8')
        index = embedding_row_of.get(word)
        if index is None:
            continue
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_matrix[index] = coefs
        word_vector_count = word_vector_count + 1

print('Found %s word vectors.' % word_vector_count)

In [None]:
# Persist the embedding matrix with np.save under the name 'embedding_matrix'.
np.save('embedding_matrix',embedding_matrix)