# 将政策文本按照情绪分类

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import sys
sys.path.append('/Users/yanyan/Documents/MyQuant/MarketSeparationBasedOnNLP/Code/Experiment/Evaluation')
from Evaluation import *
import jieba
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import logging
from gensim.test.utils import get_tmpfile
from sklearn.manifold import SpectralEmbedding, Isomap, MDS, TSNE
from scipy.stats import ks_2samp
from tqdm import tqdm
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
def tokenize_and_del_stopword(stopword_set, text: pd.Series):
    """Segment every document with jieba and discard stop words.

    Args:
        stopword_set: Container of tokens to drop; membership is tested
            with ``in``.
        text: Series of raw document strings.

    Returns:
        A list with one entry per document. Each entry is the list of tokens
        produced by ``jieba.lcut_for_search`` that are not in
        ``stopword_set``, in the order jieba returned them.
    """
    return [
        [token for token in jieba.lcut_for_search(document) if token not in stopword_set]
        for document in tqdm(text.to_list())
    ]

def vector_explode(ori_ser):
    """Expand a Series of vectors into a wide DataFrame, one column per component.

    Args:
        ori_ser: Series whose values each expose ``.tolist()``
            (e.g. NumPy arrays).

    Returns:
        DataFrame with one row per element of ``ori_ser``. The Series index is
        moved into leading column(s) by ``reset_index`` and is followed by
        integer-named columns holding the vector components.
    """
    # Convert each vector exactly once; mapping the whole frame again for
    # every row would make the work quadratic in the number of rows.
    rows = [
        vec.tolist()
        for vec in tqdm(ori_ser.to_numpy(), desc="Vector exploding")
    ]
    return pd.DataFrame(rows, index=ori_ser.index).reset_index()

# Load the stop-word list, one entry per line, into a set for fast membership tests.
# The encoding is passed explicitly so the file decodes the same way on every
# platform instead of depending on the locale default.
# NOTE(review): assumes the file is UTF-8 encoded - confirm.
with open("Data/baidu_stopwords.txt", "r", encoding="utf-8") as f:
    # Only the newline character is stripped; any other whitespace in an entry is kept.
    stopwords = {line.strip('\n') for line in f}

# Join the RESSET policy table with the ED policy table on the observation ID,
# then add a parsed publication date and the tokenized document body.
Industry_policy = (
    pd.read_csv('Data/行业政策/ED_IndustryPolicy.csv')
    .pipe(lambda ed: pd.merge(pd.read_csv('Data/RESSET_INDPOLICY_1.csv'), ed, left_on='观测ID()_ID', right_on='ID'))
    # Series.map applies the same per-element conversion without the
    # deprecated DataFrame.applymap.
    .assign(InfoPublDateNP=lambda df: df['InfoPublDate'].map(lambda raw: np.datetime64(raw)))
    .assign(TokenContent=lambda df: tokenize_and_del_stopword(stopwords, df['Content']))
)

# Tagged corpus: every tokenized document is tagged with its positional index.
# It is only consumed by the commented-out training call below.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(Industry_policy['TokenContent'])]
# model = Doc2Vec(documents, vector_size=10, window=4, min_count=1, workers=4)
# The model path is absolute, so it is used directly. Passing it through
# gensim's get_tmpfile() resolved to this very same path (see the load log)
# and is meant for temp-folder file names, not for absolute paths.
fname = "/Users/yanyan/Documents/MyQuant/MarketSeparationBasedOnNLP/Code（毕业论文）/Data/my_doc2vec_model"
# model.save(fname)
model = Doc2Vec.load(fname)  # you can continue training with the loaded model!

# Infer one embedding per tokenized document with the loaded Doc2Vec model.
temp_doc = [
    model.infer_vector(tokens)
    for tokens in tqdm(Industry_policy['TokenContent'], desc='计算doc对应的向量')
]

# Attach the inferred vectors, derive calendar year/month from the parsed
# publication date, and keep only rows whose info level is 3 or lower.
Industry_policy = (
    Industry_policy
    .assign(DocVec=temp_doc)
    # Series.map keeps the element-wise attribute access while avoiding the
    # deprecated DataFrame.applymap.
    .assign(InfoPublYear=lambda df: df['InfoPublDateNP'].map(lambda stamp: stamp.year))
    .assign(InfoPublMon=lambda df: df['InfoPublDateNP'].map(lambda stamp: stamp.month))
    .pipe(lambda df: df.loc[df['信息级别_InfoLevel'] <= 3])
)
#     # 整理时间序列文本向量
#     # 分信息级别合并向量为月频
#     for info_level in [1,2,3,4,5]:
#         temp = Industry_policy.loc[Industry_policy['信息级别_InfoLevel']==info_level].groupby(['InfoPublYear','InfoPublMon'])['DocVec'].mean()
#         temp_exploded = vector_explode(temp)
#         temp_exploded.to_csv('/Users/yanyan/Documents/MyQuant/MarketSeparationBasedOnNLP/Code（毕业论文）/Data/时间序列向量导出结果/DocVec_monthly_InfoLevel{}_20240207.csv'.format(info_level))

#     DocVec_daily = Industry_policy.groupby('InfoPublDateNP')['DocVec'].mean()
#     vector_explode(DocVec_daily).to_csv('/Users/yanyan/Documents/MyQuant/MarketSeparationBasedOnNLP/Code（毕业论文）/Data/时间序列向量导出结果/DocVec_daily_20240207.csv')
#     DocVec_monthly = Industry_policy.groupby(['InfoPublYear','InfoPublMon'])['DocVec'].mean()
#     vector_explode(DocVec_monthly).to_csv('/Users/yanyan/Documents/MyQuant/MarketSeparationBasedOnNLP/Code（毕业论文）/Data/时间序列向量导出结果/DocVec_monthly_20240207.csv')
#     print('Finished.')


100%|██████████| 22497/22497 [00:36<00:00, 609.51it/s]
2024-03-29 11:16:29,412 : INFO : loading Doc2Vec object from /Users/yanyan/Documents/MyQuant/MarketSeparationBasedOnNLP/Code（毕业论文）/Data/my_doc2vec_model
2024-03-29 11:16:29,441 : INFO : loading dv recursively from /Users/yanyan/Documents/MyQuant/MarketSeparationBasedOnNLP/Code（毕业论文）/Data/my_doc2vec_model.dv.* with mmap=None
2024-03-29 11:16:29,441 : INFO : loading wv recursively from /Users/yanyan/Documents/MyQuant/MarketSeparationBasedOnNLP/Code（毕业论文）/Data/my_doc2vec_model.wv.* with mmap=None
2024-03-29 11:16:29,442 : INFO : setting ignored attribute cum_table to None
2024-03-29 11:16:29,857 : INFO : Doc2Vec lifecycle event {'fname': '/Users/yanyan/Documents/MyQuant/MarketSeparationBasedOnNLP/Code（毕业论文）/Data/my_doc2vec_model', 'datetime': '2024-03-29T11:16:29.856978', 'gensim': '4.3.0', 'python': '3.10.12 (main, Jul  5 2023, 15:02:25) [Clang 14.0.6 ]', 'platform': 'macOS-14.2.1-arm64-arm-64bit', 'event': 'loaded'}
计算doc对应的向量: 100%

In [5]:
# Sentiment lexicon workbook: one sheet per polarity, one word column per sheet.
sentiment_lexicon_path = '/Users/yanyan/Downloads/中文金融情感词典_姜富伟等(2020).xlsx'
# Only the word column that is actually used is stripped of surrounding
# whitespace; this avoids the deprecated DataFrame.applymap over the whole sheet.
negative_words = set(
    pd.read_excel(sentiment_lexicon_path, sheet_name='negative')['Negative Word'].map(lambda word: word.strip())
)
positive_words = set(
    pd.read_excel(sentiment_lexicon_path, sheet_name='positive')['Positive Word'].map(lambda word: word.strip())
)

def count_feeling(token_list, words_set):
    """Return the share of tokens in ``token_list`` that belong to ``words_set``.

    Args:
        token_list: Sequence of tokens for one document.
        words_set: Container of lexicon words; membership is tested with ``in``.

    Returns:
        The number of matching tokens divided by ``len(token_list)``, or
        ``0.0`` when ``token_list`` is empty.
    """
    total = len(token_list)
    if total == 0:
        # A document with no tokens contains no lexicon words; returning 0.0
        # avoids a ZeroDivisionError.
        return 0.0
    return sum(1 for token in token_list if token in words_set) / total

# Per-document share of positive / negative lexicon words, plus their sum.
Industry_policy = (
    Industry_policy
    # Series.map applies count_feeling to each token list without the
    # deprecated DataFrame.applymap.
    .assign(pos_ratio=lambda df: df['TokenContent'].map(lambda tokens: count_feeling(tokens, positive_words)))
    .assign(neg_ratio=lambda df: df['TokenContent'].map(lambda tokens: count_feeling(tokens, negative_words)))
    .assign(sum_ratio=lambda df: df['pos_ratio'] + df['neg_ratio'])
)

display(Industry_policy.head(1))

Unnamed: 0,观测ID()_ID,事件标志_EventFlg,信息发布日期_InfoPubDt,媒体出处代码()_MediaCd,媒体出处_Media,撰写机构_Writer,作者_Author,信息内容_Content,信息对象代码()_ObjCd,信息地域划分代码()_RegionCd,...,InfoTitle,Content,InfoPublDateNP,TokenContent,DocVec,InfoPublYear,InfoPublMon,pos_ratio,neg_ratio,sum_ratio
0,670676730611,1,2021-04-02,3,上海证券报,,,,1000,142.0,...,权益变动信息 披露知多少,小费阿姨：小钟老师，近期我关注的上市公司发布了临时公告，是关于大股东持有公司权益变动...,2021-04-02,"[小费, 阿姨, 小钟, 老师, 近期, 关注, 上市, 公司, 上市公司, 发布, 临时,...","[1.7802233, -0.99139917, -0.92465454, 0.270370...",2021,4,0.069606,0.018561,0.088167


In [9]:
# Bucket documents into sentiment-intensity terciles within each
# (year, month) cross-section, ranked by sum_ratio (pos_ratio + neg_ratio).
#
# Labels are written back by row position. groupby yields the groups in sorted
# key order, so concatenating the per-group labels into a flat list and
# assigning it positionally would only line up with the rows if the frame were
# already sorted by (year, month).
sum_ratio_values = Industry_policy['sum_ratio'].to_numpy()
res = np.full(len(Industry_policy), None, dtype=object)
month_positions = Industry_policy.groupby(['InfoPublYear','InfoPublMon']).indices
for key, positions in tqdm(month_positions.items()):
    if len(positions) < 3:
        # Too few documents in this month to form three quantile bins.
        res[positions] = 'neutral'
    else:
        res[positions] = pd.qcut(sum_ratio_values[positions], 3, labels=['weak','neutral','strong']).to_list()
Industry_policy = Industry_policy.assign(feeling = res)
display(Industry_policy.head(1))

100%|██████████| 353/353 [00:00<00:00, 4738.13it/s]


Unnamed: 0,观测ID()_ID,事件标志_EventFlg,信息发布日期_InfoPubDt,媒体出处代码()_MediaCd,媒体出处_Media,撰写机构_Writer,作者_Author,信息内容_Content,信息对象代码()_ObjCd,信息地域划分代码()_RegionCd,...,Content,InfoPublDateNP,TokenContent,DocVec,InfoPublYear,InfoPublMon,pos_ratio,neg_ratio,sum_ratio,feeling
0,670676730611,1,2021-04-02,3,上海证券报,,,,1000,142.0,...,小费阿姨：小钟老师，近期我关注的上市公司发布了临时公告，是关于大股东持有公司权益变动...,2021-04-02,"[小费, 阿姨, 小钟, 老师, 近期, 关注, 上市, 公司, 上市公司, 发布, 临时,...","[1.7802233, -0.99139917, -0.92465454, 0.270370...",2021,4,0.069606,0.018561,0.088167,neutral


In [11]:
# Report how many documents fall into each sentiment bucket.
print('不同情绪政策的数量：')
for caption, label in (("弱情绪政策：", 'weak'), ("中情绪政策：", 'neutral'), ("强情绪政策：", 'strong')):
    print(caption, len(Industry_policy.loc[Industry_policy['feeling'] == label]))

不同情绪政策的数量：
弱情绪政策： 7368
中情绪政策： 7267
强情绪政策： 7265


In [12]:
# Build monthly time series of document vectors.
# One series is exported per sentiment bucket, plus one over all documents.

from itertools import product

# Complete (year, month) grid covering 1984-01 through 2023-12.
standard_timeline = pd.DataFrame(
    list(product(range(1984, 2024), range(1, 13))),
    columns=['InfoPublYear', 'InfoPublMon'],
)

for feeling in ['weak','neutral','strong']:
    # Mean document vector per (year, month) for this sentiment bucket.
    temp = Industry_policy.loc[Industry_policy['feeling']==feeling].groupby(['InfoPublYear','InfoPublMon'])['DocVec'].mean()
    temp_exploded = vector_explode(temp)
    # Align to the full monthly grid, carry the last observed vector forward
    # into months without documents, and drop the leading months that precede
    # the first observation.
    temp_exploded = (
        pd.merge(temp_exploded, standard_timeline, on=['InfoPublYear','InfoPublMon'], how='right')
        .ffill()
        .dropna()
    )
    # Filter on the frame's own year column rather than on a mask taken from
    # another frame. Then drop the first 4 and last 7 remaining rows; when
    # every month from 2001-01 is present this leaves 2001-05 .. 2023-05.
    # NOTE(review): if a bucket has no data before 2001-01, dropna() removes
    # extra leading months and this positional window shifts - confirm.
    temp_exploded = temp_exploded.loc[temp_exploded['InfoPublYear']>=2001].iloc[4:-7]
    temp_exploded.to_csv('/Users/yanyan/Documents/MyQuant/MarketSeparationBasedOnNLP/Code（毕业论文）/Data/时间序列向量导出结果/DocVec_monthly_InfoLevel123_feeling({})_20240329.csv'.format(feeling))

# All documents together; this export is not aligned to the standard timeline.
DocVec_monthly = Industry_policy.groupby(['InfoPublYear','InfoPublMon'])['DocVec'].mean()
vector_explode(DocVec_monthly).to_csv('/Users/yanyan/Documents/MyQuant/MarketSeparationBasedOnNLP/Code（毕业论文）/Data/时间序列向量导出结果/DocVec_monthly_InfoLevel123_feeling(all)_20240329.csv')
print('Finished.')

Vector exploding: 100%|██████████| 302/302 [00:00<00:00, 4848.76it/s]
Vector exploding: 100%|██████████| 300/300 [00:00<00:00, 6009.61it/s]
Vector exploding: 100%|██████████| 303/303 [00:00<00:00, 5899.87it/s]
Vector exploding: 100%|██████████| 353/353 [00:00<00:00, 5654.34it/s]

Finished.



