### **新聞詞性標註，情緒計算**
- stopwords.txt
- postive_word_YSL.xlsx
- negative_word_YSL.xlsx
- IPO_USECODE.xlsx
- IPO_RawData.xlsx
- n_*.xlsx

In [37]:
import pandas as pd
import numpy as np
import warnings
from datetime import timedelta
warnings.filterwarnings("ignore")

In [2]:
# Load the news workbooks (per the original note: already sentence-split,
# tagged and weighted upstream) and stack them into one DataFrame.
import glob

# glob returns matches in arbitrary, OS-dependent order; sorting keeps the row
# order of `news` (and therefore every positional index below) reproducible.
all_files = sorted(glob.glob("n_*.xlsx"))
if not all_files:
    # Fail with a clear message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError("no news files matching 'n_*.xlsx' in the working directory")

df = [pd.read_excel(filename) for filename in all_files]

news = pd.concat(df, axis=0, ignore_index=True)

In [3]:
# Load the stop-word file: one entry per line, surrounding whitespace stripped.
stopwords = []
with open('./stopwords.txt', 'r', encoding='utf-8') as file:
    stopwords.extend(line.strip() for line in file)

#### **Hsieh 謝委霖字典**

In [4]:
# Load the positive-word lexicon workbook; it is flattened into `pos_ys` in the next cell.
postive_dict = pd.read_excel('postive_word_YSL.xlsx')

In [5]:
# Flatten every cell of the workbook into one list of positive words.
# Drop *all* empty cells: list.remove(np.nan) removed only the first NaN and
# raised ValueError when the sheet had no empty cell at all.
pos_ys = [w for w in postive_dict.values.ravel().tolist() if pd.notna(w)]

In [6]:
# Load the negative-word lexicon workbook; it is flattened into `neg_ys` in the next cell.
negative_dict = pd.read_excel('negative_word_YSL.xlsx')

In [7]:
# Flatten every cell of the workbook into one list of negative words.
# Drop *all* empty cells: list.remove(np.nan) removed only the first NaN and
# raised ValueError when the sheet had no empty cell at all.
neg_ys = [w for w in negative_dict.values.ravel().tolist() if pd.notna(w)]

* 修正辭典

In [8]:
# Remove entries judged unreasonable for the negative lexicon
# (each must be present, otherwise list.remove raises ValueError).
for unwanted in ('領域', '目標', '開發'):
    neg_ys.remove(unwanted)

# Add extra negative words
neg_ys.extend(['持平', '惡化', '赤字', '告吹', '波及', '悽慘', '悲觀'])

# Add extra positive words
pos_ys.extend(['調升', '助於', '湧進', '出色', '研發', '獲利', '帶動'])

#### **計算新聞中正負向字詞**

* 清理文字

In [9]:
def word_list(x,list_name):
    """Turn the text form of a token list back into a list of tokens.

    ``x`` is a string such as ``"['a', ',', 'b']"``. It is parsed as a Python
    literal so that tokens which are themselves commas, brackets or quotes
    survive intact (splitting on ',' turned a ',' token into empty strings and
    erased '[' / ']' tokens, so the later punctuation filter never saw them).
    If ``x`` is not a valid list literal, the previous strip-and-split parsing
    is used as a fallback.

    The parsed list is appended to ``list_name`` and also returned.
    """
    import ast

    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        a = [str(token) for token in parsed]
    else:
        # Fallback: legacy parsing for strings that are not list literals.
        a = x.replace('[','').replace(']',"").replace("'","").replace(" ","").split(',')
    list_name.append(a)
    return a

In [10]:
# Parse every row's `word_seg` text; word_list appends each parsed token list to list_.
list_ = []
for seg_text in news['word_seg']:
    word_list(seg_text, list_)
news['word_clean'] = list_

In [11]:
# Punctuation tokens to drop from every article's token list.
remove_list = [
    '?', '？', '!', '！', '。', ',', '，', ';', ':', '、', '：', '；',
    '】', '【', '（', '）', '(', ')', '[', ']', '●', '／', '「', '」',
]
news['word_clean'] = news['word_clean'].map(
    lambda tokens: [tok for tok in tokens if tok not in remove_list]
)

In [12]:
# Drop stop words from the cleaned tokens.
# Build the lookup set once: testing each token against the stop-word *list*
# rescans the whole list for every token of every article.
stopword_set = set(stopwords)
news['word_final'] = news['word_clean'].apply(lambda x: [i for i in x if i not in stopword_set])

In [13]:
# Disabled one-off removal of the row labelled 100:
# news.drop(100, inplace = True)
# Renumber the rows 0..n-1; later cells look rows up by integer label
# (e.g. news['word_pos'][i]), which relies on this contiguous index.
news = news.reset_index(drop = True)

* 篩出正向文字

In [14]:
def word_pos(x,list_name):
    """Collect the tokens of ``x`` that are in the positive lexicon ``pos_ys``.

    Matches keep their original order and duplicates. The resulting list is
    appended to ``list_name`` and also returned, so the function works both
    with the accumulator pattern and directly inside ``Series.apply``.
    """
    # One set per call gives constant-time lookups instead of scanning the
    # whole lexicon list for every token.
    lexicon = set(pos_ys)
    l = [i for i in x if i in lexicon]
    list_name.append(l)
    return l

In [15]:
# Positive-word hits for each article, in row order (word_pos appends to list_pos).
list_pos = []
for tokens in news['word_clean']:
    word_pos(tokens, list_pos)
news['word_pos'] = list_pos

In [16]:
# For the companies named 創意 / 安可 / 有益, drop those three words from the
# positive hits (presumably because the company name itself would be counted
# as a positive word — confirm).
remove_list = ['有益', '創意', '安可']
is_ambiguous = news['name'].isin(['創意', '安可', '有益'])
news.loc[is_ambiguous, 'word_pos'] = news.loc[is_ambiguous, 'word_pos'].apply(
    lambda words: [w for w in words if w not in remove_list]
)

* 篩出負向文字

In [17]:
def word_neg(x,list_name):
    """Collect the tokens of ``x`` that are in the negative lexicon ``neg_ys``.

    Matches keep their original order and duplicates. The resulting list is
    appended to ``list_name`` and also returned, so the function works both
    with the accumulator pattern and directly inside ``Series.apply``.
    """
    # One set per call gives constant-time lookups instead of scanning the
    # whole lexicon list for every token.
    lexicon = set(neg_ys)
    l = [i for i in x if i in lexicon]
    list_name.append(l)
    return l

In [18]:
# Negative-word hits for each article, in row order (word_neg appends to list_neg).
list_neg = []
for tokens in news['word_clean']:
    word_neg(tokens, list_neg)
news['word_neg'] = list_neg

* 計算正負向字詞數

In [19]:
# Token counts per article: (new count column, list column it measures).
for count_col, token_col in (
    ('raw_article', 'word_clean'),   # tokens after punctuation removal
    ('len_article', 'word_final'),   # tokens after stop-word removal
    ('raw_pos', 'word_pos'),         # positive hits
    ('raw_neg', 'word_neg'),         # negative hits
):
    news[count_col] = news[token_col].map(len)

* 調整否定詞的影響

In [20]:
# For every article, find the positions in `word_clean` of each positive word.
# Iterating the two columns together covers however many rows `news` has;
# the previous hard-coded range(5667) broke as soon as the row count changed.
list_index = []
for tokens, pos_words in zip(news['word_clean'], news['word_pos']):
    search_set = set(pos_words)
    list_index.append([k for k, tok in enumerate(tokens) if tok in search_set])

In [21]:
# For every positive-word position, take the window of up to 2 tokens before
# and 2 tokens after it (the positive word itself included).
# zip() follows the actual number of articles instead of a hard-coded 5667.
word_token = []
for tokens, positions in zip(news['word_clean'], list_index):
    # Clamp the left edge at 0: for a hit at position 0 or 1 a negative slice
    # start wraps around to the end of the list and yields an empty/wrong window.
    word_token.append([tokens[max(p - 2, 0):p + 3] for p in positions])

In [22]:
# Tokens treated as negators: a positive word whose +/-2-token window contains
# any of these is counted as negated by the check that follows.
# NOTE(review): '非常' and '還' read as adverbs/intensifiers rather than
# negations — confirm they are intended to flip polarity.
negation_word = ['不', '不易','沒有', '還沒', '還','不會', '擺脫', '免去', 
                 '避免', '非常', '無', '沒', '仍需', '解決', '走']

In [23]:
# Per article, count the positive-word windows that contain at least one negation word.
negators = set(negation_word)
word_count = [
    sum(1 for window in windows if not negators.isdisjoint(window))
    for windows in word_token
]

In [24]:
# Per-article number of positive-word windows that contain a negation word.
news['adjusted_pos'] = word_count

In [25]:
# Positive / negative word counts after the negation adjustment:
# every negated positive hit is moved from the positive to the negative count.
news['len_pos'] = news['raw_pos'].sub(news['adjusted_pos'])
news['len_neg'] = news['raw_neg'].add(news['adjusted_pos'])

#### **計算情緒(sentiment)**

In [26]:
# Keep only the articles whose company name is listed in IPO_USECODE.xlsx
IPO_CODE = pd.read_excel('./IPO_USECODE.xlsx')
IPO_name_list = IPO_CODE['name'].drop_duplicates().tolist()
in_sample = news['name'].isin(IPO_name_list)
news = news.loc[in_sample].reset_index(drop=True)

* 計算 Sentiment

In [27]:
# Share of positive / negative words relative to the stop-word-filtered article length
article_len = news['len_article']
news['P'] = news['len_pos'].div(article_len)
news['N'] = news['len_neg'].div(article_len)

In [28]:
# Sentiment score: (positive count - negative count) / article length
news['sentiment'] = news['len_pos'].sub(news['len_neg']).div(news['len_article'])

In [29]:
# Net word count: positive count minus negative count, not scaled by length
news['len_sentiment'] = news['len_pos'].sub(news['len_neg'])

In [30]:
# An article with len_article == 0 gives 0/0 = NaN but x/0 = +/-inf in the ratio
# columns; fillna alone leaves the infinities, which would turn the per-company
# means into inf. Map them to NaN first so both cases end up as 0.
ratio_cols = ['P', 'N', 'sentiment']
news[ratio_cols] = news[ratio_cols].replace([np.inf, -np.inf], np.nan)
news.fillna(0, inplace = True)

In [31]:
# Persist the scored article table as a pickle file in the working directory.
news.to_pickle('news.pkl')

* 計算平均 Sentiment

In [33]:
# Per-company averages of the counts and ratios, plus the number of articles.
mean_cols = ['len_pos', 'len_neg', 'len_sentiment', 'P', 'N', 'sentiment']
agg_spec = {col: 'mean' for col in mean_cols}
agg_spec['name'] = 'count'  # row count per company, renamed to n_article below

per_company = news.groupby('name').agg(agg_spec)
sentiment = per_company.rename(columns={'name': 'n_article'}).reset_index()

In [None]:
# sentiment.to_pickle('sentiment.pkl')

In [53]:
# sentiment.to_excel('sentiment.xlsx', index = False)
# sentiment.to_excel('sentiment_ys_negation_1.xlsx', index = False)

* 計算上市前7天以前的 Sentiment

In [34]:
# Load the raw IPO table; the following cells read its name, close_price,
# offer_price and ipo_date columns.
IPO = pd.read_excel('IPO_RawData.xlsx')

In [35]:
def clean(df):
    """Add derived IPO columns to ``df`` in place (returns None).

    - ``underprice``: close_price / offer_price - 1
    - ``stock_code``: the first four characters of ``name``
    - ``name``: overwritten with ``name`` from the sixth character onward
    """
    price_ratio = df['close_price'].div(df['offer_price'])
    df['underprice'] = price_ratio.sub(1)
    df['stock_code'] = df['name'].str.slice(stop=4)
    df['name'] = df['name'].str.slice(start=5)
# Adds underprice / stock_code to IPO and strips the code prefix from name, in place.
clean(IPO)

In [38]:
# Cutoff date = IPO date minus 7 days; then keep only IPOs that have news coverage.
cutoff_offset = timedelta(days=7)
IPO['ipo_date_minus'] = IPO['ipo_date'] - cutoff_offset
has_news = IPO['name'].isin(news['name'].unique().tolist())
IPO = IPO.loc[has_news].reset_index(drop=True)

In [39]:
# Empty frame with the expected columns; the filtered articles are added to it next.
turnover_columns = [
    'content', 'date', 'name', 'title', 'word_es', 'word_pos', 'word_seg',
    'word_clean', 'word_final', 'word_neg', 'raw_article', 'len_article',
    'raw_pos', 'raw_neg', 'adjusted_pos', 'len_pos', 'len_neg', 'P', 'N',
    'polarity', 'sentiment', 'len_sentiment',
]
news_turnover = pd.DataFrame({column: [] for column in turnover_columns})

In [None]:
# For every IPO keep the articles about that company dated before its cutoff
# (IPO date minus 7 days).
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the accumulated frame on every iteration; collect the slices and
# concatenate once instead. Original row labels are kept, as before.
selected = [
    news[(news['name'] == company) & (news['date'] < cutoff)]
    for company, cutoff in zip(IPO['name'], IPO['ipo_date_minus'])
]
news_turnover = pd.concat([news_turnover] + selected)

In [121]:
# Per-company averages over the pre-cutoff articles, plus the number of such articles.
turnover_mean_cols = ['len_pos', 'len_neg', 'len_sentiment', 'P', 'N', 'sentiment']
turnover_agg = {col: 'mean' for col in turnover_mean_cols}
turnover_agg['name'] = 'count'  # row count per company, renamed to n_article below

turnover_by_company = news_turnover.groupby('name').agg(turnover_agg)
sentiment_turnover = turnover_by_company.rename(columns={'name': 'n_article'}).reset_index()

In [124]:
# Suffix every aggregated column with "_turnover", leaving the join key `name` as is.
# The previous hard-coded list had 9 names (including 'polarity_turnover') for the
# 8 columns the aggregation produces, which raises a length-mismatch ValueError.
sentiment_turnover = sentiment_turnover.rename(
    columns=lambda col: col if col == 'name' else col + '_turnover'
)

In [None]:
# Left join on company name: every row of `sentiment` is kept, with or without
# a matching row in `sentiment_turnover`.
sentiment = sentiment.merge(sentiment_turnover, on = 'name', how = 'left')

In [125]:
# Export the per-company sentiment table to Excel without the index column.
sentiment.to_excel('sentiment.xlsx', index = False)

In [None]:
# Also persist the per-company sentiment table as a pickle file.
sentiment.to_pickle('sentiment.pkl')