In [21]:
import os
import csv
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

import nltk
nltk.download('opinion_lexicon')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import opinion_lexicon
from nltk.sentiment.vader import SentimentIntensityAnalyzer


[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /home/dahamkim/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


In [22]:
# Input folders, both resolved under <current working directory>/data.
vader_dir, news_dir = (
    os.path.join(os.getcwd(), 'data', folder) for folder in ('VADER', 'news')
)

In [23]:
# Sample the clock once so the three date strings always describe consecutive
# days, even when the cell happens to run across midnight.
_run_time = datetime.today()
today = _run_time.strftime("%Y-%m-%d")
yesterday = (_run_time - timedelta(days=1)).strftime("%Y-%m-%d")
tomorrow = (_run_time + timedelta(days=1)).strftime("%Y-%m-%d")

In [24]:
# Load the news workbook whose file name starts with yesterday's date; the
# first sheet column is used as the row index.
news_text_data = pd.read_excel(
    os.path.join(news_dir, f'{yesterday}_news.xlsx'),
    index_col=0,
)
# Empty frame carrying only the article columns; it is merged against the
# loaded news further down.
news_sentiment_data = pd.DataFrame(
    columns=['source', 'topic', 'title', 'publish_date', 'link', 'keywords', 'text']
)

In [25]:
news_text_data

Unnamed: 0,date,publish_date,source,topic,title,keywords,link,text
0,2022-03-21,2022-03-21 14:04,nytimes.com,world,Ukraine Live Updates: Mariupol Refuses to Yiel...,kyiv/month/updates/city/russian/lviv/ukraine/w...,https://www.nytimes.com/live/2022/03/21/world/...,Ukraine Dispatch\n\nFor millions of internally...
1,2022-03-21,2022-03-21 14:29,nytimes.com,world,Live Updates: China Eastern Plane Crashes With...,crashes/chinese/live/flying/updates/guangxi/ea...,https://www.nytimes.com/live/2022/03/21/world/...,Video A Boeing 737-800 passenger plane operate...
2,2022-03-21,2022-03-21 07:00,nytimes.com,world,Japan Saved Red-Crowned Cranes. Can They Survi...,kushiro/yamazakis/mr/field/corn/revered/japan/...,https://www.nytimes.com/2022/03/21/world/asia/...,"KUSHIRO, Japan — The dance of the red-crowned ..."
3,2022-03-21,2022-03-21 09:00,nytimes.com,world,How Lviv Has Been Transformed by War,day/wrapped/tourists/cities/suitcases/windows/...,https://www.nytimes.com/2022/03/21/world/europ...,A family stands on a corner with their suitcas...
4,2022-03-21,2022-03-21 13:21,nytimes.com,world,"Creating Space Command, Australia Strengthens ...",military/australias/dutton/strengthens/hightec...,https://www.nytimes.com/2022/03/21/world/austr...,"CANBERRA, Australia — Two years after the Unit..."
...,...,...,...,...,...,...,...,...
209,2022-03-21,2022-03-21 04:30,ft.com/,tech,Tech embraces sustainability in drive for growth,position/sustainability/companies/uk/startup/s...,https://www.ft.com/content/52fff0ab-adb9-43ab-...,© Alex Hahn\n\nThe pandemic has drawn a clear ...
210,2022-03-21,2022-03-21 04:30,ft.com/,tech,Barcelona’s tech talent hub seeks more governm...,support/companies/business/world/barcelona/eur...,https://www.ft.com/content/01c89a2d-87e7-41bd-...,Barcelona has consolidated its place among Eur...
211,2022-03-21,2022-03-21 04:30,ft.com/,tech,Start-ups should not let a good energy crisis ...,higher/issues/companies/costs/rise/energy/busi...,https://www.ft.com/content/018aab32-07f1-427a-...,"For most start-up businesses, energy is genera..."
212,2022-03-21,2022-03-21 04:00,ft.com/,tech,Blood testing start-up Osler to tap investors ...,read/political/subscribe/trust/spot/world/oppo...,https://www.ft.com/content/54821c5e-9b37-49d1-...,Make informed decisions with the FT\n\nKeep ab...


In [26]:
news_sentiment_data

Unnamed: 0,source,topic,title,publish_date,link,keywords,text


In [27]:
# Load a lexicon (lexicon: {'word1': score1, 'word2': score2, ...})
def load_lm_lexicon():
    """Build a word -> polarity lexicon from the Loughran-McDonald dictionary.

    Reads ``LoughranMcDonald_2016.csv`` from ``vader_dir``. A word contributes
    +1.0 when its ``Positive`` flag is non-zero and -1.0 when its ``Negative``
    flag is non-zero; words whose net score is 0 are left out.

    Returns:
        dict: lower-cased word -> net sentiment score (+1.0 or -1.0).
    """
    # Loughran McDonald
    fn = os.path.join(vader_dir, "LoughranMcDonald_2016.csv")
    word_sent = pd.read_csv(fn)

    def _is_flagged(column):
        # read_csv infers a numeric dtype for these columns, so comparing the
        # values with the string "0" is always unequal: every word would get
        # both flags, net to 0, and the lexicon would come back empty.
        # Coercing to numbers handles both numeric and string-typed columns.
        numeric = pd.to_numeric(word_sent[column], errors='coerce').fillna(0)
        return numeric.ne(0).astype(float)

    sentiment = (_is_flagged('Positive') - _is_flagged('Negative')).tolist()
    # str() keeps the original handling of non-string cells (e.g. NaN -> 'nan').
    words = [str(w).lower() for w in word_sent['Word']]

    words2weights = {}
    for w, sentiment_score in zip(words, sentiment):
        # positive / negative labeling: only keep words with a net polarity
        if sentiment_score:
            words2weights[w] = sentiment_score
    return words2weights

In [28]:
def load_hl_lexicon(_words2weights):
    """Return the Hu and Liu (2004) opinion lexicon as word -> score.

    Positive words map to 1.0 and negative words to -1.0; a word that is in
    both lists ends up at -1.0 because negatives are written last.

    Note: the ``_words2weights`` argument is not read. The result is built
    from scratch using ``opinion_lexicon`` (imported from ``nltk.corpus``).
    """
    hl_weights = {}
    for positive_word in opinion_lexicon.positive():
        hl_weights[positive_word] = 1.0
    for negative_word in opinion_lexicon.negative():
        hl_weights[negative_word] = -1.0
    return hl_weights

In [29]:
def load_news_vader_lexicon(lexicon_dir):
    """Read ``ns.vader.sentences.20k.csv`` from ``lexicon_dir`` into a dict.

    The file must provide a ``word`` column and a ``sentiment`` column; the
    result maps each word to its sentiment value (the last row wins when a
    word is repeated).
    """
    # vader lexicon specialized in finance domain
    csv_path = os.path.join(lexicon_dir, "ns.vader.sentences.20k.csv")
    table = pd.read_csv(csv_path)
    return {
        word: weight
        for word, weight in zip(table['word'].values, table['sentiment'].values)
    }

In [30]:
def combine_lexicons(lexicons):
    """Merge several word -> score lexicons into one by summing scores.

    Args:
        lexicons: sequence of dicts, e.g. [lm_lexicon, hl_lexicon, vader_lexicon].

    Returns:
        dict: the union of all words; a word present in several lexicons gets
        the sum of its scores.

    The lexicons are visited last-to-first, as before, but the caller's list
    is no longer reversed in place.
    """
    words2weights = {}

    # reversed(list(...)) walks a private copy, leaving the argument untouched.
    for lex in reversed(list(lexicons)):
        for w in lex:
            words2weights.setdefault(w, 0.0)
            words2weights[w] += lex[w]

    return words2weights

In [31]:
def lexicon_scoring(text, lexicon):
    """Average lexicon score per token of ``text``.

    The text is tokenized with ``TweetTokenizer(preserve_case=False)``; each
    token is lower-cased and looked up in ``lexicon``, counting 0.0 when the
    token has no entry. The summed score is divided by the token count.

    Raises:
        ZeroDivisionError: when the text yields no tokens.
    """
    tokens = TweetTokenizer(preserve_case=False).tokenize(text)
    # Tokens without a lexicon entry contribute 0.0 instead of None.
    total = sum(lexicon.get(token.lower(), 0.0) for token in tokens)
    return total / len(tokens)

In [32]:

def negated_lexicon_scoring(text, lexicon):
    """Average lexicon score per token, flipping the sign after a negation.

    Works like a plain lexicon average, except that a token's score is
    multiplied by -1.0 when any of the (up to) three tokens right before it is
    in VADER's ``NEGATE`` word list (e.g. "bad" in "not bad").

    Raises:
        ZeroDivisionError: when the text yields no tokens.
    """
    negators = set(nltk.sentiment.vader.VaderConstants.NEGATE)
    tokens = TweetTokenizer(preserve_case=False).tokenize(text)

    total = 0.0
    for position, token in enumerate(tokens):
        # Context window: the three tokens preceding the current one.
        preceding = tokens[max(0, position - 3):position]
        # A negation word in the window inverts the token's polarity.
        sign = -1.0 if set(preceding) & negators else 1.0
        total += (sign * lexicon.get(token.lower(), 0.0))
    return total / len(tokens)

In [33]:
# Anti-join: keep the rows of news_text_data that have no match in
# news_sentiment_data, restricted to the article columns and re-indexed from 0.
df_target = (
    news_text_data
    .merge(news_sentiment_data, how='outer', indicator='Exist')
    .loc[lambda merged: merged['Exist'] == 'left_only']
    .loc[:, ['source', 'topic', 'title', 'publish_date', 'link', 'keywords', 'text']]
    .reset_index(drop=True)
)

In [34]:
df_target

Unnamed: 0,source,topic,title,publish_date,link,keywords,text
0,nytimes.com,world,Ukraine Live Updates: Mariupol Refuses to Yiel...,2022-03-21 14:04,https://www.nytimes.com/live/2022/03/21/world/...,kyiv/month/updates/city/russian/lviv/ukraine/w...,Ukraine Dispatch\n\nFor millions of internally...
1,nytimes.com,world,Live Updates: China Eastern Plane Crashes With...,2022-03-21 14:29,https://www.nytimes.com/live/2022/03/21/world/...,crashes/chinese/live/flying/updates/guangxi/ea...,Video A Boeing 737-800 passenger plane operate...
2,nytimes.com,world,Japan Saved Red-Crowned Cranes. Can They Survi...,2022-03-21 07:00,https://www.nytimes.com/2022/03/21/world/asia/...,kushiro/yamazakis/mr/field/corn/revered/japan/...,"KUSHIRO, Japan — The dance of the red-crowned ..."
3,nytimes.com,world,How Lviv Has Been Transformed by War,2022-03-21 09:00,https://www.nytimes.com/2022/03/21/world/europ...,day/wrapped/tourists/cities/suitcases/windows/...,A family stands on a corner with their suitcas...
4,nytimes.com,world,"Creating Space Command, Australia Strengthens ...",2022-03-21 13:21,https://www.nytimes.com/2022/03/21/world/austr...,military/australias/dutton/strengthens/hightec...,"CANBERRA, Australia — Two years after the Unit..."
...,...,...,...,...,...,...,...
209,ft.com/,tech,Tech embraces sustainability in drive for growth,2022-03-21 04:30,https://www.ft.com/content/52fff0ab-adb9-43ab-...,position/sustainability/companies/uk/startup/s...,© Alex Hahn\n\nThe pandemic has drawn a clear ...
210,ft.com/,tech,Barcelona’s tech talent hub seeks more governm...,2022-03-21 04:30,https://www.ft.com/content/01c89a2d-87e7-41bd-...,support/companies/business/world/barcelona/eur...,Barcelona has consolidated its place among Eur...
211,ft.com/,tech,Start-ups should not let a good energy crisis ...,2022-03-21 04:30,https://www.ft.com/content/018aab32-07f1-427a-...,higher/issues/companies/costs/rise/energy/busi...,"For most start-up businesses, energy is genera..."
212,ft.com/,tech,Blood testing start-up Osler to tap investors ...,2022-03-21 04:00,https://www.ft.com/content/54821c5e-9b37-49d1-...,read/political/subscribe/trust/spot/world/oppo...,Make informed decisions with the FT\n\nKeep ab...


In [35]:
# Combined lexicon. The Loughran-McDonald CSV is read once and the result is
# reused as the load_hl_lexicon argument instead of loading the file twice.
lm_lexicon = load_lm_lexicon()
lex = combine_lexicons([lm_lexicon, load_hl_lexicon(lm_lexicon), load_news_vader_lexicon(vader_dir)])
# Scores without negation adjustment
sent_score = []
# Scores with negation adjustment
sent_score_adj = []

for text in df_target['text']:
    plain_score = 0
    adjusted_score = 0
    # An empty Excel cell is read back as NaN (a float), not as "", and the
    # tokenizer raises TypeError on it; treat any non-string text as missing.
    if isinstance(text, str):
        try:
            plain_score = lexicon_scoring(text, lex)
            adjusted_score = negated_lexicon_scoring(text, lex)
        # Text that yields no tokens: fall back to a sentiment score of 0
        except ZeroDivisionError:
            plain_score = 0
            adjusted_score = 0
    # Append both scores together so the two lists always stay the same length.
    sent_score.append(plain_score)
    sent_score_adj.append(adjusted_score)

# Same object as df_target: the two score columns are added to it in place.
news_text_data_with_sentiment_score = df_target
news_text_data_with_sentiment_score['sent_score'] = sent_score
news_text_data_with_sentiment_score['sent_score_adj'] = sent_score_adj

In [36]:
# Keep only the publication timestamp and the two scores, then write them to
# news_sentiment_score.xlsx inside news_dir.
news_text_data_with_sentiment_score = news_text_data_with_sentiment_score.loc[
    :, ['publish_date', 'sent_score', 'sent_score_adj']
]
news_text_data_with_sentiment_score.to_excel(
    os.path.join(news_dir, 'news_sentiment_score.xlsx')
)