# Sentiment Analysis for Predicting Stock Market Movements using News Headlines

In [2]:
# Mount Google Drive (Colab only) so the files under /content/drive read by later cells are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Library Import

In [3]:
import warnings
# Blanket filter: hides every warning category for the rest of the session.
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import tqdm

In [4]:
# Sentiment-analysis tooling: NLTK's VADER analyzer and TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
from collections import Counter
import nltk
# Word-score lexicon used by the VADER analyzer
nltk.download('vader_lexicon')
# Model data for NLTK part-of-speech tagging
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

### Data import

In [5]:
# Load the pre-processed news CSV from the mounted Drive; the file is decoded as ISO-8859-1.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/pre_process_news_days.csv', encoding = "ISO-8859-1") 
# Alternative for running outside Colab with a relative path:
#df = pd.read_csv('dataset/pre_process_news_days.csv', encoding = "ISO-8859-1")

In [6]:
# Extra NLTK resources (tokenizer models, stop-word list, WordNet and its multilingual add-on).
# NOTE(review): no cell in this notebook references them directly — presumably required
# indirectly or left over from the pre-processing notebook; confirm before removing.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

### Sentiment score extraction

#### VADER with the NTUSD-Fin lexicon

In [7]:
# Analyzer instance shared by all VADER scoring cells in this section.
vader = SentimentIntensityAnalyzer()

In [8]:
# Load the NTUSD-Fin word-level lexicon (JSON) from the mounted Drive.
# Alternative for running outside Colab with a relative path:
#Fin_lex = pd.read_json("NTUSD-Fin/NTUSD_Fin_word_v1.0.json")
Fin_lex = pd.read_json("/content/drive/MyDrive/Colab Notebooks/NTUSD_Fin_word_v1.0.json")

In [10]:
# Preview the first three lexicon entries.
Fin_lex.head(3)

Unnamed: 0,chi_squared,bull_cfidf,bear_freq,bull_freq,bear_cfidf,word_vec,token,market_sentiment
0,90.655896,75.833011,9,88,40.806634,"[0.039068453013896005, -0.07342094182968101, 0...",notes,1.081211
1,21.315319,57.713857,0,16,0.0,"[0.129439651966094, -0.088906019926071, 0.0875...",manipulator,1.221692
2,49.89136,67.461386,17,36,47.397381,"[-0.050365339964628004, 0.09117619693279201, -...",sucked,-1.507409


In [11]:
# Keep only the token and its market-sentiment score, then build a
# token -> score mapping (a later duplicate token overwrites an earlier one).
fin_lex = Fin_lex.loc[:, ['token', 'market_sentiment']]
word, sentiment = fin_lex['token'], fin_lex['market_sentiment']
new_dict = {token: score for token, score in zip(word, sentiment)}

In [12]:
# Merge the NTUSD-Fin scores into the analyzer's lexicon in place; tokens already present are overwritten.
vader.lexicon.update(new_dict)

In [13]:
# Compound VADER score for each row of clean_news, computed with the lexicon as extended above.
df['Compound_NTUSD'] = [
    vader.polarity_scores(news_text)['compound']
    for news_text in df['clean_news']
]

In [14]:
# Score each row ONCE and fan the result out into the three proportion columns.
# The previous version ran the analyzer three separate times over the whole
# column just to read three keys of the same result dict.
ntusd_scores = pd.DataFrame(
    df['clean_news'].map(vader.polarity_scores).tolist(),
    index=df.index,                  # keep row alignment with df
    columns=['pos', 'neu', 'neg'],   # also guarantees the columns exist when df is empty
)
# Column names (and their insertion order) are unchanged so downstream consumers still match.
for part in ('pos', 'neu', 'neg'):
    df[f'Compound_NTUSD_{part}'] = ntusd_scores[part]

In [15]:
# Bucket the compound score into three labels: <= -0.06 is Negative,
# >= 0.06 is Positive, everything else (including missing scores) is Neutral.
ntusd_compound = df['Compound_NTUSD']
df['Sentiment_NTUSD'] = np.select(
    [ntusd_compound <= -0.06, ntusd_compound >= 0.06],
    ['Negative', 'Positive'],
    default='Neutral',
)

In [16]:
# Class balance of the NTUSD-Fin sentiment labels
df['Sentiment_NTUSD'].value_counts()

Positive    1850
Negative     132
Neutral        7
Name: Sentiment_NTUSD, dtype: int64

#### TextBlob

In [17]:
from textblob import TextBlob

In [18]:
def get_polarity(text):
    """Return the polarity component of TextBlob's sentiment for ``text``."""
    blob_sentiment = TextBlob(text).sentiment
    return blob_sentiment.polarity

In [19]:
def get_subjectivity(text):
    """Return the subjectivity component of TextBlob's sentiment for ``text``."""
    blob_sentiment = TextBlob(text).sentiment
    return blob_sentiment.subjectivity

In [20]:
# TextBlob polarity for each row of clean_news (the function is passed directly, no wrapper lambda).
df['Textblob_polarity'] = df['clean_news'].apply(get_polarity)

In [21]:
# TextBlob subjectivity for each row of clean_news (the function is passed directly, no wrapper lambda).
df['Textblob_subjectivity'] = df['clean_news'].apply(get_subjectivity)

In [22]:
# Bucket the TextBlob polarity into three labels: <= -0.001 is Negative,
# >= 0.001 is Positive, everything else (including missing scores) is Neutral.
textblob_polarity = df['Textblob_polarity']
df['Sentiment_Textblob'] = np.select(
    [textblob_polarity <= -0.001, textblob_polarity >= 0.001],
    ['Negative', 'Positive'],
    default='Neutral',
)

In [23]:
# Class balance of the TextBlob sentiment labels
df['Sentiment_Textblob'].value_counts()

Positive    1338
Negative     625
Neutral       26
Name: Sentiment_Textblob, dtype: int64

#### FinBERT 

In [24]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m80.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m118.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [25]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Fetch the pretrained tokenizer and sequence-classification model published as 'ProsusAI/finbert'.
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')
model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [26]:
from scipy.special import softmax
def polarity_scores_finbert(text):
  """Return FinBERT class probabilities for a single text.

  Args:
    text: the string to classify; input longer than the tokenizer's limit is
      truncated (``truncation=True``).

  Returns:
    dict with keys ``'FinBert_pos'``, ``'FinBert_neg'`` and ``'FinBert_neu'``
    holding the softmax of the model's three output logits.
  """
  encoded_text = tokenizer.encode(text, padding = True, truncation = True,  return_tensors='pt')
  # Inference only: disable autograd so no computation graph is built and kept
  # alive for every forward pass (less memory and time per row).
  with torch.no_grad():
    output = model(encoded_text)
  # output[0] holds the logits; [0] selects the only sequence in the batch.
  scores = softmax(output[0][0].numpy())
  # NOTE(review): assumes the checkpoint orders its classes as
  # 0=positive, 1=negative, 2=neutral — confirm against model.config.id2label.
  scores_dict = {
      'FinBert_pos' : scores[0],
      'FinBert_neg' : scores[1],
      'FinBert_neu' : scores[2]
  }
  return scores_dict


In [27]:
from tqdm import tqdm
# Score every row with FinBERT and keep one probability dict per Date.
# Rows whose forward pass raises RuntimeError are reported and left out of `res`.
res = {}
dated_news = zip(df['Date'], df['clean_news'])
for myid, text in tqdm(dated_news, total=len(df)):
    try:
        res[myid] = polarity_scores_finbert(text)
    except RuntimeError:
        print(f'Broke on {myid}')

100%|██████████| 1989/1989 [40:56<00:00,  1.24s/it]


In [28]:
def sentiment_finbert(pos,neg,neu):
  """Map three class scores to a label.

  'Positive' requires pos to be strictly greater than both neg and neu.
  'Negative' requires pos <= neg and neg strictly greater than neu.
  Every other combination (including ties with neu) yields 'Neutral'.
  """
  if pos > neg:
    return 'Positive' if pos > neu else 'Neutral'
  return 'Negative' if neg > neu else 'Neutral'

In [29]:
# Turn the {Date: {score name: value}} mapping into one row per Date,
# with the Date moved out of the index into a regular column.
results_finbert = (
    pd.DataFrame(res)
    .transpose()
    .rename_axis('Date')
    .reset_index()
)

In [30]:
# Element-wise wrapper so sentiment_finbert can be called on whole columns
# (np.vectorize is a convenience loop, not a performance optimisation).
vect_sent_finbert = np.vectorize(sentiment_finbert)

In [31]:
# Label each row from its three FinBERT scores, passed in (pos, neg, neu) order.
finbert_score_cols = ('FinBert_pos', 'FinBert_neg', 'FinBert_neu')
results_finbert['Sentiment_FinBert'] = vect_sent_finbert(
    *(results_finbert[col] for col in finbert_score_cols)
)

In [32]:
# Inner join on Date: rows without a FinBERT result are dropped from the merged table.
results_df = pd.merge(df, results_finbert, how='inner', on='Date')

In [33]:
# Class balance of the FinBERT sentiment labels
results_df['Sentiment_FinBert'].value_counts()

Neutral     1477
Negative     511
Positive       1
Name: Sentiment_FinBert, dtype: int64

In [34]:
# Preview the first three rows of the merged table.
results_df.head(3)

Unnamed: 0,Date,clean_news,news_stemmed,Label,Label_1day,Label_2day,Label_3day,Label_4day,Label_5day,Compound_NTUSD,...,Compound_NTUSD_neu,Compound_NTUSD_neg,Sentiment_NTUSD,Textblob_polarity,Textblob_subjectivity,Sentiment_Textblob,FinBert_pos,FinBert_neg,FinBert_neu,Sentiment_FinBert
0,2008-08-08,georgia downs two russian warplanes as countri...,"['georgia', 'down', 'two', 'russian', 'warplan...",0,0,0,0,0,0,0.9639,...,0.414,0.259,Positive,-0.050303,0.277104,Negative,0.024655,0.61502,0.360324,Negative
1,2008-08-11,why will not usa and nato help us if they will...,"['usa', 'nato', 'help', 'us', 'help', 'us', 'h...",1,0,0,0,0,0,0.0885,...,0.421,0.266,Positive,0.115398,0.395629,Positive,0.029502,0.447413,0.523085,Neutral
2,2008-08-12,remember that adorable yearold who sang at the...,"['rememb', 'ador', 'yearold', 'sang', 'open', ...",0,1,1,0,1,0,0.8511,...,0.522,0.208,Positive,-0.044302,0.536234,Negative,0.03339,0.234335,0.732276,Neutral


In [35]:
# Write the merged sentiment table to the current working directory, without the index column.
results_df.to_csv(
    'news_sentiment_analysis.csv',
    sep=',',
    encoding='utf-8',
    index=False,
)

In [36]:
# Write the same table to the mounted Drive dataset folder, without the index column.
results_df.to_csv(
    '/content/drive/MyDrive/Colab Notebooks/dataset/news_sent_analysis_results.csv',
    sep=',',
    encoding='utf-8',
    index=False,
)