# Stock Data Sentiment Analysis
### FinBERT for Sentiment
### LSA, Luhn, TextRank, and LexRank for summaries

## Get Data

In [1]:
import numpy as np
import pandas as pd

# Keeping dataframes within viewing window
pd.set_option("display.width", None)
pd.set_option("display.max_columns", None)

# Dataset paths
#
# If paths do not have any datasets, run huggingface_hub.hf_hub_download 
# with repo_id="Zihan1004/FNSPID", filename="Stock_news/nasdaq_exteral_data.csv, and repo_type="dataset"
# A similar process can be used for the All_external.csv dataset
# Both these datasets have not been fully scraped
nasdaq_path = "news_data/direct_data/nasdaq_external_data.csv"

# Set nrows to None to read all data
# Otherwise adjust nrows to change amount of data read
nasdaq_df = pd.read_csv(
    nasdaq_path,
    usecols=["Date", "Stock_symbol", "Lsa_summary", "Luhn_summary", "Textrank_summary", "Lexrank_summary"],
    dtype=np.bytes_
)

nasdaq_df

Unnamed: 0,Date,Stock_symbol,Lsa_summary,Luhn_summary,Textrank_summary,Lexrank_summary
0,2023-12-16 23:00:00 UTC,A,Because the $125.00 strike represents an appro...,The current analytical data (including greeks ...,Below is a chart showing the trailing twelve m...,"At Stock Options Channel, our YieldBoost formu..."
1,2023-12-12 00:00:00 UTC,A,"Fintel reports that on December 13, 2023, Wolf...","T. Rowe Price Investment Management holds 10,1...",Agilent Technologies Declares $0.24 Dividend O...,The projected annual revenue for Agilent Techn...
2,2023-12-12 00:00:00 UTC,A,"In recent trading, shares of Agilent Technolog...","In recent trading, shares of Agilent Technolog...",When a stock reaches the target an analyst has...,When a stock reaches the target an analyst has...
3,2023-12-07 00:00:00 UTC,A,"Per a Grand View Research report, the global m...","Notably, Agilent enhanced the BioTek Cytation ...","Agilent Technologies, Inc. Price and Consensus...","Notably, Agilent enhanced the BioTek Cytation ..."
4,2023-12-07 00:00:00 UTC,A,ProShares UltraPro Short QQQ (SQQQ) is -0.15 a...,"As reported by Zacks, the current mean recomme...","The total Pre-Market volume is currently 39,23...",The NASDAQ 100 Pre-Market Indicator is up 70.2...
...,...,...,...,...,...,...
15549294,2011-04-19 00:00:00 UTC,,,,,
15549295,2011-04-19 00:00:00 UTC,,,,,
15549296,2011-04-19 00:00:00 UTC,,,,,
15549297,2011-04-19 00:00:00 UTC,,,,,


## Stock News Sentiment Analysis using finbert

### NASDAQ External Data

In [2]:
# Check for redundant data
nasdaq_df.drop_duplicates(inplace=True)

# Initial amount of data
initial = nasdaq_df.size
print(f"Initial number of elements: {initial}")

# Check for missing data
nasdaq_df.dropna(inplace=True)
print(f"Post drop number of elements: {nasdaq_df.size}")

# Print data loss
print(f"Overall number of elements dropped (including missing elements): {initial - nasdaq_df.size}")

Initial number of elements: 42899670
Post drop number of elements: 14861412
Overall number of elements dropped (including missing elements): 28038258


In [6]:
# Importing 🤗 pipeline
import torch
from transformers import pipeline

# Setting up 🤗 pipeline
finbert = pipeline(
    "sentiment-analysis",
    model="ProsusAI/finbert",
    tokenizer="ProsusAI/finbert",
    truncation=True,
    padding=True,
    device=0,
)

# Index tracking variables
lsa, luhn, textrank, lexrank = 0, 0, 0, 0

# Data functions for pipeline to score using
def nasdaq_lsa():
    for i in range(nasdaq_df.shape[0]):
        yield nasdaq_df.iloc[i]["Lsa_summary"]

def nasdaq_luhn():
    for i in range(nasdaq_df.shape[0]):
        yield nasdaq_df.iloc[i]["Luhn_summary"]

def nasdaq_text():
    for i in range(nasdaq_df.shape[0]):
        yield nasdaq_df.iloc[i]["Textrank_summary"]

def nasdaq_lex():
    for i in range(nasdaq_df.shape[0]):
        yield nasdaq_df.iloc[i]["Lexrank_summary"]

Device set to use cuda:0


In [7]:
# Progress bar
from tqdm.notebook import tqdm

# Columns added
for bot in ["Lsa", "Luhn", "Textrank", "Lexrank"]:
    nasdaq_df[bot + "_sentiment"] = ""
    nasdaq_df[bot + "_confidence"] = ""

# Indexes of columns
lsa_sloc = nasdaq_df.columns.get_loc("Lsa_sentiment")
lsa_cloc = nasdaq_df.columns.get_loc("Lsa_confidence")
luhn_sloc = nasdaq_df.columns.get_loc("Luhn_sentiment")
luhn_cloc = nasdaq_df.columns.get_loc("Luhn_confidence")
textrank_sloc = nasdaq_df.columns.get_loc("Textrank_sentiment")
textrank_cloc = nasdaq_df.columns.get_loc("Textrank_confidence")
lexrank_sloc = nasdaq_df.columns.get_loc("Lexrank_sentiment")
lexrank_cloc = nasdaq_df.columns.get_loc("Lexrank_confidence")

# Lsa sentiment
for sentiment in tqdm(finbert(nasdaq_lsa()), total=nasdaq_df.shape[0], desc="LSA Sentiment using FinBERT"):
    nasdaq_df.iloc[lsa, lsa_sloc] = sentiment["label"]
    nasdaq_df.iloc[lsa, lsa_cloc] = sentiment["score"]
    lsa += 1

# Luhn sentiment
for sentiment in tqdm(finbert(nasdaq_lsa()), total=nasdaq_df.shape[0], desc="Luhn Sentiment using FinBERT"):
    nasdaq_df.iloc[luhn, luhn_sloc] = sentiment["label"]
    nasdaq_df.iloc[luhn, luhn_cloc] = sentiment["score"]
    luhn += 1

# Textrank sentiment
for sentiment in tqdm(finbert(nasdaq_lsa()), total=nasdaq_df.shape[0], desc="TextRank Sentiment using FinBERT"):
    nasdaq_df.iloc[textrank, textrank_sloc] = sentiment["label"]
    nasdaq_df.iloc[textrank, textrank_cloc] = sentiment["score"]
    textrank += 1

# Lexrank sentiment
for sentiment in tqdm(finbert(nasdaq_lsa()), total=nasdaq_df.shape[0], desc="LexRank Sentiment using FinBERT"):
    nasdaq_df.iloc[lexrank, lexrank_sloc] = sentiment["label"]
    nasdaq_df.iloc[lexrank, lexrank_cloc] = sentiment["score"]
    lexrank += 1

LSA Sentiment using FinBERT:   0%|          | 0/2476902 [00:00<?, ?it/s]

Luhn Sentiment using FinBERT:   0%|          | 0/2476902 [00:00<?, ?it/s]

TextRank Sentiment using FinBERT:   0%|          | 0/2476902 [00:00<?, ?it/s]

LexRank Sentiment using FinBERT:   0%|          | 0/2476902 [00:00<?, ?it/s]

In [None]:
# Drop summaries now that sentiments have been assigned
nasdaq_df.drop(columns=["Lsa_summary", "Luhn_summary", "Textrank_summary", "Lexrank_summary"], inplace=True)
nasdaq_df

## Export to CSV

In [None]:
import os

# Makes directory SentimentDataSplit in directory NewsData if it does not already exist
os.makedirs("news_data/sentiment_data", exist_ok=True)

# To processed csv file
nasdaq_df.to_csv("news_data/sentiment_data/nasdaq.csv", index=False)

In [8]:
nasdaq_df

Unnamed: 0,Date,Stock_symbol,Lsa_summary,Luhn_summary,Textrank_summary,Lexrank_summary,Lsa_sentiment,Lsa_confidence,Luhn_sentiment,Luhn_confidence,Textrank_sentiment,Textrank_confidence,Lexrank_sentiment,Lexrank_confidence
0,2023-12-16 23:00:00 UTC,A,Because the $125.00 strike represents an appro...,The current analytical data (including greeks ...,Below is a chart showing the trailing twelve m...,"At Stock Options Channel, our YieldBoost formu...",neutral,0.868268,neutral,0.868268,neutral,0.868268,neutral,0.868268
1,2023-12-12 00:00:00 UTC,A,"Fintel reports that on December 13, 2023, Wolf...","T. Rowe Price Investment Management holds 10,1...",Agilent Technologies Declares $0.24 Dividend O...,The projected annual revenue for Agilent Techn...,neutral,0.744886,neutral,0.744886,neutral,0.744886,neutral,0.744886
2,2023-12-12 00:00:00 UTC,A,"In recent trading, shares of Agilent Technolog...","In recent trading, shares of Agilent Technolog...",When a stock reaches the target an analyst has...,When a stock reaches the target an analyst has...,neutral,0.907343,neutral,0.907343,neutral,0.907343,neutral,0.907343
3,2023-12-07 00:00:00 UTC,A,"Per a Grand View Research report, the global m...","Notably, Agilent enhanced the BioTek Cytation ...","Agilent Technologies, Inc. Price and Consensus...","Notably, Agilent enhanced the BioTek Cytation ...",positive,0.737881,positive,0.737881,positive,0.737881,positive,0.737881
4,2023-12-07 00:00:00 UTC,A,ProShares UltraPro Short QQQ (SQQQ) is -0.15 a...,"As reported by Zacks, the current mean recomme...","The total Pre-Market volume is currently 39,23...",The NASDAQ 100 Pre-Market Indicator is up 70.2...,negative,0.651382,negative,0.651382,negative,0.651382,negative,0.651382
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2491780,2013-03-14 00:00:00 UTC,ZYXI,"By Brian Marckx, CFA Q4 2012 Financial Results...","By Brian Marckx, CFA Q4 2012 Financial Results...","By Brian Marckx, CFA Q4 2012 Financial Results...","By Brian Marckx, CFA Q4 2012 Financial Results...",negative,0.556462,negative,0.556462,negative,0.556462,negative,0.556462
2491781,2012-11-07 00:00:00 UTC,ZYXI,Maintaining Outperform rating and $1.35/share ...,Maintaining Outperform rating and $1.35/share ...,Maintaining Outperform rating and $1.35/share ...,Maintaining Outperform rating and $1.35/share ...,neutral,0.916666,neutral,0.916666,neutral,0.916666,neutral,0.916666
2491782,2012-08-08 00:00:00 UTC,ZYXI,Q2 2012 Financial Results: Strong Revenue and ...,Q2 2012 Financial Results: Strong Revenue and ...,Q2 2012 Financial Results: Strong Revenue and ...,Q2 2012 Financial Results: Strong Revenue and ...,neutral,0.899106,neutral,0.899106,neutral,0.899106,neutral,0.899106
2491783,2012-05-14 00:00:00 UTC,ZYXI,"35% Revenue Growth in Q1 Brian Marckx, CFA Rev...","35% Revenue Growth in Q1 Brian Marckx, CFA Rev...","35% Revenue Growth in Q1 Brian Marckx, CFA Rev...","35% Revenue Growth in Q1 Brian Marckx, CFA Rev...",neutral,0.69937,neutral,0.69937,neutral,0.69937,neutral,0.69937
