# Stock Data Sentiment Analysis Shared
### This will enable splitting up the dataset for faster processing.

## Get Data

In [26]:
import numpy as np
import pandas as pd

pd.set_option("display.width", None)
pd.set_option("display.max_columns", None)

# Dataset paths
#
# If paths do not have any datasets, run huggingface_hub.hf_hub_download 
# with repo_id="Zihan1004/FNSPID", filename="Stock_news/nasdaq_exteral_data.csv, and repo_type="dataset"
# A similar process can be used for the All_external.csv dataset
# Both these datasets have not been fully scraped
nasdaq_path = "../news_data/splits/nasdaq_external_data_1.csv"

# Set nrows to None to read all data
# Otherwise adjust nrows to change amount of data read
nasdaq_df = pd.read_csv(
    nasdaq_path,
    usecols=["Date", "Stock_symbol", "Lsa_summary", "Luhn_summary", "Textrank_summary", "Lexrank_summary"],
    dtype=np.bytes_
)

In [27]:
nasdaq_df.head()

Unnamed: 0.1,Unnamed: 0,Date,Article_title,Stock_symbol,Url,Publisher,Author,Article,Lsa_summary,Luhn_summary,Textrank_summary,Lexrank_summary
0,0.0,2023-12-16 23:00:00 UTC,Interesting A Put And Call Options For August ...,A,https://www.nasdaq.com/articles/interesting-a-...,,,"Investors in Agilent Technologies, Inc. (Symbo...",Because the $125.00 strike represents an appro...,The current analytical data (including greeks ...,Below is a chart showing the trailing twelve m...,"At Stock Options Channel, our YieldBoost formu..."
1,1.0,2023-12-12 00:00:00 UTC,Wolfe Research Initiates Coverage of Agilent T...,A,https://www.nasdaq.com/articles/wolfe-research...,,,"Fintel reports that on December 13, 2023, Wolf...","Fintel reports that on December 13, 2023, Wolf...","T. Rowe Price Investment Management holds 10,1...",Agilent Technologies Declares $0.24 Dividend O...,The projected annual revenue for Agilent Techn...
2,2.0,2023-12-12 00:00:00 UTC,Agilent Technologies Reaches Analyst Target Price,A,https://www.nasdaq.com/articles/agilent-techno...,,,"In recent trading, shares of Agilent Technolog...","In recent trading, shares of Agilent Technolog...","In recent trading, shares of Agilent Technolog...",When a stock reaches the target an analyst has...,When a stock reaches the target an analyst has...
3,3.0,2023-12-07 00:00:00 UTC,Agilent (A) Enhances BioTek Cytation C10 With ...,A,https://www.nasdaq.com/articles/agilent-a-enha...,,,Agilent Technologies A is enhancing its BioTek...,"Per a Grand View Research report, the global m...","Notably, Agilent enhanced the BioTek Cytation ...","Agilent Technologies, Inc. Price and Consensus...","Notably, Agilent enhanced the BioTek Cytation ..."
4,4.0,2023-12-07 00:00:00 UTC,"Pre-Market Most Active for Dec 7, 2023 : SQQQ,...",A,https://www.nasdaq.com/articles/pre-market-mos...,,,The NASDAQ 100 Pre-Market Indicator is up 70.2...,ProShares UltraPro Short QQQ (SQQQ) is -0.15 a...,"As reported by Zacks, the current mean recomme...","The total Pre-Market volume is currently 39,23...",The NASDAQ 100 Pre-Market Indicator is up 70.2...


## Stock News Sentiment Analysis using finbert

### NASDAQ External Data

In [28]:
# Unnamed, Url, Publisher, and Author columns do not influence the stock price
# The actual articles themselves are usually too large to input into finbert, so the four summaries will be used
# Below line already done in split.ipynb
# nasdaq_df.drop(columns=["Unnamed: 0", "Url", "Publisher", "Author", "Article"], inplace=True)
nasdaq_df

Unnamed: 0,Date,Article_title,Stock_symbol,Lsa_summary,Luhn_summary,Textrank_summary,Lexrank_summary
0,2023-12-16 23:00:00 UTC,Interesting A Put And Call Options For August ...,A,Because the $125.00 strike represents an appro...,The current analytical data (including greeks ...,Below is a chart showing the trailing twelve m...,"At Stock Options Channel, our YieldBoost formu..."
1,2023-12-12 00:00:00 UTC,Wolfe Research Initiates Coverage of Agilent T...,A,"Fintel reports that on December 13, 2023, Wolf...","T. Rowe Price Investment Management holds 10,1...",Agilent Technologies Declares $0.24 Dividend O...,The projected annual revenue for Agilent Techn...
2,2023-12-12 00:00:00 UTC,Agilent Technologies Reaches Analyst Target Price,A,"In recent trading, shares of Agilent Technolog...","In recent trading, shares of Agilent Technolog...",When a stock reaches the target an analyst has...,When a stock reaches the target an analyst has...
3,2023-12-07 00:00:00 UTC,Agilent (A) Enhances BioTek Cytation C10 With ...,A,"Per a Grand View Research report, the global m...","Notably, Agilent enhanced the BioTek Cytation ...","Agilent Technologies, Inc. Price and Consensus...","Notably, Agilent enhanced the BioTek Cytation ..."
4,2023-12-07 00:00:00 UTC,"Pre-Market Most Active for Dec 7, 2023 : SQQQ,...",A,ProShares UltraPro Short QQQ (SQQQ) is -0.15 a...,"As reported by Zacks, the current mean recomme...","The total Pre-Market volume is currently 39,23...",The NASDAQ 100 Pre-Market Indicator is up 70.2...
...,...,...,...,...,...,...,...
95,2023-05-24 00:00:00 UTC,B of A Securities Maintains Agilent Technologi...,A,A payout ratio greater than one means the comp...,"In it's prior filing, the firm reported owning...",Agilent Technologies Declares $0.22 Dividend O...,The projected annual revenue for Agilent Techn...
96,2023-05-24 00:00:00 UTC,Wells Fargo Maintains Agilent Technologies (A)...,A,A payout ratio greater than one means the comp...,"In it's prior filing, the firm reported owning...",Agilent Technologies Declares $0.22 Dividend O...,The projected annual revenue for Agilent Techn...
97,2023-05-24 00:00:00 UTC,Baird Maintains Agilent Technologies (A) Outpe...,A,A payout ratio greater than one means the comp...,"In it's prior filing, the firm reported owning...",Agilent Technologies Declares $0.22 Dividend O...,The projected annual revenue for Agilent Techn...
98,2023-05-24 00:00:00 UTC,Why Agilent Technologies Stock Is Sinking Today,A,"It posted net income of $302 million, or $1.02...",The company expects fiscal Q3 non-GAAP earning...,Agilent projects fiscal Q3 revenue of between ...,The company's non-GAAP earnings were $377 mill...


In [29]:
# Below lines already done in split.ipynb
# Check for redundant data
# nasdaq_df.drop_duplicates(inplace=True)

# # Initial amount of data
# initial = nasdaq_df.size
# print(f"Initial number of elements: {initial}")

# # Check for missing data
# nasdaq_df.dropna(inplace=True)
# print(f"Post drop number of elements: {nasdaq_df.size}")

# # Print data loss
# print(f"Overall number of elements dropped (including missing elements): {initial - nasdaq_df.size}")

Initial number of elements: 700
Post drop number of elements: 700
Overall number of elements dropped (including missing elements): 0


In [37]:
# Importing 🤗 pipeline
import torch
from transformers import pipeline

# Setting up 🤗 pipeline
finbert = pipeline(
    "sentiment-analysis",
    model="ProsusAI/finbert",
    tokenizer="ProsusAI/finbert",
    truncation=True,
    padding=True,
    device=0,
)

# Index tracking variables
lsa, luhn, textrank, lexrank = 0, 0, 0, 0

# Data functions for pipeline to score using
def nasdaq_lsa():
    for i in range(nasdaq_df.shape[0]):
        yield nasdaq_df.iloc[i]["Lsa_summary"]

def nasdaq_luhn():
    for i in range(nasdaq_df.shape[0]):
        yield nasdaq_df.iloc[i]["Luhn_summary"]

def nasdaq_text():
    for i in range(nasdaq_df.shape[0]):
        yield nasdaq_df.iloc[i]["Textrank_summary"]

def nasdaq_lex():
    for i in range(nasdaq_df.shape[0]):
        yield nasdaq_df.iloc[i]["Lexrank_summary"]

Device set to use cuda:0


In [31]:
# For showing progress
from tqdm.notebook import tqdm

# Columns added
for bot in ["Lsa", "Luhn", "Textrank", "Lexrank"]:
    nasdaq_df[bot + "_sentiment"] = ""
    nasdaq_df[bot + "_confidence"] = ""

# Indexes of columns
lsa_sloc = nasdaq_df.columns.get_loc("Lsa_sentiment")
lsa_cloc = nasdaq_df.columns.get_loc("Lsa_confidence")
luhn_sloc = nasdaq_df.columns.get_loc("Luhn_sentiment")
luhn_cloc = nasdaq_df.columns.get_loc("Luhn_confidence")
textrank_sloc = nasdaq_df.columns.get_loc("Textrank_sentiment")
textrank_cloc = nasdaq_df.columns.get_loc("Textrank_confidence")
lexrank_sloc = nasdaq_df.columns.get_loc("Lexrank_sentiment")
lexrank_cloc = nasdaq_df.columns.get_loc("Lexrank_confidence")

# Lsa sentiment
for sentiment in tqdm(finbert(nasdaq_lsa()), total=nasdaq_df.shape[0], desc="LSA Sentiment using FinBERT"):
    nasdaq_df.iloc[lsa, lsa_sloc] = sentiment["label"]
    nasdaq_df.iloc[lsa, lsa_cloc] = sentiment["score"]
    lsa += 1

# Luhn sentiment
for sentiment in tqdm(finbert(nasdaq_lsa()), total=nasdaq_df.shape[0], desc="Luhn Sentiment using FinBERT"):
    nasdaq_df.iloc[luhn, luhn_sloc] = sentiment["label"]
    nasdaq_df.iloc[luhn, luhn_cloc] = sentiment["score"]
    luhn += 1
using
# Textrank sentiment
for sentiment in tqdm(finbert(nasdaq_lsa()), total=nasdaq_df.shape[0], desc="TextRank Sentiment using FinBERT"):
    nasdaq_df.iloc[textrank, textrank_sloc] = sentiment["label"]
    nasdaq_df.iloc[textrank, textrank_cloc] = sentiment["score"]
    textrank += 1

# Lexrank sentiment
for sentiment in tqdm(finbert(nasdaq_lsa()), total=nasdaq_df.shape[0], desc="LexRank Sentiment using FinBERT"):
    nasdaq_df.iloc[lexrank, lexrank_sloc] = sentiment["label"]
    nasdaq_df.iloc[lexrank, lexrank_cloc] = sentiment["score"]
    lexrank += 1

100it [00:01, 95.90it/s]
100it [00:01, 94.39it/s]
100it [00:01, 95.92it/s]
100it [00:01, 96.52it/s]


In [32]:
# Drop summaries now that sentiments have been assigned
nasdaq_df.drop(columns=["Lsa_summary", "Luhn_summary", "Textrank_summary", "Lexrank_summary"], inplace=True)
nasdaq_df

Unnamed: 0,Date,Article_title,Stock_symbol,Lsa_sentiment,Lsa_confidence,Luhn_sentiment,Luhn_confidence,Textrank_sentiment,Textrank_confidence,Lexrank_sentiment,Lexrank_confidence
0,2023-12-16 23:00:00 UTC,Interesting A Put And Call Options For August ...,A,neutral,0.868268,neutral,0.868268,neutral,0.868268,neutral,0.868268
1,2023-12-12 00:00:00 UTC,Wolfe Research Initiates Coverage of Agilent T...,A,neutral,0.744886,neutral,0.744886,neutral,0.744886,neutral,0.744886
2,2023-12-12 00:00:00 UTC,Agilent Technologies Reaches Analyst Target Price,A,neutral,0.907343,neutral,0.907343,neutral,0.907343,neutral,0.907343
3,2023-12-07 00:00:00 UTC,Agilent (A) Enhances BioTek Cytation C10 With ...,A,positive,0.737881,positive,0.737881,positive,0.737881,positive,0.737881
4,2023-12-07 00:00:00 UTC,"Pre-Market Most Active for Dec 7, 2023 : SQQQ,...",A,negative,0.651382,negative,0.651382,negative,0.651382,negative,0.651382
...,...,...,...,...,...,...,...,...,...,...,...
95,2023-05-24 00:00:00 UTC,B of A Securities Maintains Agilent Technologi...,A,neutral,0.937003,neutral,0.937003,neutral,0.937003,neutral,0.937003
96,2023-05-24 00:00:00 UTC,Wells Fargo Maintains Agilent Technologies (A)...,A,neutral,0.937003,neutral,0.937003,neutral,0.937003,neutral,0.937003
97,2023-05-24 00:00:00 UTC,Baird Maintains Agilent Technologies (A) Outpe...,A,neutral,0.937003,neutral,0.937003,neutral,0.937003,neutral,0.937003
98,2023-05-24 00:00:00 UTC,Why Agilent Technologies Stock Is Sinking Today,A,negative,0.48172,negative,0.48172,negative,0.48172,negative,0.48172


## Export to CSV

In [33]:
import os

# Makes directory SentimentDataSplit in directory NewsData if it does not already exist
os.makedirs("news_data/sentiment_data_split", exist_ok=True)

# Can change the number to change the split
nasdaq_df.to_csv("../news_data/sentiment_data_split/nasdaq_data_1.csv", index=False)