# Obtain Sentiments for a Single Stock

## Get Data

In [2]:
import numpy as np
import pandas as pd
import os
# Progress bar lib
from tqdm.notebook import tqdm

pd.set_option("display.width", None)
pd.set_option("display.max_columns", None)

# Dataset paths
#
# If paths do not have any datasets, run huggingface_hub.hf_hub_download 
# with repo_id="Zihan1004/FNSPID", filename="Stock_news/nasdaq_exteral_data.csv, and repo_type="dataset"
# A similar process can be used for the All_external.csv dataset
# Both these datasets have not been fully scraped
nasdaq_path = "../../news_data/direct_data/nasdaq_external_data.csv"

# Stock ticker
tick = "NVDA"

# List of dataframes
nasdaq_df = []

# Chunk calculation
CHUNK_SIZE = 10000
NCHUNKS = 1555

# Chunk progress
chunk_number = 1

# Boolean tracking whether currently on ticker
# hit_tick = False
# passed_tick = False

# Progress bar
bar = tqdm(total=NCHUNKS, desc="Chunks Processed")

# ROWS may be adjusted if scraping occurs or original data is changed

# Read csv in chunks for lower memory usage
for chunk in pd.read_csv(
    nasdaq_path,
    usecols=["Date", "Stock_symbol", "Lsa_summary", "Luhn_summary", "Textrank_summary", "Lexrank_summary"],
    dtype=np.bytes_,
    chunksize=CHUNK_SIZE,
    # skiprows=lambda x: passed_tick
):

    # Start index at 0
    chunk = chunk.reset_index(drop=True)
    
    # Ticker column index
    tick_col = chunk.columns.get_loc("Stock_symbol")

    # Add function0--maybe less efficient but does a full scan:
    for row in range(chunk.shape[0]):
        if chunk.iloc[row, tick_col] == tick:
            nasdaq_df.append(chunk[chunk["Stock_symbol"] == tick])
            break
    # Add chunks of ticker news
    # if hit_tick and not passed_tick:
    #     passed_tick = chunk.iloc[chunk.shape[0] - 1, tick_col] != tick
    #     nasdaq_df.append(chunk) if not passed_tick else nasdaq_df.append(chunk["Stock_symbol" == tick])
    # else if not passed_tick:
    #     for row in range(chunk.shape[0]):
    #         if chunk.iloc[row, tick_col] == tick:
    #             nasdaq_df.append(chunk[chunk["Stock_symbol"] == tick])
    #             break

    bar.update(1)

# Make the dataframe
nasdaq_df = pd.concat(nasdaq_df)
nasdaq_df

Chunks Processed:   0%|          | 0/247 [00:00<?, ?it/s]

Unnamed: 0,Date,Stock_symbol,Lsa_summary,Luhn_summary,Textrank_summary,Lexrank_summary
4001,2023-12-16 20:00:00 UTC,NVDA,"Stock splits, meanwhile, continue to get atten...","Stock splits, meanwhile, continue to get atten...","Stock splits, meanwhile, continue to get atten...","Stock splits, meanwhile, continue to get atten..."
4002,2023-12-16 20:00:00 UTC,NVDA,"Recently, though, with everything having to be...","Recently, though, with everything having to be...","Recently, though, with everything having to be...","Recently, though, with everything having to be..."
4003,2023-12-16 20:00:00 UTC,NVDA,"In today's video, I discuss recent updates aff...","In today's video, I discuss recent updates aff...","In today's video, I discuss recent updates aff...","In today's video, I discuss recent updates aff..."
4004,2023-12-16 20:00:00 UTC,NVDA,"SMCI, with a market cap of $16 billion, is sma...","SMCI, with a market cap of $16 billion, is sma...","SMCI, with a market cap of $16 billion, is sma...","SMCI, with a market cap of $16 billion, is sma..."
4005,2023-12-16 19:00:00 UTC,NVDA,Alphabet is “expected to grow as fast as Micro...,Those include the Invesco QQQ Trust (QQQ) and ...,It’s the time of year when investors are treat...,QQQ and QQQM may not deliver comparable perfor...
...,...,...,...,...,...,...
9197,2011-03-08 00:00:00 UTC,NVDA,,,,
9198,2011-03-08 00:00:00 UTC,NVDA,,,,
9199,2011-03-07 00:00:00 UTC,NVDA,,,,
9200,2011-03-07 00:00:00 UTC,NVDA,,,,


In [None]:
# Check for redundant data
nasdaq_df.drop_duplicates(inplace=True)

# Initial amount of data
initial = nasdaq_df.size
print(f"Initial number of elements: {initial}")

# Check for missing data
nasdaq_df.dropna(inplace=True)
print(f"Post drop number of elements: {nasdaq_df.size}")

# Print data loss
print(f"Overall number of elements dropped (including missing elements): {initial - nasdaq_df.size}")

In [None]:
# Importing 🤗 pipeline
import torch
from transformers import pipeline

# Setting up 🤗 pipeline
finbert = pipeline(
    "sentiment-analysis",
    model="ProsusAI/finbert",
    tokenizer="ProsusAI/finbert",
    truncation=True,
    padding=True,
    device=0,
)

# Index tracking variables
lsa, luhn, textrank, lexrank = 0, 0, 0, 0

# Data functions for pipeline to score using
def nasdaq_lsa():
    for i in range(nasdaq_df.shape[0]):
        yield nasdaq_df.iloc[i]["Lsa_summary"]

def nasdaq_luhn():
    for i in range(nasdaq_df.shape[0]):
        yield nasdaq_df.iloc[i]["Luhn_summary"]

def nasdaq_text():
    for i in range(nasdaq_df.shape[0]):
        yield nasdaq_df.iloc[i]["Textrank_summary"]

def nasdaq_lex():
    for i in range(nasdaq_df.shape[0]):
        yield nasdaq_df.iloc[i]["Lexrank_summary"]

In [None]:
# Progress bar
from tqdm.notebook import tqdm

# Columns added
for bot in ["Lsa", "Luhn", "Textrank", "Lexrank"]:
    nasdaq_df[bot + "_sentiment"] = ""
    nasdaq_df[bot + "_confidence"] = ""

# Indexes of columns
lsa_sloc = nasdaq_df.columns.get_loc("Lsa_sentiment")
lsa_cloc = nasdaq_df.columns.get_loc("Lsa_confidence")
luhn_sloc = nasdaq_df.columns.get_loc("Luhn_sentiment")
luhn_cloc = nasdaq_df.columns.get_loc("Luhn_confidence")
textrank_sloc = nasdaq_df.columns.get_loc("Textrank_sentiment")
textrank_cloc = nasdaq_df.columns.get_loc("Textrank_confidence")
lexrank_sloc = nasdaq_df.columns.get_loc("Lexrank_sentiment")
lexrank_cloc = nasdaq_df.columns.get_loc("Lexrank_confidence")

# Lsa sentiment
for sentiment in tqdm(finbert(nasdaq_lsa()), total=nasdaq_df.shape[0], desc="LSA Sentiment using FinBERT"):
    nasdaq_df.iloc[lsa, lsa_sloc] = sentiment["label"]
    nasdaq_df.iloc[lsa, lsa_cloc] = sentiment["score"]
    lsa += 1

# Luhn sentiment
for sentiment in tqdm(finbert(nasdaq_lsa()), total=nasdaq_df.shape[0], desc="Luhn Sentiment using FinBERT"):
    nasdaq_df.iloc[luhn, luhn_sloc] = sentiment["label"]
    nasdaq_df.iloc[luhn, luhn_cloc] = sentiment["score"]
    luhn += 1

# Textrank sentiment
for sentiment in tqdm(finbert(nasdaq_lsa()), total=nasdaq_df.shape[0], desc="TextRank Sentiment using FinBERT"):
    nasdaq_df.iloc[textrank, textrank_sloc] = sentiment["label"]
    nasdaq_df.iloc[textrank, textrank_cloc] = sentiment["score"]
    textrank += 1

# Lexrank sentiment
for sentiment in tqdm(finbert(nasdaq_lsa()), total=nasdaq_df.shape[0], desc="LexRank Sentiment using FinBERT"):
    nasdaq_df.iloc[lexrank, lexrank_sloc] = sentiment["label"]
    nasdaq_df.iloc[lexrank, lexrank_cloc] = sentiment["score"]
    lexrank += 1

In [None]:
# Drop summaries now that sentiments have been assigned
nasdaq_df.drop(columns=["Lsa_summary", "Luhn_summary", "Textrank_summary", "Lexrank_summary"], inplace=True)
nasdaq_df.reset_index(drop=True, inplace=True)
nasdaq_df

In [None]:
# Export dataframe to csv
os.makedirs(f"../{tick.lower()}_data", exist_ok=True)
nasdaq_df.to_csv(f"../{tick.lower()}_data/sentiments.csv", index=False)

print("✅ Exported to csv at " + f"single_stock/{tick.lower()}_data/sentiments.csv")