# Sentiment Analysis Techniques

In [2]:
import os
import sys


# This adds the 'src' directory to the Python path.
src_rel_path = "../src"
src_abs_path = os.path.abspath(src_rel_path)

if src_abs_path not in sys.path:
    sys.path.append(src_abs_path)
    print(f"Added '{src_abs_path}' to sys.path")
else:
    print(f"'{src_abs_path}' is already in sys.path")

import database as db
import pandas as pd
from tqdm import tqdm

Added '/Users/shubxam/Code/Nifty-500-Live-Sentiment-Analysis/src' to sys.path


In [3]:
conn = db.DatabaseManager()
articles_df = conn.get_articles(has_sentiment=False, n=500)

In [4]:
headlines = articles_df['headline'].to_list()
headlines

['Dividend stocks: HDFC Bank, Tata Elxsi, ICICI Bank, Infosys, others to declare dividend next week',
 'Shobana Kamineni bets on digital growth, Apollo 24/7 to turn profitable by fourth quarter of FY26',
 'Q4 results 2025: HDFC Bank, Yes Bank, Infosys, Wipro, others to declare earnings THIS week | Full list here',
 'HDFC Bank Q4 Results 2025 Date and Time: India’s Largest Lender to Announce Earnings Next Week',
 'Infosys (INFY) Expected to Announce Quarterly Earnings on Thursday',
 "India's non-coking coal imports drop 8% y-o-y in FY'25. Will the trend sustain?",
 "Soft prices, rising input costs likely to make a dent in cement companies' profits",
 'Wipro (WIT) Projected to Post Earnings on Wednesday',
 'Stock Market News, April 11, 2025: Stocks Rise Despite Recession Fears; Treasury Yield Surges',
 'Vodafone Idea, Suzlon Energy, Kotak Mahindra Bank & Bajaj Finance: What Jay Thakkar of ICICI Sec says on these 4 stocks',
 'Insider trading erodes investor confidence: Sanjiv Mehta',
 "Ma

## External APIs

### HuggingFace Inference API


In [1]:
from transformers.models.bert import BertTokenizer, BertForSequenceClassification
from transformers.pipelines import pipeline

#### kdave/FineTuned_Finbert

In [5]:
finbert = BertForSequenceClassification.from_pretrained(
    'kdave/FineTuned_Finbert',
    num_labels=3,
    subfolder='finbert',  # Specify the subdirectory containing the model files
    use_safetensors=True  # Use safe tensors
)

tokenizer = BertTokenizer.from_pretrained(
    'kdave/FineTuned_Finbert',
    subfolder='finbert'  # Also need this for the tokenizer
)

In [6]:
# set top_k=1 to get the most likely label or top_k=None to get all labels
# device=-1 means CPU
nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer, device=-1, top_k=1, framework="pt")

Device set to use cpu


In [7]:
nlp_res = nlp(headlines, batch_size=512)

#### yiyanghkust/finbert-tone
Much Superior than kdave/FineTuned_Finbert

In [5]:
finbert_1 = BertForSequenceClassification.from_pretrained(
    'yiyanghkust/finbert-tone',
    num_labels=3,
    use_safetensors=True  # Use safe tensors
    
)

tokenizer_1 = BertTokenizer.from_pretrained(
    'yiyanghkust/finbert-tone'
)

In [6]:
# set top_k=1 to get the most likely label or top_k=None to get all labels
# device=-1 means CPU
nlp_1 = pipeline("sentiment-analysis", model=finbert_1, tokenizer=tokenizer_1, device=-1, top_k=None, framework="pt")

Device set to use cpu


In [7]:
# batch_size = 8 runs fastest on Github Runners
nlp_1_res = nlp_1(headlines, batch_size=8)

In [8]:
nlp_1_res

[[{'label': 'Neutral', 'score': 0.9999803304672241},
  {'label': 'Negative', 'score': 1.919074020406697e-05},
  {'label': 'Positive', 'score': 4.890857780992519e-07}],
 [{'label': 'Positive', 'score': 0.8673157691955566},
  {'label': 'Neutral', 'score': 0.07077459990978241},
  {'label': 'Negative', 'score': 0.061909619718790054}],
 [{'label': 'Neutral', 'score': 0.9999979734420776},
  {'label': 'Negative', 'score': 1.630397036933573e-06},
  {'label': 'Positive', 'score': 4.165348173046368e-07}],
 [{'label': 'Neutral', 'score': 0.999998927116394},
  {'label': 'Positive', 'score': 7.062882332320441e-07},
  {'label': 'Negative', 'score': 3.0752127599953383e-07}],
 [{'label': 'Neutral', 'score': 0.9999947547912598},
  {'label': 'Negative', 'score': 5.010918812331511e-06},
  {'label': 'Positive', 'score': 2.976799464704527e-07}],
 [{'label': 'Negative', 'score': 0.999468982219696},
  {'label': 'Positive', 'score': 0.0004762684111483395},
  {'label': 'Neutral', 'score': 5.471041367854923e-05

In [9]:
flattened_data: list[dict[str,float]] = []

for news_item_sentiment_list in tqdm(nlp_1_res, desc="Processing Sentiment"):
    news_item_sentiment_dict = {}
    for individual_label_dict in news_item_sentiment_list:
        news_item_sentiment_dict[individual_label_dict['label']] = individual_label_dict['score']
    flattened_data.append(news_item_sentiment_dict)

Processing Sentiment: 100%|██████████| 500/500 [00:00<00:00, 663865.78it/s]
Processing Sentiment: 100%|██████████| 500/500 [00:00<00:00, 663865.78it/s]


In [10]:
df = pd.DataFrame(flattened_data)
df.head()

Unnamed: 0,Neutral,Negative,Positive
0,0.99998,1.919074e-05,4.890858e-07
1,0.070775,0.06190962,0.8673158
2,0.999998,1.630397e-06,4.165348e-07
3,0.999999,3.075213e-07,7.062882e-07
4,0.999995,5.010919e-06,2.976799e-07


In [12]:
df.loc[:, "Positive"].where(df["Positive"] > df["Negative"], -df["Negative"]).astype(float).round(4)

0     -0.0000
1      0.8673
2     -0.0000
3      0.0000
4     -0.0000
        ...  
495    0.0004
496    0.0000
497    0.0004
498   -0.0002
499    0.0026
Name: Positive, Length: 500, dtype: float64

In [13]:
from utils import analyse_sentiment

analyse_sentiment(headlines)



#### comparison

In [28]:
flattened_data_0 = [item[0] for item in nlp_res]
sentiment_df_0 = pd.DataFrame(flattened_data_0)
flattened_data_1 = [item[0] for item in nlp_1_res]
sentiment_df_1 = pd.DataFrame(flattened_data_1)

sentiment_df = pd.merge(sentiment_df_0, sentiment_df_1, left_index=True, right_index=True, suffixes=('_finbert', '_finbert_1'))
sentiment_df['headline'] = headlines[:len(sentiment_df)]

In [31]:
sentiment_df.to_csv('finbert_sentiment.csv', index=False)

In [25]:
sentiment_df.rename(
    columns={
        'label': 'label_2',
        'score': 'score_2'
    },
    inplace=True
)