# Sentiment Analysis Models

In [1]:
import os
import sys

# This adds the 'src' directory to the Python path.
src_rel_path = '../src'
src_abs_path = os.path.abspath(src_rel_path)

if src_abs_path not in sys.path:
    sys.path.append(src_abs_path)
    print(f"Added '{src_abs_path}' to sys.path")
else:
    print(f"'{src_abs_path}' is already in sys.path")

import pandas as pd
from tqdm import tqdm

import database as db

Added '/Users/shubxam/Code/Nifty-500-Live-Sentiment-Analysis/src' to sys.path


In [2]:
conn = db.DatabaseManager()
articles_df = conn.get_articles(has_sentiment=False, n=500)

In [3]:
headlines = articles_df['headline'].to_list()
headlines

['ICICI Bank Completes Redemption of Notes',
 'Mahindra & Mahindra selects partner for supply chain efficiencies',
 'Kotak Mahindra Bank Ltd (BOM:500247) Q2 2025 Earnings Call Highlights: Strong Profit Growth ...',
 'StanChart sells India personal loan business to Kotak Mahindra Bank',
 'Britannia, Groupe Bel India venture starts cheese production',
 'Indian shares post worst week in over 2 years as Mideast fears, foreign outflows weigh',
 "India's coal-fired monthly power output slips consecutively for the first time since pandemic",
 'HCL Technologies Ltd (BOM:532281) Q2 2025 Earnings Call Highlights: Strong Revenue Growth and ...',
 'Exploring Kaynes Technology India And 2 Other High Growth Tech Stocks',
 'High Growth Tech Stocks in India to Watch This September 2024',
 '3 Indian Dividend Stocks Yielding Up To 7.9%',
 'Top 3 Indian Dividend Stocks In September 2024',
 '3 High-Yield Dividend Stocks On Indian Exchange Yielding Up To 3.7%',
 'Sun Pharma and Moebius gain FDA fast track 

In [4]:
headlines.__len__()

111

## External APIs

### HuggingFace Inference API


In [5]:
from transformers.models.bert import BertForSequenceClassification, BertTokenizer
from transformers.pipelines import pipeline

#### kdave/FineTuned_Finbert

In [5]:
finbert = BertForSequenceClassification.from_pretrained(
    'kdave/FineTuned_Finbert',
    num_labels=3,
    subfolder='finbert',  # Specify the subdirectory containing the model files
    use_safetensors=True,  # Use safe tensors
)

tokenizer = BertTokenizer.from_pretrained(
    'kdave/FineTuned_Finbert',
    subfolder='finbert',  # Also need this for the tokenizer
)

In [6]:
# set top_k=1 to get the most likely label or top_k=None to get all labels
# device=-1 means CPU
nlp = pipeline(
    'sentiment-analysis',
    model=finbert,
    tokenizer=tokenizer,
    device=-1,
    top_k=1,
    framework='pt',
)

Device set to use cpu


In [7]:
nlp_res = nlp(headlines, batch_size=512)

#### yiyanghkust/finbert-tone
Much Superior than kdave/FineTuned_Finbert

In [6]:
finbert = BertForSequenceClassification.from_pretrained(
    'yiyanghkust/finbert-tone',
    num_labels=3,
    use_safetensors=True,  # Use safe tensors
)

tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

In [8]:
# set top_k=1 to get the most likely label or top_k=None to get all labels
# device=-1 means CPU
nlp_finbert = pipeline(
    'sentiment-analysis',
    model=finbert,
    tokenizer=tokenizer,
    device=-1,
    top_k=None,
    framework='pt',
)

Device set to use cpu


In [9]:
# batch_size = 8 runs fastest on Github Runners
finbert_res = nlp_finbert(headlines, batch_size=8)

In [10]:
flattened_data_finbert: list[dict[str, float]] = []

for news_item_sentiment_list in tqdm(finbert_res, desc='Processing Sentiment'):
    news_item_sentiment_dict = {}
    for individual_label_dict in news_item_sentiment_list:
        news_item_sentiment_dict[individual_label_dict['label']] = (
            individual_label_dict['score']
        )
    flattened_data_finbert.append(news_item_sentiment_dict)

Processing Sentiment: 100%|██████████| 111/111 [00:00<00:00, 331600.96it/s]
Processing Sentiment: 100%|██████████| 111/111 [00:00<00:00, 331600.96it/s]


In [12]:
df_finbert = pd.DataFrame(flattened_data_finbert)
df_finbert['compound'] = (
    df_finbert.loc[:, 'Positive']
    .where(df_finbert['Positive'] > df_finbert['Negative'], -df_finbert['Negative'])
    .astype(float)
    .round(4)
)
df_finbert.head()

Unnamed: 0,Neutral,Positive,Negative,compound
0,0.9999366,3.494189e-05,2.852494e-05,0.0
1,0.9999599,3.006034e-05,9.977322e-06,0.0
2,1.405323e-08,1.0,6.688106e-09,1.0
3,0.9997968,2.64919e-07,0.000202985,-0.0002
4,0.9999524,4.143087e-06,4.33675e-05,-0.0


## Modern-FinBert

based on ModernBert architecture and not BERT. has 16x long context length than BERT hence can be also used for article-text as well as headlines.

In [13]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [14]:
mbert_model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='beethogedeon/Modern-FinBERT',
    num_labels=3,
    use_safetensors=True,
)

mbert_tokenizer = AutoTokenizer.from_pretrained('answerdotai/ModernBERT-base')

In [15]:
nlp_mbert = pipeline(
    task='sentiment-analysis',
    model=mbert_model,
    tokenizer=mbert_tokenizer,
    device=-1,
    top_k=None,
    framework='pt',
)

Device set to use cpu


In [16]:
res_mbert = nlp_mbert(headlines, batch_size=8)

In [17]:
flattened_data_mbert: list[dict[str, float]] = []

for news_item_sentiment_list in tqdm(res_mbert, desc='Processing Sentiment'):
    news_item_sentiment_dict = {}
    for individual_label_dict in news_item_sentiment_list:
        news_item_sentiment_dict[individual_label_dict['label']] = (
            individual_label_dict['score']
        )
    flattened_data_mbert.append(news_item_sentiment_dict)

Processing Sentiment: 100%|██████████| 111/111 [00:00<00:00, 340576.26it/s]
Processing Sentiment: 100%|██████████| 111/111 [00:00<00:00, 340576.26it/s]


In [19]:
df_mbert = pd.DataFrame(flattened_data_mbert)
df_mbert['compound'] = (
    df_mbert.loc[:, 'positive']
    .where(df_mbert['positive'] > df_mbert['negative'], -df_mbert['negative'])
    .astype(float)
    .round(4)
)
df_mbert.head()

Unnamed: 0,neutral,negative,positive,compound
0,0.999999,8.9286e-07,1.839519e-07,-0.0
1,1.0,2.669601e-09,5.901054e-08,0.0
2,0.005828,0.0002367222,0.9939348,0.9939
3,0.999997,1.070623e-06,2.179626e-06,0.0
4,0.999998,1.303204e-08,1.698909e-06,0.0


#### comparison

for headlines, keep using the finbert_tone model. It's mostly slightly better than Modern-FinBert.

In [21]:
sentiment_df = pd.merge(
    df_finbert,
    df_mbert,
    left_index=True,
    right_index=True,
    suffixes=('_finbert', '_mbert'),
)
sentiment_df['headline'] = headlines[: len(sentiment_df)]

In [22]:
sentiment_df.head()

Unnamed: 0,Neutral,Positive,Negative,compound_finbert,neutral,negative,positive,compound_mbert,headline
0,0.9999366,3.494189e-05,2.852494e-05,0.0,0.999999,8.9286e-07,1.839519e-07,-0.0,ICICI Bank Completes Redemption of Notes
1,0.9999599,3.006034e-05,9.977322e-06,0.0,1.0,2.669601e-09,5.901054e-08,0.0,Mahindra & Mahindra selects partner for supply...
2,1.405323e-08,1.0,6.688106e-09,1.0,0.005828,0.0002367222,0.9939348,0.9939,Kotak Mahindra Bank Ltd (BOM:500247) Q2 2025 E...
3,0.9997968,2.64919e-07,0.000202985,-0.0002,0.999997,1.070623e-06,2.179626e-06,0.0,StanChart sells India personal loan business t...
4,0.9999524,4.143087e-06,4.33675e-05,-0.0,0.999998,1.303204e-08,1.698909e-06,0.0,"Britannia, Groupe Bel India venture starts che..."


In [23]:
sentiment_df.to_csv('finbert_sentiment.csv', index=False)

In [25]:
sentiment_df.rename(columns={'label': 'label_2', 'score': 'score_2'}, inplace=True)