In [3]:
from datasets import load_dataset,Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoTokenizer
)
import torch
import numpy as np
import evaluate
import torch

In [4]:
df = pd.read_csv("datasets/financial_phrasebank.csv")

df.head()

Unnamed: 0,label,title
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


In [5]:
df = df.rename(columns = {'title':'text'})

df['label'] = df['label'].str.lower().str.strip()
valid_labels = ['positive','neutral','negative']
df = df[df['label'].isin(valid_labels)].reset_index(drop=True)

In [6]:
df.head()

Unnamed: 0,label,text
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


In [7]:
ENTITY_TOKEN = '[ENTITY]'
label_map = {'positive':0,'neutral':1,'negative':2}

nli_data=[]
for _, row in tqdm(df.iterrows(),total=len(df)):
    premise = str(row['text']).strip()
    gold_label=row['label']

    for sentiment in ['positive','neutral','negative']:
        hypothesis = f"The News is {sentiment} for {ENTITY_TOKEN}."
        label = 0 if sentiment == gold_label else (1 if sentiment == "neutral" else 2)
        nli_data.append({
            "premise":premise,
            "hypothesis":hypothesis,
            "label":label
        })

df_nli = pd.DataFrame(nli_data)

print(f"NLI dataset ready: {len(df_nli)} samples generated from {len(df)} base sentences.")
print(df_nli.sample(5))

100%|██████████| 4845/4845 [00:00<00:00, 5845.68it/s]

NLI dataset ready: 14535 samples generated from 4845 base sentences.
                                                 premise  \
2963   The sale , which will result in a gain of some...   
13104  Finnish industrial group Ruukki Group has brou...   
8121      Yvonne Jones is owner of Chameleon Interiors .   
12008  Finnish media group Talentum has issued a prof...   
7116   Finnish food industry company L+Ænnen Tehtaat ...   

                               hypothesis  label  
2963   The News is negative for [ENTITY].      2  
13104  The News is positive for [ENTITY].      2  
8121   The News is positive for [ENTITY].      2  
12008  The News is negative for [ENTITY].      0  
7116   The News is positive for [ENTITY].      2  





In [9]:
df_nli.to_csv("processed/NLI_dataset.csv", index=False)
print("Saved NLI_dataset:", len(df))

Saved NLI_dataset: 4845


In [10]:
df_nli.head()

Unnamed: 0,premise,hypothesis,label
0,Technopolis plans to develop in stages an area...,The News is positive for [ENTITY].,2
1,Technopolis plans to develop in stages an area...,The News is neutral for [ENTITY].,0
2,Technopolis plans to develop in stages an area...,The News is negative for [ENTITY].,2
3,The international electronic industry company ...,The News is positive for [ENTITY].,2
4,The international electronic industry company ...,The News is neutral for [ENTITY].,1


## Tokenization & Dataset Setup

In [11]:
df_nli = pd.read_csv('processed/NLI_dataset.csv')

MODEL_NAME = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

train_df,val_df = train_test_split(df_nli,test_size = 0.1, random_state = 42,stratify=df_nli['label'])

train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [12]:
def tokenize_fn(batch):
    return tokenizer(
        batch['premise'],
        batch['hypothesis'],
        truncation=True,
        padding='max_length',
        max_length=256
    )

In [13]:
train_ds = train_ds.map(tokenize_fn,batched=True)
val_ds = val_ds.map(tokenize_fn,batched=True)

train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

print("Tokenization complete: ")
print('Train samples:', len(train_ds))
print('Validation samples:', len(val_ds))


Map:   0%|          | 0/13081 [00:00<?, ? examples/s]

Map:   0%|          | 0/1454 [00:00<?, ? examples/s]

Tokenization complete: 
Train samples: 13081
Validation samples: 1454


In [14]:
example = tokenizer.decode(train_ds[0]["input_ids"])
print("\nSample tokenized example:\n", example[:400])


Sample tokenized example:
 [CLS] The net sales of the whole fiscal year 2008 will be lower than in 2007 and operating profit is estimated to be negative .[SEP] The News is positive for [ENTITY].[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PA


## Model Fine-Tune


In [15]:
MODEL_NAME = "microsoft/deberta-v3-base"

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
    force_download=True,
    cache_dir="./hf_cache"

)

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [16]:
def compute_metrics(eval_pred):
    logits,labels = eval_pred
    preds = np.argmax(logits,axis=-1)
    return{
        "accuracy":accuracy.compute(predictions=preds,references=labels)['accuracy'],
        "f1_weighted":f1.compute(predictions=preds,references=labels,average="weighted")['f1']
    }

In [19]:
training_args = TrainingArguments(
    output_dir="model_checkpoints",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=50,
    logging_strategy="steps",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
results = trainer.evaluate(val_ds)
print(results)


  trainer = Trainer(


Step,Training Loss
50,0.7817
100,0.6419
150,0.5841
200,0.5757
250,0.5651
300,0.5075
350,0.4313
400,0.4235
450,0.4799
500,0.3624


{'eval_loss': 0.19134274125099182, 'eval_accuracy': 0.9552957359009628, 'eval_f1_weighted': 0.955325050718799, 'eval_runtime': 30.9055, 'eval_samples_per_second': 47.047, 'eval_steps_per_second': 5.889, 'epoch': 2.0}


In [20]:
trainer.save_model("startup_zero_shot_v2")
tokenizer.save_pretrained("startup_zero_shot_v2")

('startup_zero_shot_v2/tokenizer_config.json',
 'startup_zero_shot_v2/special_tokens_map.json',
 'startup_zero_shot_v2/spm.model',
 'startup_zero_shot_v2/added_tokens.json',
 'startup_zero_shot_v2/tokenizer.json')

In [24]:
import torch
import torch.nn.functional as F

label_map = {0: "positive", 1: "neutral", 2: "negative"}

def predict_single_company(text, company):
    hypotheses = [
        f"The news is positive for {company}.",
        f"The news is neutral for {company}.",
        f"The news is negative for {company}.",
    ]

    inputs = tokenizer(
        [text] * 3,
        hypotheses,
        return_tensors="pt",
        truncation=True,
        padding='max_length',
        max_length=256
    ).to("cuda")

    with torch.no_grad():
        logits = model(**inputs).logits
        probs = F.softmax(logits, dim=1)

    entail_scores = probs[:, 0].cpu().numpy().tolist()
    sentiments = ["positive", "neutral", "negative"]
    best = sentiments[int(torch.argmax(torch.tensor(entail_scores)))]

    return {
        "message": f"The news is {best} for {company}",
        "positive": round(entail_scores[0], 4),
        "neutral": round(entail_scores[1], 4),
        "negative": round(entail_scores[2], 4)
    }


In [25]:
text = "Swiggy beats Zomato stocks after major funding news"
print(predict_single_company(text, "Swiggy"))
print(predict_single_company(text, "Zomato"))

{'message': 'The news is positive for Swiggy', 'positive': 0.9388, 'neutral': 0.2174, 'negative': 0.0006}
{'message': 'The news is positive for Zomato', 'positive': 0.8933, 'neutral': 0.3756, 'negative': 0.0006}


In [26]:
import random
import pandas as pd

COMPANIES = ["Swiggy", "Zomato", "Ola", "Uber", "Flipkart", "Amazon"]

COMPETITIVE_TEMPLATES = [
    "{A} beats {B} in market share",
    "{A} surpasses {B} in revenue",
    "{A} outperforms {B} this quarter",
    "{B} lags behind {A} in growth",
    "{B} suffers losses while {A} expands",
    "{A} overtakes {B} in valuation",
    "{A} wins against {B} in competition",
    "{B} falls while {A} rises",
]

nli_rows = []

for template in COMPETITIVE_TEMPLATES:
    for A in COMPANIES:
        for B in COMPANIES:
            if A == B:
                continue

            premise = template.format(A=A, B=B)

            hypotheses = [
                (f"The news is positive for {A}.", 0),
                (f"The news is neutral for {A}.", 1),
                (f"The news is negative for {A}.", 2),
                (f"The news is positive for {B}.", 2),
                (f"The news is neutral for {B}.", 1),
                (f"The news is negative for {B}.", 0),
            ]

            for hyp, lab in hypotheses:
                nli_rows.append({
                    "premise": premise,
                    "hypothesis": hyp,
                    "label": lab
                })

df_rel = pd.DataFrame(nli_rows)
df_rel.shape, df_rel.head()


((1440, 3),
                                premise                        hypothesis  \
 0  Swiggy beats Zomato in market share  The news is positive for Swiggy.   
 1  Swiggy beats Zomato in market share   The news is neutral for Swiggy.   
 2  Swiggy beats Zomato in market share  The news is negative for Swiggy.   
 3  Swiggy beats Zomato in market share  The news is positive for Zomato.   
 4  Swiggy beats Zomato in market share   The news is neutral for Zomato.   
 
    label  
 0      0  
 1      1  
 2      2  
 3      2  
 4      1  )

In [27]:
df_final_nli = pd.concat([df_nli, df_rel], ignore_index=True)
df_final_nli = df_final_nli.sample(frac=1, random_state=42).reset_index(drop=True)

print("Total combined samples:", len(df_final_nli))
df_final_nli.head()


Total combined samples: 15975


Unnamed: 0,premise,hypothesis,label
0,Increased trust of our clients in YIT can be s...,The News is neutral for [ENTITY].,1
1,"The contract covers the manufacturing , surfac...",The News is positive for [ENTITY].,2
2,"New Delhi , Feb. 12 -- Korteniemi Anneli , Hel...",The News is positive for [ENTITY].,2
3,Operating loss amounted to EUR 0.7 mn compared...,The News is neutral for [ENTITY].,1
4,So Mr. Galvan made savings adjustments of his ...,The News is positive for [ENTITY].,2


In [28]:
train_df, val_df = train_test_split(df_final_nli, test_size=0.1, random_state=42, stratify=df_final_nli['label'])
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)
train_ds = train_ds.map(tokenize_fn, batched=True)
val_ds = val_ds.map(tokenize_fn, batched=True)

train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


Map:   0%|          | 0/14377 [00:00<?, ? examples/s]

Map:   0%|          | 0/1598 [00:00<?, ? examples/s]

In [29]:
training_args = TrainingArguments(
    output_dir="model_rel_checkpoints",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=50,
    logging_strategy="steps",
    save_total_limit=1,
    report_to="none",
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

trainer.train()
results = trainer.evaluate(val_ds)
print(results)

Step,Training Loss
50,0.2877
100,0.3363
150,0.3438
200,0.3406
250,0.247
300,0.2153
350,0.324
400,0.253
450,0.2453
500,0.318


{'eval_loss': 0.12083425372838974, 'eval_accuracy': 0.9774718397997497, 'eval_f1_weighted': 0.9774461623154557, 'eval_runtime': 12.5367, 'eval_samples_per_second': 127.466, 'eval_steps_per_second': 15.953, 'epoch': 1.0}


In [30]:
import torch
import torch.nn.functional as F

label_map = {0: "positive", 1: "neutral", 2: "negative"}

def predict_single_company(text, company):
    hypotheses = [
        f"The news is positive for {company}.",
        f"The news is neutral for {company}.",
        f"The news is negative for {company}.",
    ]

    inputs = tokenizer(
        [text] * 3,
        hypotheses,
        return_tensors="pt",
        truncation=True,
        padding='max_length',
        max_length=256
    ).to("cuda")

    with torch.no_grad():
        logits = model(**inputs).logits
        probs = F.softmax(logits, dim=1)

    entail_scores = probs[:, 0].cpu().numpy().tolist()
    sentiments = ["positive", "neutral", "negative"]
    best = sentiments[int(torch.argmax(torch.tensor(entail_scores)))]

    return {
        "message": f"The news is {best} for {company}",
        "positive": round(entail_scores[0], 4),
        "neutral": round(entail_scores[1], 4),
        "negative": round(entail_scores[2], 4)
    }


In [34]:
tests = [
    ("Ola faces major losses while Uber expands operations.", ["Ola", "Uber"]),
    ("Zomato overtakes Swiggy in market share after successful IPO.", ["Zomato", "Swiggy"]),
    ("Flipkart struggles as Amazon dominates the festive sale market.", ["Flipkart", "Amazon"]),
    ("PhonePe beats Google Pay as UPI leader in India.", ["PhonePe", "Google Pay"]),
    ("Byju's secures $250M funding to expand globally.", ["Byju's"]),
    ("Paytm faces regulatory action due to compliance failures.", ["Paytm"]),
    ("Tata Motors appoints new CFO to restructure its business.", ["Tata Motors"]),
    ("Apple announces a routine product update this year.", ["Apple"]),
    ("Adani shares rise but Reliance remains stable after budget announcement.", ["Adani", "Reliance"]),
    ("Meta stock falls even as Google reports strong earnings.", ["Meta", "Google"]),
]

for text, companies in tests:
    print("NEWS:", text)
    for c in companies:
        print(predict_single_company(text, c))
    print("-" * 50)


NEWS: Ola faces major losses while Uber expands operations.
{'message': 'The news is negative for Ola', 'positive': 0.0013, 'neutral': 0.0025, 'negative': 0.9957}
{'message': 'The news is positive for Uber', 'positive': 0.9928, 'neutral': 0.002, 'negative': 0.0014}
--------------------------------------------------
NEWS: Zomato overtakes Swiggy in market share after successful IPO.
{'message': 'The news is positive for Zomato', 'positive': 0.9875, 'neutral': 0.0026, 'negative': 0.0013}
{'message': 'The news is negative for Swiggy', 'positive': 0.0013, 'neutral': 0.0034, 'negative': 0.9958}
--------------------------------------------------
NEWS: Flipkart struggles as Amazon dominates the festive sale market.
{'message': 'The news is negative for Flipkart', 'positive': 0.0012, 'neutral': 0.0024, 'negative': 0.9953}
{'message': 'The news is neutral for Amazon', 'positive': 0.0015, 'neutral': 0.0019, 'negative': 0.0016}
--------------------------------------------------
NEWS: PhonePe beat

In [35]:
trainer.save_model("zero_shot_news_sentiment_analyzer_startups")
tokenizer.save_pretrained("zero_shot_news_sentiment_analyzer_startups")


('zero_shot_news_sentiment_analyzer_startups/tokenizer_config.json',
 'zero_shot_news_sentiment_analyzer_startups/special_tokens_map.json',
 'zero_shot_news_sentiment_analyzer_startups/spm.model',
 'zero_shot_news_sentiment_analyzer_startups/added_tokens.json',
 'zero_shot_news_sentiment_analyzer_startups/tokenizer.json')

In [37]:
!zip -r zero_shot_news_sentiment_analyzer_startups.zip zero_shot_news_sentiment_analyzer_startups

from google.colab import files
files.download("zero_shot_news_sentiment_analyzer_startups.zip")


  adding: zero_shot_news_sentiment_analyzer_startups/ (stored 0%)
  adding: zero_shot_news_sentiment_analyzer_startups/added_tokens.json (stored 0%)
  adding: zero_shot_news_sentiment_analyzer_startups/spm.model (deflated 50%)
  adding: zero_shot_news_sentiment_analyzer_startups/special_tokens_map.json (deflated 50%)
  adding: zero_shot_news_sentiment_analyzer_startups/training_args.bin (deflated 54%)
  adding: zero_shot_news_sentiment_analyzer_startups/model.safetensors (deflated 24%)
  adding: zero_shot_news_sentiment_analyzer_startups/tokenizer.json (deflated 77%)
  adding: zero_shot_news_sentiment_analyzer_startups/tokenizer_config.json (deflated 73%)
  adding: zero_shot_news_sentiment_analyzer_startups/config.json (deflated 55%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>