# DATA 622 | Homework 9

**Name:** Sanket Vijay Patil  
**Date:** November 2025  
**Campus ID** GW61258

In [17]:
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
from transformers import pipeline
import re, collections

In [20]:
url = "https://apnews.com/article/boeing-aviation-aircraft-air-india-crash-f12b20e65dc57ae655a1e0759b58938f"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

paragraphs = [p.get_text() for p in soup.find_all("p")]
article = "\n".join(paragraphs)
print("Article loaded successfully. Total length:", len(article), "characters\n")

Article loaded successfully. Total length: 3307 characters



In [8]:
import torch, math
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
tok = AutoTokenizer.from_pretrained(model_id)
mdl = AutoModelForSequenceClassification.from_pretrained(model_id)
mdl.eval()

enc = tok(article, truncation=True, max_length=512, padding="max_length", return_tensors="pt")
with torch.no_grad():
    logits = mdl(**enc).logits[0]
probs = torch.softmax(logits, dim=-1).tolist()
labels = ["NEGATIVE", "POSITIVE"]
sent_label = labels[int(torch.argmax(logits))]
print(f"Raw: {dict(zip(labels, [round(p,4) for p in probs]))}")
print("Direct forward sentiment:", sent_label)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Raw: {'NEGATIVE': 0.9992, 'POSITIVE': 0.0008}
Direct forward sentiment: NEGATIVE


In [25]:
z = pipeline("zero-shot-classification", model="facebook/bart-large-mnli",
             device=-1, truncation=True, padding=True)
LABELS = ["technology","aviation","policy"]

def topics_zeroshot(text, labels=LABELS, chunk=900):
    text = re.sub(r"\s+"," ", text).strip()
    acc = collections.Counter({l:0.0 for l in labels}); n = 0
    for i in range(0, len(text), chunk):
        out = z(text[i:i+chunk], candidate_labels=labels, multi_label=True)
        for l, s in zip(out["labels"], out["scores"]): acc[l] += float(s)
        n += 1
    avg = {l: acc[l]/max(1,n) for l in labels}
    s = sum(avg.values()) or 1.0
    return {l: round(100*avg[l]/s, 2) for l in labels}

print("Zero-shot topics (%):", topics_zeroshot(article))


Device set to use cpu


Zero-shot topics (%): {'technology': 33.02, 'aviation': 50.52, 'policy': 16.46}


In [29]:
sent = pipeline("sentiment-analysis",
                model="cardiffnlp/twitter-roberta-base-sentiment-latest",
                device=-1, truncation=True, padding=True)

def sentiment_chunked(txt, step=900):
    txt = re.sub(r"\s+"," ", txt).strip()
    acc = collections.Counter({"negative":0.0,"neutral":0.0,"positive":0.0})
    n = 0
    for i in range(0, len(txt), step):
        r = sent(txt[i:i+step])[0]                 # {'label': 'Positive', 'score': ...}
        acc[r["label"].lower()] += float(r["score"])
        n += 1
    s = sum(acc.values()) or 1.0
    probs = {k: acc[k]/s for k in acc}
    return max(probs, key=probs.get), {k: round(v,3) for k,v in probs.items()}

label, probs = sentiment_chunked(article)
print("\nDL Sentiment:", label, probs)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



DL Sentiment (chunked): neutral {'negative': 0.229, 'neutral': 0.771, 'positive': 0.0}


In [33]:
#for fiinal summarization of results 
import re, collections
from transformers import pipeline

def _chunked(text, n=900):
    text = re.sub(r"\s+"," ", str(text)).strip()
    for i in range(0, len(text), n):
        yield text[i:i+n]

def sentiment_chunked(model_id, text, labels_expected):
    clf = pipeline("sentiment-analysis", model=model_id,
                   device=-1, truncation=True, padding=True)
    acc = collections.Counter({k:0.0 for k in labels_expected})
    seen = 0
    for c in _chunked(text, 900):
        out = clf(c)[0]                               
        lbl = out["label"].lower()
        if lbl not in acc:
            if "pos" in lbl: lbl = "positive"
            elif "neg" in lbl: lbl = "negative"
            else: lbl = "neutral"
        acc[lbl] += float(out["score"]); seen += 1
    s = sum(acc.values()) or 1.0
    probs = {k: acc[k]/s for k in acc}
    return max(probs, key=probs.get), probs

def fmt_prob_map(d):
    return ", ".join(f"{k}:{v:.2f}" for k,v in d.items())

#Recomputing the  sentiments
#SST-2 
llm_label, llm_probs = sentiment_chunked(
    "distilbert-base-uncased-finetuned-sst-2-english",
    article,
    labels_expected={"negative","positive","neutral"}
)

#Cardiff Twitter RoBERTa
dl_label, dl_probs = sentiment_chunked(
    "cardiffnlp/twitter-roberta-base-sentiment-latest",
    article,
    labels_expected={"negative","neutral","positive"}
)

topic_print = []
try:
    if isinstance(topic_scores, dict) and "labels" in topic_scores:
        labs, scs = topic_scores["labels"], topic_scores["scores"]
        s = sum(scs) or 1.0
        topic_pct = {l: 100.0*float(sv)/s for l,sv in zip(labs, scs)}
        topic_print = sorted(topic_pct.items(), key=lambda x: -x[1])
    elif isinstance(topic_scores, dict):
        s = sum(topic_scores.values()) or 1.0
        topic_print = sorted(((k, 100.0*v/s) for k,v in topic_scores.items()), key=lambda x: -x[1])
except Exception:
    topic_print = []

#Emotions
top_emos_line = "n/a"
try:
    top_emos_line = ", ".join(f"{e['label']} ({e['score']:.2f})" for e in emo_sorted[:3])
except Exception:
    pass

#summary
print("\summarizing the comparison")
print(f"LLM Sentiment : {llm_label}  | {fmt_prob_map(llm_probs)}")
print(f"DL  Sentiment : {dl_label}   | {fmt_prob_map(dl_probs)}")
print(f"\nTop Emotions  : {top_emos_line}")

print("\nTopic Relevance (%):")
if topic_print:
    for k,v in topic_print:
        print(f"  {k:<10} {v:6.1f}%")
else:
    print("  n/a")


Device set to use cpu
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


\summarizing the comparison
LLM Sentiment : negative  | negative:0.76, positive:0.24, neutral:0.00
DL  Sentiment : neutral   | negative:0.23, positive:0.00, neutral:0.77

Top Emotions  : sadness (0.76), fear (0.13), neutral (0.04)

Topic Relevance (%):
  aviation     46.8%
  technology   40.2%
  policy       13.0%
