# Setup

Install required packages (only needed once per environment).  
Packages used:

- `transformers` (Hugging Face) for pretrained models and pipelines
- `torch` as backend for Transformer models
- `pandas`, `numpy` for data wrangling
- `scikit-learn` for metrics and train/test split
- `matplotlib` for plots


In [1]:
!pip install --upgrade pip
!pip install numpy pandas matplotlib scikit-learn transformers torch --extra-index-url https://download.pytorch.org/whl/cpu


Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
    --------------------------------------- 0.0/1.8 MB 495.5 kB/s eta 0:00:04
   -- ------------------------------------- 0.1/1.8 MB 1.1 MB/s eta 0:00:02
   ----- ---------------------------------- 0.2/1.8 MB 1.7 MB/s eta 0:00:01
   ----------- ---------------------------- 0.5/1.8 MB 2.7 MB/s eta 0:00:01
   ------------------------ --------------- 1.1/1.8 MB 4.3 MB/s eta 0:00:01
   ---------------------------------------- 1.8/1.8 MB 5.9 MB/s eta 0:00:00


ERROR: To modify pip, please run the following command:
C:\Users\danco\anaconda3\python.exe -m pip install --upgrade pip


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu
Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
     ---------------------------------------- 0.0/44.0 kB ? eta -:--:--
     ----------------- -------------------- 20.5/44.0 kB 330.3 kB/s eta 0:00:01
     -------------------------------------- 44.0/44.0 kB 535.7 kB/s eta 0:00:00
Collecting torch
  Downloading https://download.pytorch.org/whl/cpu/torch-2.9.0%2Bcpu-cp311-cp311-win_amd64.whl.metadata (29 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.35.3-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.6.2-cp38-abi3-win_amd64.whl.metadata (4.1 kB)
Collecting typing-extensions>=4.10.0 (from torch)
  Downloading https://downl

In [None]:
import os, re, math, random, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

from transformers import pipeline

# Seed NumPy's and Python's global random generators with one shared value.
RANDOM_SEED = 42
for _seed_fn in (np.random.seed, random.seed):
    _seed_fn(RANDOM_SEED)

# Location of the dataset CSV, relative to the working directory
DATA_PATH = "archive (5)/smile-annotations-final.csv"

print("Using data file:", DATA_PATH)

# Load Data & Quick EDA

Load the SMILE dataset, show basic info, and preview a few rows to understand columns.


In [None]:
# Read the CSV at DATA_PATH into a DataFrame
df = pd.read_csv(DATA_PATH)

# Report the dimensions and column names, then preview the first ten rows
print("Shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nHead:")
display(df.head(n=10))

In [None]:

# Attempt to auto-detect the text and label columns from their names.
# NOTE(review): this relies on the CSV having a header row. If the file has
# none, pandas uses the first data row as column names and the name-based
# detection below cannot work — check the "Columns:" output printed earlier.
if df.shape[1] < 2:
    raise ValueError(
        f"Expected at least a text and a label column, got columns: {list(df.columns)}"
    )

text_name_re = re.compile(r"(text|tweet|content|sentence)", re.I)
label_name_re = re.compile(r"(label|sentiment|emotion|class|category)", re.I)

# str(c) keeps the search from crashing on non-string column names (e.g. ints)
possible_text_cols = [c for c in df.columns if text_name_re.search(str(c))]
text_col = possible_text_cols[0] if possible_text_cols else df.columns[0]

# Never offer the text column as a label candidate: a name such as
# "tweet_class" matches both patterns and would otherwise be selected twice.
possible_label_cols = [
    c for c in df.columns if c != text_col and label_name_re.search(str(c))
]
if possible_label_cols:
    label_col = possible_label_cols[0]
else:
    # Positional fallback: second column, unless that is the text column itself
    label_col = df.columns[1] if df.columns[1] != text_col else df.columns[0]

print(f"\nAuto-detected text column: {text_col}")
print(f"Auto-detected label column: {label_col}")

# Drop NAs and keep only necessary columns
df = df[[text_col, label_col]].dropna().rename(columns={text_col:"text", label_col:"label"}).reset_index(drop=True)

print("\nUnique labels:", sorted(df['label'].astype(str).unique().tolist()))
print("\nSample rows after renaming:")
display(df.sample(5, random_state=RANDOM_SEED))

# Preprocess: Clean Text & Map Emotions → Sentiment

Clean the text (lowercase; strip URLs and @mentions; keep hashtag words but drop the `#`; replace non-ASCII characters with spaces), then **map fine-grained emotions** to **positive/negative/neutral**.  
If the dataset already has `positive/negative/neutral`, the mapping will keep them as-is.


In [None]:
# All patterns are compiled once at module level; clean_text runs once per row.
URL_RE = re.compile(r"http\S+|www\.\S+")
MENTION_RE = re.compile(r"@\w+")
HASHTAG_RE = re.compile(r"#(\w+)")
NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")
MULTISPACE_RE = re.compile(r"\s+")

def clean_text(s: str) -> str:
    """Normalize one piece of text for the sentiment models.

    Steps, in order: lowercase, remove URLs, remove @mentions, replace each
    ``#tag`` with ``tag``, replace runs of non-ASCII characters with a space,
    then collapse whitespace and trim both ends.

    Args:
        s: Raw text. Must be a ``str``; callers convert other values first.

    Returns:
        The cleaned text, which may be an empty string.
    """
    s = s.lower()
    s = URL_RE.sub("", s)
    s = MENTION_RE.sub("", s)
    s = HASHTAG_RE.sub(r"\1", s)  # keep hashtag word
    s = NON_ASCII_RE.sub(" ", s)  # drop non-ascii (quick/simple; also removes emoji)
    return MULTISPACE_RE.sub(" ", s).strip()

df["clean_text"] = df["text"].astype(str).apply(clean_text)

# Define a flexible mapping from common SMILE-like emotion labels
positive_set = {"happy","happiness","joy","love","fun","optimism","surprise","excited","enthusiasm","positive"}
negative_set = {"anger","angry","hate","worry","sad","sadness","boredom","annoyance","disgust","fear","negative"}
neutral_set  = {"neutral","other","none","unknown"}

# Separators that can join several emotions into one composite label ("happy|sad")
_LABEL_SPLIT_RE = re.compile(r"[|,;/]+")


def _map_single_label(l: str) -> str:
    """Map one already-normalized (stripped, lowercase) emotion name to a sentiment."""
    if l in positive_set: return "POSITIVE"
    if l in negative_set: return "NEGATIVE"
    if l in neutral_set:  return "NEUTRAL"
    # Heuristic fallbacks (try to detect word parts)
    if any(p in l for p in ["happy","joy","love","optim"]): return "POSITIVE"
    if any(n in l for n in ["ang","sad","hate","worr","disgust","fear"]): return "NEGATIVE"
    return "NEUTRAL"  # default safe fallback (also covers anything containing "neutral")


def map_to_sentiment(lbl: str) -> str:
    """Map a raw emotion label to ``"POSITIVE"``, ``"NEGATIVE"`` or ``"NEUTRAL"``.

    The label is stringified, stripped and lowercased. A composite label
    (parts joined by ``|``, ``,``, ``;`` or ``/``) is mapped part by part:
    if all non-neutral parts agree, that sentiment is returned; if they
    conflict (e.g. ``"happy|sad"``) the result is ``"NEUTRAL"`` instead of
    silently favouring whichever polarity is checked first.

    Args:
        lbl: Raw label value; any object is accepted and passed through ``str``.

    Returns:
        One of ``"POSITIVE"``, ``"NEGATIVE"``, ``"NEUTRAL"``.
    """
    l = str(lbl).strip().lower()
    parts = [p.strip() for p in _LABEL_SPLIT_RE.split(l) if p.strip()]
    # Neutral parts carry no polarity, so only the polar ones decide
    polar = {_map_single_label(p) for p in parts} - {"NEUTRAL"}
    if len(polar) == 1:
        return polar.pop()
    return "NEUTRAL"

# Derive the three-class target from the (stringified) raw label column
df["target"] = df["label"].astype(str).map(map_to_sentiment)

# Show a few rows with raw and derived columns, then the class counts
preview_cols = ["text","label","clean_text","target"]
print(df[preview_cols].head(8))
print("\nTarget distribution:")
print(df["target"].value_counts())


# Train/Test Split

Create a stratified train/test split so the label proportions are similar across splits.  
We keep only the cleaned text and mapped target for modeling.


In [None]:
# Features are the cleaned texts; targets are the mapped sentiment classes
X, Y = (df[col].values for col in ("clean_text", "target"))

# Hold out 20% for testing, stratified on the target and seeded for repeatability
split_kwargs = dict(test_size=0.2, random_state=RANDOM_SEED, stratify=Y)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **split_kwargs)

print("Train size:", len(X_train), "| Test size:", len(X_test))

# Build Two Sentiment Pipelines

Build two Hugging Face `pipeline` objects.

- Model A: `distilbert-base-uncased-finetuned-sst-2-english` (binary: POSITIVE/NEGATIVE).
- Model B: `cardiffnlp/twitter-roberta-base-sentiment` (ternary: NEGATIVE/NEUTRAL/POSITIVE).  
  We post-process the output labels so both models use the same names {NEGATIVE, NEUTRAL, POSITIVE}. Note that Model A is binary and can never predict NEUTRAL, so it will miss every NEUTRAL example.


In [None]:
# Checkpoints compared in this notebook
model_a_id = "distilbert-base-uncased-finetuned-sst-2-english"  # binary
model_b_id = "cardiffnlp/twitter-roberta-base-sentiment"        # ternary

# Build both classifiers the same way (Model A first, then Model B);
# top_k=None asks the pipeline for the scores of all labels.
clf_a, clf_b = (
    pipeline("sentiment-analysis", model=checkpoint, top_k=None)
    for checkpoint in (model_a_id, model_b_id)
)

# Generic class ids emitted by checkpoints that ship without readable label
# names. The order follows the cardiffnlp/twitter-roberta-base-sentiment model
# card (0 = negative, 1 = neutral, 2 = positive).
# NOTE(review): other checkpoints may use LABEL_n with a different meaning
# (e.g. binary models) — confirm before reusing this with a new model.
_GENERIC_LABEL_MAP = {
    "LABEL_0": "NEGATIVE",
    "LABEL_1": "NEUTRAL",
    "LABEL_2": "POSITIVE",
}

def normalize_label(label: str) -> str:
    """Translate a model-specific class name into the notebook's label set.

    Args:
        label: Class name as returned by a pipeline, e.g. ``"POSITIVE"``,
            ``"negative"`` or ``"LABEL_1"``.

    Returns:
        ``"POSITIVE"``, ``"NEGATIVE"`` or ``"NEUTRAL"`` when the name is
        recognized; otherwise the uppercased input unchanged.
    """
    l = label.upper()
    generic = _GENERIC_LABEL_MAP.get(l.strip())
    if generic is not None:
        return generic
    if "POS" in l: return "POSITIVE"
    if "NEG" in l: return "NEGATIVE"
    if "NEU" in l: return "NEUTRAL"
    # Unknown name: pass it through uppercased so it stays visible in reports
    return l

print("Pipelines ready.")

# Batched Prediction Helper

Define a **batched prediction** function that avoids memory spikes.  
It returns predicted labels and the model's confidence scores.


In [None]:
from typing import List, Tuple

def predict_with_pipeline(texts: List[str], pipe, batch_size: int = 32) -> Tuple[List[str], List[float]]:
    """Run a text-classification pipeline over ``texts`` in fixed-size chunks.

    Args:
        texts: Input strings. Any sequence is accepted and copied to a list.
        pipe: A Hugging Face text-classification pipeline (or a compatible
            callable that accepts ``truncation=True``).
        batch_size: Number of texts sent to ``pipe`` per call.

    Returns:
        ``(preds, confs)``: the normalized top label and its score for each
        input, in input order.
    """
    # Pipelines expect a plain list; arrays or Series are copied here
    texts = list(texts)
    preds, confs = [], []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # truncation=True clips inputs that exceed the model's maximum length
        # instead of letting the forward pass fail on long documents
        out = pipe(batch, truncation=True)
        for item in out:
            # With top_k=None each result is a list of {label, score} dicts;
            # otherwise it is a single dict. Keep the highest-scoring entry.
            if isinstance(item, list) and len(item) > 0:
                item = max(item, key=lambda d: d.get("score", 0.0))
            preds.append(normalize_label(item["label"]))
            confs.append(float(item["score"]))
    return preds, confs

# Evaluate One Model

Evaluate **one model** on the test set: accuracy, macro-F1, confusion matrix, and a short classification report.  
We also visualize **prediction confidence** and **predicted label distribution**.


In [None]:
def evaluate_model(name: str, pipe):
    """Score one pipeline on the global test split and plot its predictions.

    Reads the module-level ``X_test`` / ``Y_test``. Prints accuracy, macro-F1,
    a classification report and a confusion matrix, then shows a histogram of
    prediction confidences and a bar chart of predicted labels.

    Args:
        name: Display name used in the printed header and plot titles.
        pipe: Pipeline passed on to ``predict_with_pipeline``.

    Returns:
        Dict with keys ``"name"``, ``"acc"`` and ``"f1_macro"``.
    """
    # One fixed label set for every metric so the numbers and the confusion
    # matrix describe the same three classes.
    labels = ["NEGATIVE", "NEUTRAL", "POSITIVE"]

    Y_pred, conf = predict_with_pipeline(list(X_test), pipe)
    acc = accuracy_score(Y_test, Y_pred)
    # zero_division=0: a class that is never predicted (e.g. NEUTRAL for a
    # binary model) scores 0 explicitly instead of triggering a warning.
    f1m = f1_score(Y_test, Y_pred, labels=labels, average="macro", zero_division=0)
    print(f"\n=== {name} ===")
    print(f"Accuracy: {acc:.4f} | Macro-F1: {f1m:.4f}\n")
    print(classification_report(Y_test, Y_pred, labels=labels, digits=4, zero_division=0))
    print("Confusion matrix (rows=true, cols=pred):\n", confusion_matrix(Y_test, Y_pred, labels=labels))

    # Confidence histogram
    plt.figure()
    plt.hist(conf, bins=20)
    plt.title(f"{name} — Prediction Confidence")
    plt.xlabel("confidence")
    plt.ylabel("count")
    plt.show()

    # Predicted distribution (on its own figure, independent of the backend)
    plt.figure()
    pd.Series(Y_pred).value_counts().plot(kind="bar")
    plt.title(f"{name} — Predicted label distribution")
    plt.xlabel("label")
    plt.ylabel("count")
    plt.show()

    return {"name": name, "acc": acc, "f1_macro": f1m}

res_a = evaluate_model("Model A — DistilBERT (SST-2)", clf_a)

# Evaluate Second Model & Compare

Evaluate the **second model** and print a compact comparison table so you can see which one performs better on this dataset.


In [None]:
res_b = evaluate_model("Model B — Twitter RoBERTa (3-class)", clf_b)

# One row per model, ranked by macro-F1 with the best model first
comp = pd.DataFrame([res_a, res_b])
comp = comp.sort_values("f1_macro", ascending=False)
comp = comp.reset_index(drop=True)
print("\nComparison (higher is better):")
display(comp)

# Reusable Prediction Function + Simple Reasoning

Provide a **reusable function** to predict on **any list of texts** with either model.  
For **reasoning**: we print the model confidence and highlight whether the text contains simple **sentiment cue words** (a tiny lexicon). This is not true model explainability, but it offers an understandable rationale for beginners.


In [None]:
POS_CUES = {"love","great","good","happy","joy","win","awesome","wonderful","amazing","like"}
NEG_CUES = {"hate","bad","sad","angry","annoyed","worry","worried","terrible","awful","disgust"}

def _cue_hits(cues, cleaned: str):
    """Return the sorted cue words that occur as whole words in ``cleaned``."""
    # In a raw f-string a single backslash is needed: r"\b" is the regex word
    # boundary, whereas r"\\b" matches a literal backslash followed by "b".
    return sorted(w for w in cues if re.search(rf"\b{re.escape(w)}\b", cleaned))

def explain_basic(text: str):
    """List the lexicon cue words found in ``text``.

    The text is passed through ``clean_text`` first, so matching is
    case-insensitive and ignores URLs and @mentions.

    Args:
        text: Raw input text.

    Returns:
        ``(pos_hits, neg_hits)``: two sorted lists with the entries of
        ``POS_CUES`` and ``NEG_CUES`` that appear as whole words.
    """
    t = clean_text(text)
    return _cue_hits(POS_CUES, t), _cue_hits(NEG_CUES, t)

def predict_texts(texts, model="a"):
    """Predict sentiment for arbitrary texts and attach a simple rationale.

    Args:
        texts: A string or any iterable of strings. A single string is treated
            as one text rather than being iterated character by character.
        model: Values whose string form starts with ``"a"`` (any case) select
            ``clf_a``; anything else selects ``clf_b``.

    Returns:
        DataFrame with columns ``text``, ``pred``, ``confidence`` and
        ``why_simple`` (model confidence plus any matched cue words), one row
        per input text.
    """
    if isinstance(texts, str):
        texts = [texts]
    # Materialize once: the inputs are walked twice below (cleaning, then
    # zipping with the predictions), which would exhaust a generator.
    texts = list(texts)

    pipe = clf_a if str(model).lower().startswith("a") else clf_b
    preds, confs = predict_with_pipeline([clean_text(t) for t in texts], pipe)
    rows = []
    for text, label, conf in zip(texts, preds, confs):
        pos_hits, neg_hits = explain_basic(text)
        why = [f"model confidence={conf:.3f}"]
        if pos_hits: why.append(f"positive cues: {pos_hits}")
        if neg_hits: why.append(f"negative cues: {neg_hits}")
        rows.append({"text": text, "pred": label, "confidence": conf, "why_simple": "; ".join(why)})
    # Explicit columns keep the frame's schema stable even for empty input
    return pd.DataFrame(rows, columns=["text", "pred", "confidence", "why_simple"])

# Demo (feel free to edit): three short sentences scored with Model B
demo_texts = [
    "I love this new album so much!",
    "This is bad and I'm really angry about it.",
    "It's okay, nothing special.",
]
demo_df = predict_texts(demo_texts, model="b")

display(demo_df)


# Creative Application (Your Data!)

Apply a chosen model to **your own domain** (e.g., song lyrics, news headlines).  
Below are two examples:

1. A **hardcoded list** (quick demo)
2. Loading from a CSV with a column called `text` (edit path/column for your file)


In [None]:
# Example 1: a hardcoded list of texts from other domains, scored with Model B
my_domain_texts = [
    "Breaking: Markets rally as inflation cools to 2.5%.",
    "Lyrics: I'm feeling good, like I should.",
    "Match report: The team suffers a terrible defeat.",
]
creative_results = predict_texts(texts=my_domain_texts, model="b")
display(creative_results)

# Example 2: from a CSV file you provide (uncomment and set your file path)
# custom_df = pd.read_csv("/path/to/your_data.csv")
# creative_results2 = predict_texts(custom_df["text"].astype(str).tolist(), model="b")
# display(creative_results2.head(20))

# BONUS: Fine-Tuning Template (Optional)

(Optional) Template for **fine-tuning** a sentiment model with Hugging Face.  
This is a **skeleton** — you must adapt it to your labels and ensure you have enough data.

> Running this section can take time and requires a GPU for best results.


In [None]:
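# # Extra names this template needs (`datasets` is a separate install: pip install datasets)
# from datasets import Dataset
# from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
#                           DataCollatorWithPadding, TrainingArguments, Trainer)
# base_model_id = "distilbert-base-uncased"  # example base checkpoint; choose your own
# tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# # Prepare a Dataset object (the split variables are named Y_train / Y_test in this notebook)
# train_df = pd.DataFrame({"text": X_train, "label": pd.Categorical(Y_train).codes})
# test_df  = pd.DataFrame({"text": X_test,  "label": pd.Categorical(Y_test).codes})
# label_names = list(pd.Categorical(df['target']).categories)
# label2id = {name:i for i,name in enumerate(label_names)}
# id2label = {i:name for name,i in label2id.items()}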

# ds_train = Dataset.from_pandas(train_df)
# ds_test  = Dataset.from_pandas(test_df)

# def tok_fn(batch):
#     return tokenizer(batch["text"], truncation=True)
# ds_train = ds_train.map(tok_fn, batched=True)
# ds_test  = ds_test.map(tok_fn, batched=True)

# collator = DataCollatorWithPadding(tokenizer=tokenizer)

# model = AutoModelForSequenceClassification.from_pretrained(
#     base_model_id, num_labels=len(label_names), id2label=id2label, label2id=label2id
# )

# args = TrainingArguments(
#     output_dir="./sentiment_ft",
#     eval_strategy="epoch",  # called `evaluation_strategy` in older transformers releases
#     save_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=2,
#     weight_decay=0.01,
#     logging_steps=50
# )

# trainer = Trainer(
#     model=model,
#     args=args,
#     train_dataset=ds_train,
#     eval_dataset=ds_test,
#     tokenizer=tokenizer,
#     data_collator=collator,
# )

# trainer.train()
# trainer.evaluate()


# Conclusion

**What we accomplished:**

- Built a complete **sentiment analysis** workflow with two pretrained models.
- Evaluated with **accuracy, macro-F1**, and confusion matrices.
- Visualized **confidence** and **label distributions**.
- Created a reusable **prediction function** and applied it to a new domain.
- (Bonus) Prepared a **fine-tuning template** for domain adaptation.

**Next steps:**

- Collect more domain-specific labeled data and try the fine-tuning section.
- Explore more models in the Hugging Face Hub (larger RoBERTa/BERT variants).
- Add true explainability (e.g., SHAP, LIME) once you are comfortable with the basics.
