<br>
<font>
<div dir=ltr align=center>
<img src="https://cdn.freebiesupply.com/logos/large/2x/sharif-logo-png-transparent.png" width=150 height=150> <br>
<font color=0F5298 size=7>
    Machine learning <br>
<font color=2565AE size=5>
    Computer Engineering Department <br>
    Fall 2025<br>
<font color=3C99D8 size=5>
    Sentiment Analysis with Transformer <br>
</div>
<div dir=ltr align=center>
<font color=0CBCDF size=4>
    Mohammad Ebrahimian, Taha Izadi, Nima Ghadirniya
<font color=0CBCDF size=4>
</div>

____

  <h1 style="color:#0F5298; font-family:serif; font-size:45px; margin-bottom:0px;">
    Setup and Libraries
  </h1>

In [None]:
!pip install gensim
!pip install datasets==2.16.1

In [None]:
import os
import gc
import math
import glob
import json
import random
import re
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import seaborn as sns
import torch.optim as optim
from itertools import product
import torch.nn.functional as F
import gensim.downloader as api
import matplotlib.pyplot as plt
from datetime import datetime
from datasets import load_dataset
from dataclasses import dataclass
from sklearn.metrics import f1_score
from transformers import BertTokenizer
from typing import Dict, List, Optional
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy's global RNG and PyTorch; the CUDA and MPS
    generators are seeded as well when the corresponding backend is available.

    Args:
        seed: Integer seed applied to all generators (default 42).
    """
    # The three generators that are always present.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # Accelerator-specific generators, only when the backend exists.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    if torch.backends.mps.is_available():
        torch.mps.manual_seed(seed)

# Fix all RNG seeds before anything stochastic runs.
set_seed(42)
print("üå± Seed set to 42 for reproducibility.")

# Pick the compute backend: MPS is preferred, then CUDA, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Device set to: {device}")

  <h1 style="color:#0F5298; font-family:serif; font-size:45px; margin-bottom:0px;">
    Data Loading
  </h1>

In [None]:
def load_data_from_file():
    """Load the Financial PhraseBank "sentences_allagree" subset as a cleaned DataFrame.

    Cleaning steps, in order:
      1. rename the "sentence" column to "text";
      2. drop rows with a missing text or label;
      3. lower-case and strip the text, cast the label to int;
      4. drop duplicate texts (keeping the first occurrence).

    Normalisation happens *before* de-duplication so that sentences differing
    only by case or surrounding whitespace count as duplicates; otherwise such
    near-identical rows could end up in different data splits.

    Returns:
        pd.DataFrame with at least the columns "text" (str) and "label" (int)
        and a fresh 0..n-1 index.
    """
    # NOTE(review): trust_remote_code=True presumably allows the `datasets`
    # library to execute the dataset's own loading script - only keep it for
    # sources you trust.
    ds = load_dataset(
        "financial_phrasebank",
        "sentences_allagree",
        trust_remote_code=True,
        streaming=False
    )
    df = ds["train"].to_pandas().rename(columns={"sentence": "text"})

    # Remove incomplete rows first so the string/int conversions below are safe.
    df = df.dropna(subset=['text', 'label'])

    df = df.assign(
        text=df['text'].str.lower().str.strip(),
        label=df['label'].astype(int),
    )

    # De-duplicate on the normalised text.
    df = df.drop_duplicates(subset=['text'])

    return df.reset_index(drop=True)

def plot_sentiment_distribution(df):
    """Plot how many samples each sentiment class has, with the count above each bar.

    Args:
        df: DataFrame with a "label" column; the tick labels assume the classes
            0, 1, 2 stand for Negative, Neutral, Positive.
    """
    class_names = ['Negative', 'Neutral', 'Positive']

    plt.figure(figsize=(8, 5))
    ax = sns.countplot(data=df, x='label', hue='label', palette='viridis', legend=False)
    ax.set_title('Distribution of Sentiments (Sentences-AllAgree)')
    ax.set_xlabel('Class (0: Neg, 1: Neu, 2: Pos)')
    ax.set_ylabel('Count')
    plt.xticks([0, 1, 2], class_names)

    # Write the integer bar height slightly above the top of every bar.
    for bar in ax.patches:
        height = bar.get_height()
        center_x = bar.get_x() + bar.get_width() / 2.
        ax.annotate(
            f'{int(height)}',
            (center_x, height),
            ha='center',
            va='center',
            xytext=(0, 10),
            textcoords='offset points',
        )
    plt.show()

# Load the cleaned dataset once and look at its class balance.
df = load_data_from_file()
plot_sentiment_distribution(df)
print(f"Total unique samples: {len(df)}")

# Print one example sentence (truncated to 100 characters) per sentiment class.
print("\nüìù Samples per Class:")
label_names = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}

for label in sorted(label_names):
    sample_text = df.loc[df['label'] == label, 'text'].iloc[0]
    print(f"   - {label_names[label]} (Label {label}): \"{sample_text[:100]}...\"")

  <h1 style="color:#0F5298; font-family:serif; font-size:45px; margin-bottom:0px;">
    Word2Vec Pre-trained Embedding
  </h1>

In [None]:
def _init_embedding_matrix(vocab_size, d_emb, pad_token_id=None):
    """Return a (vocab_size, d_emb) float32 matrix drawn from N(0, 0.02) with the padding row zeroed."""
    mat = np.random.normal(0.0, 0.02, (vocab_size, d_emb)).astype(np.float32)
    if pad_token_id is not None:
        mat[pad_token_id] = 0.0
    return mat


def create_hybrid_embeddings(tokenizer, d_emb=300):
    """Build an embedding matrix for `tokenizer`'s vocabulary, seeded from Word2Vec where possible.

    Every row starts as small Gaussian noise. Rows of whole-word tokens found
    in the "word2vec-google-news-300" vectors (exact match first, then
    lower-cased) are overwritten with the pre-trained vector. Special tokens
    and "##" sub-word pieces keep their random initialisation, and the padding
    row is set to zero.

    If the Word2Vec model cannot be loaded, the purely random matrix is
    returned instead (best-effort fallback; the error is printed).

    Args:
        tokenizer: Object exposing `get_vocab()`, `all_special_ids` and
            `pad_token_id` (a BertTokenizer in this notebook).
        d_emb: Embedding width. Must equal the width of the loaded Word2Vec
            vectors when the load succeeds.

    Returns:
        torch.Tensor of shape (len(vocab), d_emb), dtype float32.

    Raises:
        ValueError: If the loaded Word2Vec vectors are not `d_emb`-dimensional.
    """
    # One source of truth for the matrix height in both the normal and fallback path.
    vocab = tokenizer.get_vocab()
    vocab_size = len(vocab)
    pad_id = tokenizer.pad_token_id

    print("Loading Word2Vec model...")
    try:
        word2vec = api.load("word2vec-google-news-300")
    except Exception as e:
        # Deliberate best-effort: training can still proceed from random embeddings.
        print(f"Word2Vec load failed: {e}")
        return torch.from_numpy(_init_embedding_matrix(vocab_size, d_emb, pad_id))

    # Fail early with a readable message instead of a broadcasting error in the loop.
    # NOTE(review): assumes the loaded object exposes `vector_size`; if it does
    # not, the check is skipped.
    w2v_dim = getattr(word2vec, "vector_size", d_emb)
    if w2v_dim != d_emb:
        raise ValueError(
            f"d_emb={d_emb} does not match the Word2Vec vector size ({w2v_dim})."
        )

    mat = _init_embedding_matrix(vocab_size, d_emb, pad_id)

    special_ids = set(tokenizer.all_special_ids)
    hits, misses, skipped_subword = 0, 0, 0

    for token, idx in vocab.items():
        if idx in special_ids:
            continue
        if token.startswith("##"):
            # Word-piece continuations are not looked up in Word2Vec.
            skipped_subword += 1
            continue

        if token in word2vec:
            mat[idx] = word2vec[token]
            hits += 1
        elif token.lower() in word2vec:
            mat[idx] = word2vec[token.lower()]
            hits += 1
        else:
            misses += 1

    # Re-zero the padding row in case it was not listed among the special ids.
    if pad_id is not None:
        mat[pad_id] = 0.0

    # The Word2Vec model is large; release it as soon as the matrix is filled.
    del word2vec
    gc.collect()

    eligible = hits + misses
    cov = (hits / eligible * 100) if eligible > 0 else 0.0
    print(f"Shape: {mat.shape}")
    print(f"Hits: {hits}, Misses: {misses}, Skipped subwords: {skipped_subword}")
    print(f"Coverage on eligible tokens: {cov:.1f}%")

    return torch.from_numpy(mat)

# The BERT word-piece tokenizer defines the vocabulary used by the model.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Embedding table aligned with that vocabulary (300 columns).
pretrained_embeddings = create_hybrid_embeddings(tokenizer, d_emb=300)

  <h1 style="color:#0F5298; font-family:serif; font-size:45px; margin-bottom:0px;">
    Data Splitting
  </h1>

In [None]:
# Split configuration: stratified train / validation / test fractions.
SEED = 42
TRAIN_SIZE = 0.80
VAL_SIZE = 0.10
TEST_SIZE = 0.10

assert abs(TRAIN_SIZE + VAL_SIZE + TEST_SIZE - 1.0) < 1e-8, "Split ratios must sum to 1."

# If the labels are still strings, map them onto integer class ids.
if df["label"].dtype == object:
    label_map = {"negative": 0, "neutral": 1, "positive": 2}
    df["label"] = df["label"].map(label_map)

# Drop incomplete rows, force integer labels and renumber the index.
df = (
    df.dropna(subset=["text", "label"])
    .astype({"label": int})
    .reset_index(drop=True)
)

# Step 1: carve the training set off; the remainder is shared by validation and test.
train_df, temp_df = train_test_split(
    df,
    test_size=(1 - TRAIN_SIZE),
    stratify=df["label"],
    random_state=SEED,
)

# Step 2: split the remainder according to the validation/test proportions.
val_ratio_in_temp = VAL_SIZE / (VAL_SIZE + TEST_SIZE)
val_df, test_df = train_test_split(
    temp_df,
    test_size=(1 - val_ratio_in_temp),
    stratify=temp_df["label"],
    random_state=SEED,
)

# Give every split its own 0..n-1 index.
train_df, val_df, test_df = (
    part.reset_index(drop=True) for part in (train_df, val_df, test_df)
)

def show_split_stats(name, part_df):
    """Print the size, per-class counts and per-class percentages of one data split.

    Args:
        name: Label printed in front of the statistics (e.g. "Train").
        part_df: DataFrame with a "label" column.
    """
    labels = part_df["label"]
    counts = labels.value_counts().sort_index()
    ratios = (labels.value_counts(normalize=True).sort_index() * 100).round(2)

    report = [
        f"{name}: n={len(part_df)}",
        f"counts: {counts.to_dict()}",
        f"ratios(%): {ratios.to_dict()}",
        "-" * 50,
    ]
    print("\n".join(report))

# Report the class balance of each split to confirm stratification worked.
for split_name, split_df in (("Train", train_df), ("Validation", val_df), ("Test", test_df)):
    show_split_stats(split_name, split_df)

  <h1 style="color:#0F5298; font-family:serif; font-size:45px; margin-bottom:0px;">
    Augmentation
  </h1>

In [None]:
# Synonyms that may be substituted regardless of the sentence's label.
COMMON_SYNONYMS = dict(
    company=["firm", "business"],
    market=["sector", "marketplace"],
    shares=["stock", "equity"],
    announced=["reported", "stated"],
    increase=["rise", "growth"],
    decrease=["decline", "drop"],
    cost=["expense", "charge"],
    revenue=["sales", "turnover"],
)

# Extra synonyms that are only used for sentences of the given label id.
LABEL_SYNONYMS = {
    0: {
        "loss": ["deficit", "setback"],
        "risk": ["threat", "uncertainty"],
        "fall": ["drop", "decline"],
    },
    1: {
        "said": ["stated", "noted"],
        "expects": ["anticipates", "foresees"],
        "plan": ["strategy", "program"],
    },
    2: {
        "profit": ["gain", "earnings"],
        "growth": ["expansion", "rise"],
        "strong": ["solid", "robust"],
    },
}

# Negation words that must never be replaced by a synonym.
PROTECTED_WORDS = set(["not", "no", "never", "none", "without"])


def _normalize_token(token: str) -> str:
    return re.sub(r"^[^A-Za-z0-9]+|[^A-Za-z0-9]+$", "", token).lower()


def _replace_token_keep_format(raw_token: str, new_core: str) -> str:
    m = re.match(r"^([^A-Za-z0-9]*)([A-Za-z0-9'-]+)([^A-Za-z0-9]*)$", raw_token)
    if not m:
        return raw_token
    prefix, core, suffix = m.groups()
    if core.isupper():
        new_core = new_core.upper()
    elif core[:1].isupper():
        new_core = new_core.capitalize()
    return f"{prefix}{new_core}{suffix}"


def augment_text_label_aware(text: str, label: int, rng: random.Random, max_repl: int = 2, swap_prob: float = 0.10):
    """Create a lightly perturbed copy of `text` using label-specific synonyms and an optional word swap.

    Steps:
      1. Words whose normalised form has an entry in COMMON_SYNONYMS or in
         LABEL_SYNONYMS[label] become replacement candidates (protected
         negation words and tokens containing digits are skipped).
      2. Between 1 and `max_repl` randomly chosen candidates are replaced by a
         random synonym, keeping the token's punctuation and casing.
      3. With probability `swap_prob` (and at least 5 words) one random pair of
         adjacent words is swapped - unless either word is protected, so a
         negation is never moved relative to its neighbours.

    Args:
        text: Input sentence; it is split on whitespace and re-joined with
            single spaces.
        label: Class id selecting the label-specific synonym table.
        rng: Random generator driving every random choice.
        max_repl: Upper bound on the number of synonym replacements; values
            below 1 disable replacement.
        swap_prob: Probability of attempting the adjacent-word swap.

    Returns:
        The augmented sentence. `text` itself is returned when it has fewer
        than 3 words or the result would be empty.
    """
    words = text.split()
    if len(words) < 3:
        return text

    # Label-specific entries override common ones on key collisions.
    syn_map = {**COMMON_SYNONYMS, **LABEL_SYNONYMS.get(int(label), {})}
    candidates = []

    for i, w in enumerate(words):
        key = _normalize_token(w)
        if not key or key in PROTECTED_WORDS or any(ch.isdigit() for ch in key):
            continue
        if key in syn_map:
            candidates.append((i, key))

    rng.shuffle(candidates)

    # max_repl < 1 would make randint(1, 0) raise, so treat it as "no replacements".
    if candidates and max_repl >= 1:
        n_rep = rng.randint(1, min(max_repl, len(candidates)))
        for i, key in candidates[:n_rep]:
            replacement = rng.choice(syn_map[key])
            words[i] = _replace_token_keep_format(words[i], replacement)

    if rng.random() < swap_prob and len(words) >= 5:
        j = rng.randrange(0, len(words) - 1)
        # Skip the swap when it would move a protected negation word.
        if not any(_normalize_token(words[k]) in PROTECTED_WORDS for k in (j, j + 1)):
            words[j], words[j + 1] = words[j + 1], words[j]

    aug = " ".join(words).strip()
    return aug if aug else text


def _compute_target_counts(class_counts: pd.Series, balance_strength: float = 0.45, max_growth: float = 1.40):
    class_counts = class_counts.sort_index()
    max_count = int(class_counts.max())
    orig_total = int(class_counts.sum())

    targets = {}
    for label, count in class_counts.items():
        boosted = int(round(count + balance_strength * (max_count - count)))
        targets[int(label)] = max(int(count), boosted)

    max_total = int(round(orig_total * max_growth))
    proposed_total = sum(targets.values())

    if proposed_total > max_total and proposed_total > orig_total:
        proposed_extra = proposed_total - orig_total
        allowed_extra = max_total - orig_total
        scale = allowed_extra / proposed_extra if proposed_extra > 0 else 0.0
        for label, count in class_counts.items():
            extra = targets[int(label)] - int(count)
            scaled_extra = int(round(extra * scale))
            targets[int(label)] = int(count) + max(0, scaled_extra)

    return targets

# Class-balancing augmentation of the training split
def build_controlled_augmented_train_df(
    train_df: pd.DataFrame,
    seed: int = 42,
    balance_strength: float = 1.0,
    max_growth: float = 10.0,
):
    """Return the training data extended with synthetic sentences for under-represented classes.

    For every class the number of rows to add comes from
    `_compute_target_counts`. New rows are produced by
    `augment_text_label_aware` applied to randomly picked originals; a
    candidate is kept only if its stripped, lower-cased text has not been seen
    in that class yet. If the attempt budget runs out before the target is met,
    the remainder is filled from rows sampled with replacement and augmented
    more gently, without the uniqueness check.

    Args:
        train_df: DataFrame with "text" and "label" columns.
        seed: Seed for the private RNG, the fallback sampling and the final shuffle.
        balance_strength: Forwarded to `_compute_target_counts`.
        max_growth: Forwarded to `_compute_target_counts`.

    Returns:
        Shuffled DataFrame with columns "text", "label" and "is_augmented"
        (0 for original rows, 1 for generated rows).
    """
    rng = random.Random(seed)

    base = train_df[["text", "label"]].copy().reset_index(drop=True)
    base["is_augmented"] = 0

    target_counts = _compute_target_counts(
        base["label"].value_counts().sort_index(),
        balance_strength=balance_strength,
        max_growth=max_growth,
    )

    frames = []
    for label, group in base.groupby("label", sort=True):
        cls = int(label)
        group = group.copy().reset_index(drop=True)
        source_texts = group["text"].tolist()
        known = {t.strip().lower() for t in source_texts}

        shortfall = max(0, target_counts[cls] - len(group))
        synthetic = []
        tries_left = max(200, shortfall * 20)

        # Phase 1: unique augmentations, bounded by the attempt budget.
        while len(synthetic) < shortfall and tries_left > 0:
            tries_left -= 1
            picked = source_texts[rng.randrange(len(source_texts))]
            candidate = augment_text_label_aware(picked, cls, rng, max_repl=2, swap_prob=0.10)

            fingerprint = candidate.strip().lower()
            if fingerprint and fingerprint not in known:
                known.add(fingerprint)
                synthetic.append({"text": candidate, "label": cls, "is_augmented": 1})

        # Phase 2: top up whatever is still missing, duplicates allowed.
        missing = shortfall - len(synthetic)
        if missing > 0:
            resampled = group.sample(n=missing, replace=True, random_state=seed)["text"].tolist()
            for picked in resampled:
                candidate = augment_text_label_aware(picked, cls, rng, max_repl=1, swap_prob=0.05)
                synthetic.append({"text": candidate, "label": cls, "is_augmented": 1})

        frames.append(group)
        if synthetic:
            frames.append(pd.DataFrame(synthetic))

    combined = pd.concat(frames, ignore_index=True)
    return combined.sample(frac=1.0, random_state=seed).reset_index(drop=True)


def show_counts(train_part, val_part, test_part, title):
    """Print an underlined title followed by size and per-class counts of the three splits.

    Args:
        train_part, val_part, test_part: DataFrames with a "label" column.
        title: Heading printed above the statistics.
    """
    print(f"\n{title}")
    print("=" * len(title))

    splits = {"Train": train_part, "Validation": val_part, "Test": test_part}
    for split_name, frame in splits.items():
        per_class = frame["label"].value_counts().sort_index().to_dict()
        print(f"{split_name}: n={len(frame)} | class_counts={per_class}")


# This cell needs the three splits produced by the data-splitting cell.
assert all(name in globals() for name in ("train_df", "val_df", "test_df")), "run data spliting block first"

# Reuse the notebook-wide seed when it has been defined, otherwise fall back to 42.
seed_value = globals().get("SEED", 42)

show_counts(train_df, val_df, test_df, "Before Augmentation")

# Fully balance the classes, allowing the training set to grow up to 3x.
train_aug_df = build_controlled_augmented_train_df(
    train_df=train_df,
    seed=seed_value,
    balance_strength=1,
    max_growth=3,
)

show_counts(train_aug_df, val_df, test_df, "After Augmentation")
print(f"Added train samples: {len(train_aug_df) - len(train_df)}")
print("Augmented flag:", train_aug_df["is_augmented"].value_counts().to_dict())

