In [None]:
!pip install transformers kaggle

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import math
import time
from tqdm.notebook import trange, tqdm
from functools import partial

import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader
from torch import Tensor
from datasets import Dataset
import torch.nn.functional as F
from torch.distributions import Categorical
from torch.cuda.amp import autocast, GradScaler

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, concatenate_datasets

# Permit TF32 matrix multiplications on the CUDA backend.
torch.backends.cuda.matmul.allow_tf32 = True

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

# Datasets Used:
# https://huggingface.co/datasets/SetFit/sst5
# https://huggingface.co/datasets/cardiffnlp/tweet_eval
# https://huggingface.co/datasets/antitheft159/_1342_political_sentiment_analysis
# https://huggingface.co/datasets/sara-nabhani/ML-news-sentiment

# https://www.kaggle.com/datasets/clovisdalmolinvieira/news-sentiment-analysis
# https://www.kaggle.com/datasets/hoshi7/news-sentiment-dataset

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the next cell changes into a folder under it.
drive.mount("/content/drive")

In [None]:
# Work from the project folder on Drive so the relative paths used below
# (archives, CSVs, checkpoints, ONNX files) resolve there.
%cd /content/drive/My Drive/Machine Learning/political-pulse-model

In [None]:
model_id = "microsoft/deberta-v3-base"  # or -large

tokenizer = AutoTokenizer.from_pretrained(model_id)

# A single-output head (num_labels=1) configured as a regression problem,
# so the model emits one scalar per input sequence.
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=1,
    problem_type="regression",
)
# Move to the selected device and start in inference mode.
model = model.to(device).eval()

tokenizer.vocab_size

In [None]:
from google.colab import files

# Upload files from the local machine; the next cell expects a file named
# kaggle.json to be present in the working directory afterwards.
files.upload()

In [None]:
# Copy the Kaggle API token into ~/.kaggle and make it readable/writable by the owner only.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the first Kaggle news-sentiment dataset; the next cell unzips news-sentiment-analysis.zip.
!kaggle datasets download -d clovisdalmolinvieira/news-sentiment-analysis

In [None]:
!mkdir -p news_sentiment_data
!unzip -q news-sentiment-analysis.zip -d news_sentiment_data
!ls

In [None]:
# Download the second Kaggle news-sentiment dataset; the next cell unzips news-sentiment-dataset.zip.
!kaggle datasets download -d hoshi7/news-sentiment-dataset

In [None]:
!mkdir -p news_sentiment_data_2
!unzip -q news-sentiment-dataset.zip -d news_sentiment_data_2
!ls

In [None]:
# First Kaggle source: CSV -> DataFrame -> Hugging Face Dataset.
kaggle_df_1 = pd.read_csv("news_sentiment_data/news_sentiment_analysis.csv")
kaggle_dataset_1 = Dataset.from_pandas(kaggle_df_1)

# Second Kaggle source, converted the same way.
kaggle_df_2 = pd.read_csv("news_sentiment_data_2/Sentiment_dataset.csv")
kaggle_dataset_2 = Dataset.from_pandas(kaggle_df_2)

tuple(map(len, (kaggle_dataset_1, kaggle_dataset_2)))

In [None]:
def normalize_sentiment(item, column_name: str = "sentiment"):
    """Add a numeric ``label`` to ``item`` based on its textual sentiment.

    The text found under ``column_name`` is stripped of surrounding whitespace
    and compared case-insensitively:

        "positive" -> 1.0, "neutral" -> 0.5, "negative" -> 0.0

    Every label is a float, so the type of the resulting column does not
    depend on which sentiments happen to appear in a given batch of rows.

    Args:
        item: mutable mapping for one example; it is modified in place.
        column_name: key in ``item`` that holds the sentiment text.

    Returns:
        The same ``item`` object with ``item["label"]`` set.

    Raises:
        ValueError: if the value is not a string or is not one of the three
            recognised sentiments. The offending value is part of the message.
    """
    scores = {"positive": 1.0, "neutral": 0.5, "negative": 0.0}

    raw = item[column_name]
    # Missing values (None, NaN from a CSV) get the same clear error as unknown text.
    if not isinstance(raw, str):
        raise ValueError(f"The sentiment for this item does not match: {raw!r}")

    key = raw.strip().lower()
    if key not in scores:
        raise ValueError(f"The sentiment for this item does not match: {raw!r}")

    item["label"] = scores[key]
    return item

In [None]:
# This CSV stores its sentiment text under a capitalised "Sentiment" column.
news_normalize_sentiment_1 = partial(normalize_sentiment, column_name="Sentiment")

# Derive the numeric label, drop the textual column, then expose the
# article description under the shared "text" column name.
kaggle_dataset_1 = kaggle_dataset_1.map(
    news_normalize_sentiment_1,
    remove_columns=["Sentiment"],
).rename_column("Description", "text")

In [None]:
# Expose this dataset's "sentiment" column under the shared "label" name.
# NOTE(review): unlike the other sources, no normalisation is applied here — presumably the
# CSV already stores values on the same 0..1 scale; confirm before training.
kaggle_dataset_2 = kaggle_dataset_2.rename_column("sentiment", "label")

In [None]:
# Load SST-5 and discard the "label_text" column.
sst5_dataset = load_dataset("SetFit/sst5").remove_columns("label_text")

train_sst_5_datset = sst5_dataset["train"]
validation_sst_5_datset = sst5_dataset["validation"]
test_sst_5_datset = sst5_dataset["test"]

tuple(
    map(len, (train_sst_5_datset, validation_sst_5_datset, test_sst_5_datset))
)

In [None]:
def normalize_labels(item, column_name: str = "label", factor: int = 4):
    """Divide the value stored under ``column_name`` by ``factor``, in place.

    Args:
        item: mutable mapping for one example.
        column_name: key whose value is rescaled.
        factor: divisor applied to the stored value.

    Returns:
        The same ``item`` object, with the rescaled value written back.
    """
    scaled = item[column_name] / factor
    item[column_name] = scaled
    return item

In [None]:
# Divide the SST-5 labels by 4.
sst_normalize_labels = partial(normalize_labels, column_name="label", factor=4)

# Apply the same rescaling to every split, in train/validation/test order.
train_sst_5_datset, validation_sst_5_datset, test_sst_5_datset = (
    split.map(sst_normalize_labels)
    for split in (train_sst_5_datset, validation_sst_5_datset, test_sst_5_datset)
)

print(train_sst_5_datset["label"][:10])

In [None]:
# "sentiment" configuration of the tweet_eval benchmark.
tweet_eval_dataset = load_dataset("cardiffnlp/tweet_eval", "sentiment")

train_tweet_eval_dataset = tweet_eval_dataset["train"]
validation_tweet_eval_dataset = tweet_eval_dataset["validation"]
test_tweet_eval_dataset = tweet_eval_dataset["test"]

tuple(
    map(
        len,
        (
            train_tweet_eval_dataset,
            validation_tweet_eval_dataset,
            test_tweet_eval_dataset,
        ),
    )
)

In [None]:
# Divide the tweet_eval labels by 2.
tweet_eval_normalize_labels = partial(normalize_labels, column_name="label", factor=2)

# Apply the same rescaling to every split, in train/validation/test order.
train_tweet_eval_dataset, validation_tweet_eval_dataset, test_tweet_eval_dataset = (
    split.map(tweet_eval_normalize_labels)
    for split in (
        train_tweet_eval_dataset,
        validation_tweet_eval_dataset,
        test_tweet_eval_dataset,
    )
)

print(train_tweet_eval_dataset["label"][:10])

In [None]:
# Load the political tweets and expose their text under the shared "text" column name.
political_sentiment_dataset = load_dataset(
    "antitheft159/_1342_political_sentiment_analysis"
).rename_column("tweet_text", "text")

train_political_sentiment_dataset = political_sentiment_dataset["train"]
validation_political_sentiment_dataset = political_sentiment_dataset["validation"]
test_political_sentiment_dataset = political_sentiment_dataset["test"]

tuple(
    map(
        len,
        (
            train_political_sentiment_dataset,
            validation_political_sentiment_dataset,
            test_political_sentiment_dataset,
        ),
    )
)

In [None]:
political_normalize_sentiment = partial(normalize_sentiment, column_name="sentiment")

# Replace the textual "sentiment" column with a numeric "label" in every split,
# in train/validation/test order.
(
    train_political_sentiment_dataset,
    validation_political_sentiment_dataset,
    test_political_sentiment_dataset,
) = (
    split.map(political_normalize_sentiment, remove_columns=["sentiment"])
    for split in (
        train_political_sentiment_dataset,
        validation_political_sentiment_dataset,
        test_political_sentiment_dataset,
    )
)


print(train_political_sentiment_dataset["label"][:10])

In [None]:
# This source only has train and test splits.
ml_news_sentiment_dataset = load_dataset("sara-nabhani/ML-news-sentiment")

# Pull out each split and rename its "labels" column to the shared "label" name.
train_ml_news_sentiment_dataset = ml_news_sentiment_dataset["train"].rename_column(
    "labels", "label"
)
test_ml_news_sentiment_dataset = ml_news_sentiment_dataset["test"].rename_column(
    "labels", "label"
)

tuple(map(len, (train_ml_news_sentiment_dataset, test_ml_news_sentiment_dataset)))

In [None]:
# Divide the ML-news labels by 2.
news_sentiment_normalize_labels = partial(
    normalize_labels,
    column_name="label",
    factor=2,
)

# Apply the same rescaling to both splits, train first.
train_ml_news_sentiment_dataset, test_ml_news_sentiment_dataset = (
    split.map(news_sentiment_normalize_labels)
    for split in (train_ml_news_sentiment_dataset, test_ml_news_sentiment_dataset)
)

print(train_ml_news_sentiment_dataset["label"][:10])

In [None]:
# Training pool: the train split of every Hugging Face source plus both Kaggle
# datasets in full (the Kaggle data is not split, so it contributes to training only).
train_dataset = concatenate_datasets(
    [
        train_sst_5_datset,
        train_tweet_eval_dataset,
        train_political_sentiment_dataset,
        train_ml_news_sentiment_dataset,
        kaggle_dataset_1,
        kaggle_dataset_2,
    ]
)
# Validation pool: only the three sources that ship a validation split.
validation_dataset = concatenate_datasets(
    [
        validation_sst_5_datset,
        validation_tweet_eval_dataset,
        validation_political_sentiment_dataset,
    ]
)
# Test pool: the test split of all four Hugging Face sources.
test_dataset = concatenate_datasets(
    [
        test_sst_5_datset,
        test_tweet_eval_dataset,
        test_political_sentiment_dataset,
        test_ml_news_sentiment_dataset,
    ]
)

len(train_dataset), len(validation_dataset), len(test_dataset)

In [None]:
def tokenize(batch):
    """Tokenize the ``text`` entries of ``batch`` with the module-level tokenizer.

    Sequences longer than 512 tokens are truncated; no padding is applied here.

    Args:
        batch: mapping whose ``"text"`` entry holds the text(s) to encode.

    Returns:
        The tokenizer's encoding for ``batch["text"]``.
    """
    return tokenizer(
        batch["text"],
        truncation=True,
        padding=False,
        max_length=512,
    )


# Tokenize every split in batched mode and drop the raw "text" column,
# in train/validation/test order.
train_tokenized_dataset, validation_tokenized_dataset, test_tokenized_dataset = (
    split.map(tokenize, batched=True, remove_columns=["text"])
    for split in (train_dataset, validation_dataset, test_dataset)
)

tuple(
    map(
        len,
        (
            train_tokenized_dataset,
            validation_tokenized_dataset,
            test_tokenized_dataset,
        ),
    )
)

In [None]:
SEED = 42
# Only the training split is shuffled, with a fixed seed.
train_tokenized_dataset = train_tokenized_dataset.shuffle(seed=SEED)

# Have every split yield torch tensors restricted to the three columns used downstream.
train_tokenized_dataset, validation_tokenized_dataset, test_tokenized_dataset = (
    split.with_format("torch", columns=["input_ids", "attention_mask", "label"])
    for split in (
        train_tokenized_dataset,
        validation_tokenized_dataset,
        test_tokenized_dataset,
    )
)

tuple(
    map(
        len,
        (
            train_tokenized_dataset,
            validation_tokenized_dataset,
            test_tokenized_dataset,
        ),
    )
)

In [None]:
# Fold the validation examples into the training set (appended after the shuffled
# training rows).
# NOTE(review): this cell is not idempotent — each re-run appends the validation rows
# again. Any metric later computed on validation_tokenized_dataset would be measured
# on data the model has trained on.
train_tokenized_dataset = concatenate_datasets(
    [train_tokenized_dataset, validation_tokenized_dataset]
)

len(train_tokenized_dataset), len(test_tokenized_dataset)

In [None]:
from transformers import DataCollatorWithPadding

# Pad each batch only up to the longest sequence it contains.
collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")
batch_size = 16

# Settings shared by all three loaders. os.cpu_count() may return None when the
# CPU count cannot be determined; DataLoader needs an integer, so fall back to 0
# (load in the main process).
loader_kwargs = dict(
    batch_size=batch_size,
    collate_fn=collator,
    num_workers=os.cpu_count() or 0,
    pin_memory=True,
)

# Only the training loader reshuffles its data.
train_dataloader = DataLoader(train_tokenized_dataset, shuffle=True, **loader_kwargs)

validation_dataloader = DataLoader(
    validation_tokenized_dataset, shuffle=False, **loader_kwargs
)

test_dataloader = DataLoader(test_tokenized_dataset, shuffle=False, **loader_kwargs)

In [None]:
# Show the ids the tokenizer assigns to its padding, classification and separator tokens.
print("[PAD] token id:", tokenizer.pad_token_id)  # id: 0
print("[CLS] token id:", tokenizer.cls_token_id)  # id: 1
print("[SEP] token id:", tokenizer.sep_token_id)  # id: 2

In [None]:
# Pull a single collated batch as a sanity check on tensor shapes.
# The batch is read with the key "labels", whereas the dataset column is named "label".
batch = next(iter(train_dataloader))
print(batch["input_ids"].shape, batch["attention_mask"].shape, batch["labels"].shape)

In [None]:
# Count the elements of every parameter that has requires_grad set.
print(
    f"The model has {(sum(p.numel() for p in model.parameters() if p.requires_grad)):,} trainable parameters"
)

In [None]:
from transformers import get_linear_schedule_with_warmup

# Hyperparameters: learning rate, number of epochs, gradient-norm clipping threshold.
lr = 1e-5
epochs = 10
clip = 1

optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)

# Smooth L1 loss is the active criterion; BCE-with-logits is kept as an alternative.
# loss_fn = nn.BCEWithLogitsLoss()
loss_fn = nn.SmoothL1Loss(beta=0.1)

# Gradient scaler used by the mixed-precision training step.
scaler = GradScaler()

# One scheduler step per batch: the first 10% of all steps warm up linearly.
num_steps = epochs * len(train_dataloader)
num_warmup = int(num_steps * 0.1)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup,
    num_training_steps=num_steps,
)

In [None]:
def train(model, iterator, optimizer, loss_fn, clip, epoch):
    """Run one mixed-precision training epoch and return the mean batch loss.

    Besides its arguments, this function uses the module-level ``device``,
    ``scaler`` (gradient scaler) and ``scheduler`` (learning-rate schedule).

    Args:
        model: model whose forward output exposes ``.logits``.
        iterator: sized iterable of batches holding ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        optimizer: optimizer that updates ``model``'s parameters.
        loss_fn: callable ``(predictions, labels) -> scalar loss``; it receives
            the sigmoid of the logits, not the raw logits.
        clip: maximum gradient norm applied before each optimizer step.
        epoch: zero-based epoch index, used only in the progress-bar caption.

    Returns:
        The sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    epoch_loss = 0

    pbar = tqdm(
        iterator,
        total=len(iterator),
        desc=f"Epoch {epoch + 1} Progress",
        colour="#005500",
    )
    for batch in pbar:
        src = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        with autocast():
            # Forward pass
            outputs = model(
                input_ids=src,
                attention_mask=mask,
            )
            logits = outputs.logits.squeeze(-1)  # shape: (batch_size)
            probs = torch.sigmoid(logits)

            # Calculate the loss
            loss = loss_fn(probs, labels)

        scaler.scale(loss).backward()
        # Unscale first so that `clip` is compared against the true gradient norm.
        scaler.unscale_(optimizer)
        nn.utils.clip_grad_norm_(model.parameters(), clip)

        scale_before = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        # When gradients contain inf/NaN the scaler skips optimizer.step() and lowers
        # its scale in update(). Advance the schedule only if the optimizer really
        # stepped, so the learning rate stays in sync with the updates applied.
        if scaler.get_scale() >= scale_before:
            scheduler.step()

        batch_loss = loss.item()
        epoch_loss += batch_loss

        pbar.set_postfix(loss=batch_loss)  # Update the loss on the tqdm progress bar

    return epoch_loss / len(iterator)

In [None]:
def evaluate(model, model_path, iterator, loss_fn):
    """Load a checkpoint into ``model`` and return its mean batch loss on ``iterator``.

    Args:
        model: model to load the weights into; its output must expose ``.logits``.
        model_path: path of a saved ``state_dict``.
        iterator: sized iterable of batches holding ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        loss_fn: callable ``(predictions, labels) -> scalar loss``; it receives
            the sigmoid of the logits.

    Returns:
        The sum of the per-batch losses divided by ``len(iterator)``.
    """
    weights = torch.load(model_path, map_location=device)
    model.load_state_dict(weights)
    model.eval()

    total_loss = 0
    with torch.inference_mode():
        for batch in tqdm(iterator):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            targets = batch["labels"].to(device)

            result = model(input_ids=input_ids, attention_mask=attention_mask)
            # One score per example after dropping the trailing size-1 dimension.
            scores = torch.sigmoid(result.logits.squeeze(-1))

            total_loss += loss_fn(scores, targets).item()

    return total_loss / len(iterator)

In [None]:
def get_accuracy(model, model_path, iterator):
    """Load a checkpoint and return binary accuracy at a 0.5 threshold.

    Predictions (sigmoid of the logits) and labels are both binarised with
    ``>= 0.5``, so a label of exactly 0.5 falls on the positive side.

    Args:
        model: model to load the weights into; its output must expose ``.logits``.
        model_path: path of a saved ``state_dict``.
        iterator: iterable of batches holding ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.

    Returns:
        Fraction of examples whose binarised prediction equals the binarised label.
    """
    weights = torch.load(model_path, map_location=device)
    model.load_state_dict(weights)
    model.eval()

    hits, seen = 0, 0
    with torch.inference_mode():
        for batch in tqdm(iterator):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            targets = batch["labels"].to(device)

            result = model(input_ids=input_ids, attention_mask=attention_mask)
            scores = torch.sigmoid(result.logits.squeeze(-1))

            predicted_positive = scores >= 0.5
            actual_positive = targets >= 0.5

            hits += (predicted_positive == actual_positive).sum().item()
            seen += targets.size(0)

    return hits / seen

In [None]:
def get_prob_accuracy(model, model_path, iterator, tolerance: float = 0.2):
    """Load a checkpoint and return the share of predictions close to their label.

    A prediction (sigmoid of the logit) counts as correct when its absolute
    difference from the label is at most ``tolerance``.

    Args:
        model: model to load the weights into; its output must expose ``.logits``.
        model_path: path of a saved ``state_dict``.
        iterator: iterable of batches holding ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        tolerance: largest absolute error still counted as correct.

    Returns:
        Fraction of examples within ``tolerance`` of their label.
    """
    weights = torch.load(model_path, map_location=device)
    model.load_state_dict(weights)
    model.eval()

    hits, seen = 0, 0
    with torch.inference_mode():
        for batch in tqdm(iterator):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            targets = batch["labels"].to(device)

            result = model(input_ids=input_ids, attention_mask=attention_mask)
            scores = torch.sigmoid(result.logits.squeeze(-1))

            within_tolerance = torch.abs(scores - targets) <= tolerance
            hits += within_tolerance.sum().item()
            seen += targets.size(0)

    return hits / seen

In [None]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into whole minutes and seconds.

    Args:
        start_time: starting timestamp, in seconds.
        end_time: ending timestamp, in seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [None]:
# NOTE(review): best_valid_loss is initialised here but no visible cell reads or
# updates it — confirm whether it is still needed.
best_valid_loss = float("inf")
model_path = "political_pulse_model.pt"

# Resume from an existing checkpoint in the working directory, if there is one.
if os.path.exists(model_path):
    print(f"Loading model from {model_path}...")
    model.load_state_dict(torch.load(model_path, map_location=device))

In [None]:
# Set to False to skip training and go straight to the cells that load a saved checkpoint.
should_train = True

if should_train:
    for epoch in tqdm(range(epochs), desc="Training progress", colour="#00ff00"):
        # One checkpoint per epoch; the name is built once and reused for saving
        # and for every evaluation helper below.
        checkpoint_path = f"political_pulse_model_{epoch + 1}.pt"

        start_time = time.time()

        train_loss = train(
            model=model,
            iterator=train_dataloader,
            optimizer=optimizer,
            loss_fn=loss_fn,
            clip=clip,
            epoch=epoch,
        )

        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        torch.save(model.state_dict(), checkpoint_path)
        message = f"Epoch: {epoch + 1} | Time: {epoch_mins}m {epoch_secs}s --> STORED"

        # Each helper reloads the checkpoint just written and makes its own pass
        # over the test set.
        test_loss = evaluate(
            model=model,
            model_path=checkpoint_path,
            iterator=test_dataloader,
            loss_fn=loss_fn,
        )

        test_acc = get_accuracy(
            model=model,
            model_path=checkpoint_path,
            iterator=test_dataloader,
        )

        test_prob_acc = get_prob_accuracy(
            model=model,
            model_path=checkpoint_path,
            iterator=test_dataloader,
            tolerance=0.2,
        )

        print(message)
        print(
            f"Train Loss: {train_loss:.6f} | Test Loss: {test_loss:.6f} | Test Accuracy: {test_acc:.6f} | Test Prob Accuracy: {test_prob_acc:.6f}"
        )

In [None]:
# Restore the checkpoint saved after epoch 8 and switch to inference mode.
# NOTE(review): the epoch number is hard-coded — presumably picked from the metrics
# printed during training; confirm it matches the run you want to ship.
model.load_state_dict(torch.load("political_pulse_model_8.pt", map_location=device))
model.eval()

In [None]:
def score_text_sentiment(text: str, target: str | None = None):
    """Score ``text`` with the module-level model and return a value in (0, 1).

    Args:
        text: text to score.
        target: optional entity the sentiment is about. When truthy, the text is
            prefixed with ``<TARGET> {target} </TARGET> [SEP]`` before tokenization.

    Returns:
        The sigmoid of the model's single logit, as a Python float.
    """
    if target:
        prompt = f"<TARGET> {target} </TARGET> [SEP] {text}"
    else:
        prompt = text

    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(device)

    with torch.inference_mode():
        result = model(
            input_ids=encoded.input_ids,
            attention_mask=encoded.attention_mask,
        )
        raw_score = result.logits.squeeze(-1)  # shape: (1)

    return torch.sigmoid(raw_score).item()

In [None]:
# Smoke test: score one sentence about a named target.
print(
    score_text_sentiment(
        "Constituents loved the representative.", target="Rep. Nikema Williams"
    )
)

In [None]:
!pip install onnx onnxruntime

In [None]:
import onnx
import transformers
import transformers.convert_graph_to_onnx as onnx_convert
from pathlib import Path

# Restore the epoch-8 checkpoint onto the CPU and switch to inference mode before export.
model.load_state_dict(torch.load("political_pulse_model_8.pt", map_location="cpu"))
model.eval()
model = model.to("cpu")

# Wrap the model and tokenizer in a pipeline object, which the export cell passes to
# convert_pytorch.
pipeline = transformers.pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
# The model is moved to the CPU a second time after the pipeline is built — presumably
# in case pipeline construction relocated it; confirm whether this is still needed.
model = model.to("cpu")

In [None]:
# Export the pipeline's model to a single ONNX file (opset 14, no external data file),
# with gradient tracking disabled during the conversion.
with torch.no_grad():
    onnx_convert.convert_pytorch(
        pipeline,
        opset=14,
        output=Path("political_pulse_sentiment_analysis.onnx"),
        use_external_format=False,
    )

In [None]:
from onnxruntime.quantization import quantize_dynamic, QuantType

# Dynamically quantize the exported model and write the result to a second file.
# The weights use the unsigned 8-bit type (QUInt8) even though the output name says "int8".
quantize_dynamic(
    "political_pulse_sentiment_analysis.onnx",
    "political_pulse_sentiment_analysis_int8.onnx",
    weight_type=QuantType.QUInt8,
)