# ADSP 32018: Final Project
## Sentiment Analysis

Peyton Nash

### Setup

In [106]:
%pip install -r requirements.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


^C
Note: you may need to restart the kernel to use updated packages.


In [None]:
%pip install pyarrow

Collecting pyarrow
  Using cached pyarrow-21.0.0-cp310-cp310-macosx_12_0_arm64.whl.metadata (3.3 kB)
Using cached pyarrow-21.0.0-cp310-cp310-macosx_12_0_arm64.whl (31.2 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-21.0.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

import numpy as np
import pandas as pd
import torch
from collections import Counter

import matplotlib.pyplot as plt

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer
)
from transformers import Trainer, TrainingArguments

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    confusion_matrix,
    f1_score
)

import torch
import torch.nn as nn
import accelerate

In [None]:
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Setting device to '{device}'")

Setting device to 'cpu'


In [None]:
# Load data
df = pd.read_parquet('output_data2/labeled2.parquet')[['text', 'sentiment']]
df = df[~df.sentiment.isna()]

In [None]:
label_map = {label: idx for idx, label in enumerate(set(df['sentiment']))}

df['label'] = df['sentiment'].map(label_map)

# Commented out because tryng with weights instead of class balance
## Create separate DFs for each class
#df_pos = df[df.sentiment == 'POSITIVE']
#df_neg = df[df.sentiment == 'NEGATIVE']
#df_net = df[df.sentiment == 'NEUTRAL']
#
## Downsample larger classes
#size = min(len(df_pos), len(df_neg), len(df_net))
#
#df_pos = resample(df_pos, replace=False, n_samples=size, random_state=42) if len(df_pos) != size else df_pos
#df_neg = resample(df_neg, replace=False, n_samples=size, random_state=42) if len(df_neg) != size else df_neg
#df_net = resample(df_net, replace=False, n_samples=size, random_state=42) if len(df_net) != size else df_net
#
## Concatenate
#df_ds = pd.concat([df_pos, df_neg, df_net])
#df_ds = df_ds.sample(frac=1).reset_index(drop=True)


In [None]:
# Split into train and test with class balance
train, test = train_test_split(
    df,
    test_size=0.3,
    stratify=df['label'],
    random_state=42
)

test, eval = train_test_split(
    test,
    test_size=0.5,
    stratify=test['label'],
    random_state=42
)

print(f'Balanced distribution of labels: {train.groupby("sentiment")["sentiment"].count()}')

Balanced distribution of labels: sentiment
NEGATIVE    141
NEUTRAL     386
POSITIVE    617
Name: sentiment, dtype: int64


### Model Training

In [115]:
n_classes = len(set(df['sentiment']))
model_name = 'FacebookAI/roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=n_classes).to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [116]:
# Define vars
MAX_LEN = 512
CHUNK_SIZE = 510
STRIDE = 300

# Tokenize in chunks
def chunk_and_tokenize(text, label):
    # Tokenize
    inputs = tokenizer(
        str(text),
        truncation=True,
        padding="max_length",
        max_length=CHUNK_SIZE,
        stride=STRIDE,
        return_overflowing_tokens=True,
        return_tensors="pt"
    )

    # Attach label to each chunk
    input_chunks = []
    for i in range(len(inputs["input_ids"])):
        input_chunks.append({
            "input_ids": inputs["input_ids"][i],
            "attention_mask": inputs["attention_mask"][i],
            "label": torch.tensor(label)
        })
    return input_chunks

train_chunk = []
for _, row in train.iterrows():
    train_chunk.extend(chunk_and_tokenize(row['text'], row['label']))

test_chunk = []
for _, row in test.iterrows():
    test_chunk.extend(chunk_and_tokenize(row['text'], row['label']))

eval_chunk = []
for _, row in eval.iterrows():
    eval_chunk.extend(chunk_and_tokenize(row['text'], row['label']))

In [117]:
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted")
    }

In [119]:
# Weight loss by class
df['label'] = df['sentiment'].map(label_map)
class_counts = np.bincount(df['label'])
class_weights = 1.0/class_counts
class_weights = class_weights / class_weights.sum() * n_classes

class_weights = torch.tensor(class_weights, dtype=torch.float)
class_weights = class_weights.to(model.device)

def weighted_loss(outputs, label):
    loss_fct = nn.CrossEntropyLoss(weight=class_weights)
    return loss_fct(outputs.logits, labels)

In [124]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        loss = nn.CrossEntropyLoss(weight=class_weights.to(model.device))(outputs.logits, labels)
        return (loss, outputs) if return_outputs else loss

In [125]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy='epoch',
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_chunk,
    eval_dataset=eval_chunk
)
#trainer.train()

In [126]:
trainer.train()



Epoch,Training Loss,Validation Loss


Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Insufficient Memory (00000008:kIOGPUCommandBufferCallbackErrorOutOfMemory)
	<AGXG15XFamilyCommandBuffer: 0x17b0c8910>
    label = <none> 
    device = <AGXG15SDevice: 0x127d1ee00>
        name = Apple M3 Pro 
    commandQueue = <AGXG15XFamilyCommandQueue: 0x127a9aa00>
        label = <none> 
        device = <AGXG15SDevice: 0x127d1ee00>
            name = Apple M3 Pro 
    retainedReferences = 1
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Insufficient Memory (00000008:kIOGPUCommandBufferCallbackErrorOutOfMemory)
	<AGXG15XFamilyCommandBuffer: 0x17b0f6dc0>
    label = <none> 
    device = <AGXG15SDevice: 0x127d1ee00>
        name = Apple M3 Pro 
    commandQueue = <AGXG15XFamilyCommandQueue: 0x127a9aa00>
        label = <none> 
        device =

KeyboardInterrupt: 

#### Document-Level Metrics

In [None]:
def predict_document(text, model, tokenizer, chunk_size=510, stride=300):
    inputs = tokenizer(
        str(text),
        truncation=True,
        padding=True,
        max_length=chunk_size,
        stride=stride,
        return_overflowing_tokens=True,
        return_tensors="pt"
    )

    logits_list = []
    for i in range(len(inputs["input_ids"])):
        input_ids = inputs["input_ids"][i].unsqueeze(0).to(device)
        attention_mask = inputs["attention_mask"][i].unsqueeze(0).to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        logits_list.append(outputs.logits.cpu().numpy())

    # Stack logits
    logits_array = np.vstack(logits_list)

    # Logarithmic weights (later chunks higher weight)
    #weights = np.log1p(np.arange(len(logits_array)))  # log(1), log(2), ..., log(n)

    # Handle case where there's only 1 chunk (weights=0) → set to 1
    #if weights.sum() == 0:
    #    weights = np.ones_like(weights)

    # Weighted average of logits
    weighted_avg_logits = np.max(logits_array, axis=0)

    # Prediction
    pred_label = int(np.argmax(weighted_avg_logits))
    return pred_label

In [None]:
model.to(device)
model.eval()

test_preds = []
for text in test['text'].values:
    pred = predict_document(text, model, tokenizer)
    test_preds.append(pred)

In [None]:
test_labels = list(test['label'])

accuracy = accuracy_score(test_labels, test_preds)
f1 = f1_score(test_labels, test_preds, average='weighted')

print("Document-level Test Accuracy:", accuracy)
print("Document-level Test F1:", f1)

Document-level Test Accuracy: 0.7107438016528925
Document-level Test F1: 0.6932980739798921


In [None]:
df[['label', 'sentiment']]

Unnamed: 0,label,sentiment
0,1,NEUTRAL
1,1,NEUTRAL
2,1,NEUTRAL
3,0,POSITIVE
4,2,NEGATIVE
...,...,...
1995,1,NEUTRAL
1996,0,POSITIVE
1997,1,NEUTRAL
1998,1,NEUTRAL


In [None]:
print(confusion_matrix(test_labels, test_preds))

[[33  5  2]
 [ 8 16 16]
 [ 0  4 37]]


#### Chunk-Level Metrics

In [None]:
# Create containers
preds = []
labels = []

# Evaluate
with torch.no_grad():
    for i in test_chunk:
        input_ids = i['input_ids'].unsqueeze(0).to(device)
        attention_mask = i['attention_mask'].unsqueeze(0).to(device)
        label = i['label'].unsqueeze(0).to(device)

        output = model(input_ids, attention_mask=attention_mask)
        pred = torch.argmax(output.logits, dim=-1)

        preds.extend(pred)
        labels.extend(label)

In [None]:
accuracy = accuracy_score(labels, preds)
f1 = f1_score(labels, preds, average='weighted')

print("Chunk-level Test Accuracy:", accuracy)
print("Chunk-level Test F1:", f1)

Chunk-level Test Accuracy: 0.6512345679012346
Chunk-level Test F1: 0.6483557747342976


In [None]:
print(confusion_matrix(labels, preds))

[[87 29  9]
 [18 46 37]
 [ 5 15 78]]


In [None]:
len(preds)

324

In [None]:
# Attempt with siebert/sentiment-roberta-large-english (untuned)

In [None]:
model_name2 = 'nlptown/bert-base-multilingual-uncased-sentiment'
tokenizer2 = AutoTokenizer.from_pretrained(model_name)
model2 = AutoModelForSequenceClassification.from_pretrained(model_name2, num_labels=5).to(device)

In [None]:
train_chunk2 = [{'input_ids':item['input_ids'],
                'attention_mask':item['attention_mask'],
                'label':5-item['label']*2} for item in train_chunk]

test_chunk2 = [{'input_ids':item['input_ids'],
                'attention_mask':item['attention_mask'],
                'label':5-item['label']*2} for item in test_chunk]

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    metric_for_best_model="f1"
)

trainer2 = Trainer(
    model=model2,
    args=training_args,
    train_dataset=train_chunk2
)
#trainer.train()

In [None]:
trainer2.train()



Step,Training Loss
50,2.6346
100,2.595
150,2.8554


Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Insufficient Memory (00000008:kIOGPUCommandBufferCallbackErrorOutOfMemory)
	<AGXG15XFamilyCommandBuffer: 0x17ca47d70>
    label = <none> 
    device = <AGXG15SDevice: 0x127d1ee00>
        name = Apple M3 Pro 
    commandQueue = <AGXG15XFamilyCommandQueue: 0x127a9aa00>
        label = <none> 
        device = <AGXG15SDevice: 0x127d1ee00>
            name = Apple M3 Pro 
    retainedReferences = 1
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Insufficient Memory (00000008:kIOGPUCommandBufferCallbackErrorOutOfMemory)
	<AGXG15XFamilyCommandBuffer: 0x17921a6e0>
    label = <none> 
    device = <AGXG15SDevice: 0x127d1ee00>
        name = Apple M3 Pro 
    commandQueue = <AGXG15XFamilyCommandQueue: 0x127a9aa00>
        label = <none> 
        device =

KeyboardInterrupt: 

In [None]:
model2.to(device)
model2.eval()

test_preds2 = []
for text in test['text'].values:
    pred = predict_document(text, model2, tokenizer2)
    test_preds2.append(pred)

In [None]:
test_preds2

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [None]:
test_labels2 = [5 - item * 2 for item in test_labels]

In [None]:
test_labels

In [None]:
print(confusion_matrix(test_labels2, test_preds2))

[[41  0  0]
 [40  0  0]
 [40  0  0]]
