In [6]:
import os
import torch
from datasets import load_dataset, load_metric
from transformers import (
    PreTrainedTokenizerFast,
    Trainer, 
    TrainingArguments, 
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
import sentencepiece as spm

In [7]:
# Load only the "train" split of the "amazon_polarity" dataset; a later cell
# carves this split into separate training and evaluation portions.
sentiment_dataset = load_dataset("amazon_polarity", split="train")  

In [8]:
#Phase 2: Experiments (Fine-tuning Models)

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM  # For translation tasks, e.g., MarianMT

def load_custom_tokenizer(tokenizer_type="bpe"):
    """Load one of the locally stored custom tokenizers by its short name.

    Args:
        tokenizer_type: One of "bpe", "sp" or "wp". Selects which directory
            under ``tokenizers/`` is passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        Whatever ``AutoTokenizer.from_pretrained(<directory>, use_fast=True)``
        returns for the selected directory.

    Raises:
        ValueError: If ``tokenizer_type`` matches none of the supported names.
    """
    # Ordered (short name, directory) pairs. Matching uses ``==`` so that any
    # argument value, hashable or not, falls through to the ValueError below.
    known_locations = (
        ("bpe", "tokenizers/bpe"),
        ("sp", "tokenizers/sp_unigram_hf"),
        ("wp", "tokenizers/wp"),
    )
    for short_name, directory in known_locations:
        if tokenizer_type == short_name:
            return AutoTokenizer.from_pretrained(directory, use_fast=True)
    raise ValueError("Unsupported tokenizer type")

# The BPE tokenizer is the one used by the preprocessing function, the Trainer
# and the single-review inference check further down.
baseline_tokenizer = load_custom_tokenizer("bpe")

In [9]:
# Hold out 20% of the loaded examples for evaluation; the fixed seed keeps the
# partition identical from run to run.
split_dataset = sentiment_dataset.train_test_split(seed=42, test_size=0.2)
train_dataset, eval_dataset = split_dataset["train"], split_dataset["test"]

def preprocess_function_sentiment(examples):
    """Tokenize a batch of review texts and attach their labels.

    Args:
        examples: Batch mapping that provides a "content" entry (the texts)
            and a "label" entry (the targets).

    Returns:
        The object returned by ``baseline_tokenizer`` for the texts, with the
        batch's labels stored under the additional key "labels".
    """
    # Read both columns up front so a missing column fails before tokenizing.
    review_texts, review_labels = examples["content"], examples["label"]

    # Every text is truncated and padded to exactly 128 tokens.
    encoded_batch = baseline_tokenizer(
        review_texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    encoded_batch["labels"] = review_labels
    return encoded_batch

# Tokenize each partition batch-wise, drop the raw text/label columns that the
# preprocessing step has replaced, and have indexing return torch tensors.
tokenized_train = (
    train_dataset
    .map(preprocess_function_sentiment, batched=True)
    .remove_columns(["content", "label"])
    .with_format("torch")
)
tokenized_eval = (
    eval_dataset
    .map(preprocess_function_sentiment, batched=True)
    .remove_columns(["content", "label"])
    .with_format("torch")
)


Map:   0%|          | 0/2880000 [00:00<?, ? examples/s]

Map:   0%|          | 0/720000 [00:00<?, ? examples/s]

In [10]:
# Sentiment classification uses a BERT encoder here; for translation tasks a
# MarianMT or mBART model would be the usual choice, and XLM-R or mBERT for
# multilingual sentiment.

# Pick the accelerator: Apple MPS first, then CUDA, otherwise CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Two-class classifier on top of the pretrained checkpoint. The classification
# head is newly initialised, so the model must be fine-tuned before use.
model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# `baseline_tokenizer` is loaded from a local directory rather than from this
# checkpoint, so its vocabulary size may differ from the model's. Token ids at
# or beyond the embedding table size would fail inside the embedding lookup, so
# grow the table when the tokenizer is larger (it is never shrunk).
# NOTE(review): even after resizing, the pretrained embedding rows are not
# aligned with the custom tokenizer's tokens - confirm this pairing is the
# intended experiment.
if len(baseline_tokenizer) > model.config.vocab_size:
    model.resize_token_embeddings(len(baseline_tokenizer))

model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Fine-tuning configuration for the BPE sentiment run.
train_args = TrainingArguments(
    # Where checkpoints and logs go; nothing is pushed to the Hub.
    output_dir="checkpoints/sentiment_bpe",
    logging_dir="logs",
    push_to_hub=False,
    # Training length and per-device batch sizes.
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, keep at most two checkpoints and
    # reload the best one when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # Half precision is only switched on when the selected device is CUDA.
    fp16=(device.type == "cuda"),
)

# Accuracy metric object consumed by `compute_metrics`.
# NOTE(review): the recorded output of this cell shows a warning about
# `trust_remote_code` for this call - consider moving off `load_metric`.
metric_accuracy = load_metric("accuracy")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [14]:
def compute_metrics(eval_pred):
    """Compute classification accuracy from raw model outputs.

    Accuracy is computed directly with numpy, so the function does not depend
    on a separately loaded metric object or on any downloaded metric script.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is array-like with
            the class scores on the last axis, or a tuple whose first element
            is that array. ``labels`` is array-like with the reference class
            indices.

    Returns:
        dict: ``{"accuracy": <float in [0, 1]>}``.
    """
    import numpy as np  # local import keeps this cell self-contained

    logits, labels = eval_pred
    # Some models return several output tensors; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]

    predictions = np.asarray(logits).argmax(axis=-1)
    references = np.asarray(labels)
    return {"accuracy": float((predictions == references).mean())}

# Bundle the model, its training configuration, both tokenized partitions, the
# tokenizer and the metric callback into one Trainer.
trainer = Trainer(
    args=train_args,
    model=model,
    tokenizer=baseline_tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_eval,
    train_dataset=tokenized_train,
)

  trainer = Trainer(


In [15]:
# Start fine-tuning with the configuration held by `trainer`.
# NOTE(review): the recorded output shows 900000 total steps and a
# KeyboardInterrupt, i.e. this run did not finish - consider training on a
# subsample while iterating.
trainer.train()

  0%|          | 0/900000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [9]:
# Score the current model on the evaluation split.
results = trainer.evaluate()
print("Evaluation Results:", results)

# Sanity check on one hand-written review.
sample_text = "I absolutely loved this product, it exceeded my expectations!"

# Tokenize with the same truncation limit as the training-time preprocessing
# so that inference inputs never exceed the length the model was tuned on.
encoded = baseline_tokenizer(
    sample_text,
    return_tensors="pt",
    truncation=True,
    max_length=128,
).to(device)

# Switch to inference mode explicitly so dropout is off no matter which mode
# earlier cells (or an out-of-order run) left the model in.
model.eval()
with torch.no_grad():
    output = model(**encoded)

# Class index 1 is reported as positive, anything else as negative.
pred = output.logits.argmax(dim=-1).item()
sentiment = "Positive" if pred == 1 else "Negative"
print(f"Review: {sample_text}\nPredicted Sentiment: {sentiment}")

  0%|          | 0/450 [00:00<?, ?it/s]

Evaluation Results: {'eval_loss': 0.6930168271064758, 'eval_accuracy': 0.5084722222222222, 'eval_runtime': 109.5626, 'eval_samples_per_second': 65.716, 'eval_steps_per_second': 4.107, 'epoch': 10.0}
Review: I absolutely loved this product, it exceeded my expectations!
Predicted Sentiment: Positive
