In [1]:
import os
import torch
from datasets import load_dataset, load_metric
from transformers import (
    PreTrainedTokenizerFast,
    Trainer, 
    TrainingArguments, 
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
import sentencepiece as spm

In [2]:
# Development-sized slices: only the first 1% of each train split is loaded.
translation_dataset = load_dataset(
    path="wmt14",
    name="de-en",
    split="train[:1%]",
)
sentiment_dataset = load_dataset(
    path="amazon_polarity",
    split="train[:1%]",
)

In [3]:
#Phase 2: Experiments (Fine-tuning Models)

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM  # For translation tasks, e.g., MarianMT

def load_custom_tokenizer(tokenizer_type="bpe"):
    """Load one of the locally stored tokenizers by its short name.

    Args:
        tokenizer_type: "bpe", "sp" or "wp"; selects the directory under
            ``tokenizers/`` that is handed to ``AutoTokenizer.from_pretrained``.

    Returns:
        The tokenizer loaded from the selected directory (fast implementation
        requested via ``use_fast=True``).

    Raises:
        ValueError: If ``tokenizer_type`` is not one of the supported names.
    """
    # (short name, directory) pairs; scanned with `==` to keep the comparison
    # semantics of a plain if/elif chain.
    known_dirs = (
        ("bpe", "tokenizers/bpe"),
        ("sp", "tokenizers/sp_unigram_hf"),
        ("wp", "tokenizers/wp"),
    )
    for kind, directory in known_dirs:
        if tokenizer_type == kind:
            return AutoTokenizer.from_pretrained(directory, use_fast=True)
    raise ValueError("Unsupported tokenizer type")

# Tokenizer used by every later cell in this run: the "wp" variant.
baseline_tokenizer = load_custom_tokenizer(tokenizer_type="wp")

In [4]:
# Hold out 20% of the sentiment slice for evaluation; the fixed seed keeps the
# split reproducible across kernel restarts.
split_dataset = sentiment_dataset.train_test_split(seed=42, test_size=0.2)
train_dataset, eval_dataset = split_dataset["train"], split_dataset["test"]

def preprocess_function_sentiment(examples):
    """Tokenize a batch of reviews and attach the matching labels.

    Args:
        examples: Batch mapping with a "content" entry (the texts passed to
            ``baseline_tokenizer``) and a "label" entry.

    Returns:
        The tokenizer output, truncated/padded to 128 tokens per text, with
        the batch's labels stored under the extra key "labels".
    """
    review_texts = examples["content"]
    review_labels = examples["label"]

    # Every sequence is padded to the same fixed length of 128 tokens.
    encoding = baseline_tokenizer(
        review_texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    encoding["labels"] = review_labels
    return encoding

# Tokenize both splits in batches, then drop the raw columns that the
# preprocessing function has already consumed.
tokenized_train = train_dataset.map(
    preprocess_function_sentiment, batched=True
).remove_columns(["content", "label"])
tokenized_eval = eval_dataset.map(
    preprocess_function_sentiment, batched=True
).remove_columns(["content", "label"])

# Have both datasets hand back torch tensors when indexed.
for _tokenized_split in (tokenized_train, tokenized_eval):
    _tokenized_split.set_format("torch")


Map:   0%|          | 0/28800 [00:00<?, ? examples/s]

Map:   0%|          | 0/7200 [00:00<?, ? examples/s]

In [5]:
# For translation tasks, you might use a MarianMT model or mBART, for sentiment XLM-R or mBERT.

# Pick the accelerator in priority order: Apple MPS, then CUDA, otherwise CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# The inputs are encoded with `baseline_tokenizer`, which is loaded from a local
# directory rather than from `model_name`. If that tokenizer can emit ids beyond
# the checkpoint's embedding table, the forward pass would fail with an
# index-out-of-range error, so grow the table to cover the tokenizer's vocabulary.
# NOTE(review): presumably the custom vocabulary does not line up with the
# checkpoint's pretrained embeddings; if so the pretrained rows carry no meaning
# for these ids -- confirm whether training from a fresh config is intended.
tokenizer_vocab_size = len(baseline_tokenizer)
if tokenizer_vocab_size > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(tokenizer_vocab_size)

model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Training configuration for the WordPiece sentiment run.
train_args = TrainingArguments(
    output_dir="checkpoints/sentiment_wp",
    evaluation_strategy="epoch",      # evaluate once per epoch
    save_strategy="epoch",            # checkpoint on the same schedule as evaluation
    logging_dir="logs",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_total_limit=2,               # keep at most two checkpoints on disk
    load_best_model_at_end=True,      # restore the best checkpoint after training
    fp16=(device.type == "cuda"),     # mixed precision only when running on CUDA
    push_to_hub=False,
)
# NOTE(review): no learning_rate is set, so the library default applies. The
# recorded run with this configuration stayed at chance-level accuracy; consider
# revisiting the learning rate / warmup before re-running for 10 epochs.

# The installed `datasets` release warns that `trust_remote_code=True` will be
# mandatory for loading this metric, so pass it explicitly.
metric_accuracy = load_metric("accuracy", trust_remote_code=True)

  metric_accuracy = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [7]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Accuracy is computed directly from the arrays, so the function does not
    depend on a metric object created in another cell.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is an array whose last
            axis indexes the classes (or a tuple whose first element is that
            array); ``labels`` is an array of the reference class ids.

    Returns:
        ``{"accuracy": float}`` -- the fraction of examples whose arg-max
        prediction equals the label.
    """
    logits, labels = eval_pred
    # Some models return extra outputs next to the logits; keep only the logits.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = logits.argmax(axis=-1)
    return {"accuracy": float((predictions == labels).mean())}

# Wire the model, training configuration, tokenized splits and metric function
# into a single Trainer.
# NOTE(review): constructing the Trainer emitted a warning in the recorded run;
# presumably it concerns the `tokenizer=` argument -- confirm against the
# installed transformers version before renaming it.
trainer = Trainer(
    args=train_args,
    model=model,
    tokenizer=baseline_tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_eval,
    train_dataset=tokenized_train,
)

  trainer = Trainer(


In [8]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
# NOTE(review): in the run recorded with this notebook the training loss stays
# around 0.69-0.70 and eval accuracy at 0.49-0.51 for all 10 epochs, i.e. about
# chance level for two classes -- investigate the tokenizer/model pairing and
# the optimizer settings before trusting any result from this run.
trainer.train()

  0%|          | 0/18000 [00:00<?, ?it/s]

{'loss': 0.7067, 'grad_norm': 1.5463447570800781, 'learning_rate': 4.8611111111111115e-05, 'epoch': 0.28}
{'loss': 0.7037, 'grad_norm': 2.660998821258545, 'learning_rate': 4.722222222222222e-05, 'epoch': 0.56}
{'loss': 0.7012, 'grad_norm': 11.847258567810059, 'learning_rate': 4.5833333333333334e-05, 'epoch': 0.83}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.6944168210029602, 'eval_accuracy': 0.4915277777777778, 'eval_runtime': 242.2197, 'eval_samples_per_second': 29.725, 'eval_steps_per_second': 1.858, 'epoch': 1.0}
{'loss': 0.7003, 'grad_norm': 3.52889347076416, 'learning_rate': 4.4444444444444447e-05, 'epoch': 1.11}
{'loss': 0.6974, 'grad_norm': 1.674487829208374, 'learning_rate': 4.305555555555556e-05, 'epoch': 1.39}
{'loss': 0.6977, 'grad_norm': 2.19793701171875, 'learning_rate': 4.166666666666667e-05, 'epoch': 1.67}
{'loss': 0.6961, 'grad_norm': 1.1906877756118774, 'learning_rate': 4.027777777777778e-05, 'epoch': 1.94}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.6962893605232239, 'eval_accuracy': 0.4915277777777778, 'eval_runtime': 224.7011, 'eval_samples_per_second': 32.043, 'eval_steps_per_second': 2.003, 'epoch': 2.0}
{'loss': 0.6952, 'grad_norm': 3.198172092437744, 'learning_rate': 3.888888888888889e-05, 'epoch': 2.22}
{'loss': 0.6954, 'grad_norm': 1.66465163230896, 'learning_rate': 3.7500000000000003e-05, 'epoch': 2.5}
{'loss': 0.6951, 'grad_norm': 1.7826969623565674, 'learning_rate': 3.611111111111111e-05, 'epoch': 2.78}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.6931620836257935, 'eval_accuracy': 0.4915277777777778, 'eval_runtime': 233.3945, 'eval_samples_per_second': 30.849, 'eval_steps_per_second': 1.928, 'epoch': 3.0}
{'loss': 0.6955, 'grad_norm': 1.1851756572723389, 'learning_rate': 3.472222222222222e-05, 'epoch': 3.06}
{'loss': 0.6946, 'grad_norm': 0.9047381281852722, 'learning_rate': 3.3333333333333335e-05, 'epoch': 3.33}
{'loss': 0.695, 'grad_norm': 0.6642925143241882, 'learning_rate': 3.194444444444444e-05, 'epoch': 3.61}
{'loss': 0.699, 'grad_norm': 1.4564002752304077, 'learning_rate': 3.055555555555556e-05, 'epoch': 3.89}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.6934797763824463, 'eval_accuracy': 0.4915277777777778, 'eval_runtime': 281.4101, 'eval_samples_per_second': 25.585, 'eval_steps_per_second': 1.599, 'epoch': 4.0}
{'loss': 0.7024, 'grad_norm': 2.4821858406066895, 'learning_rate': 2.916666666666667e-05, 'epoch': 4.17}
{'loss': 0.7036, 'grad_norm': 3.5028090476989746, 'learning_rate': 2.777777777777778e-05, 'epoch': 4.44}
{'loss': 0.6985, 'grad_norm': 1.4746946096420288, 'learning_rate': 2.6388888888888892e-05, 'epoch': 4.72}
{'loss': 0.7012, 'grad_norm': 2.959958791732788, 'learning_rate': 2.5e-05, 'epoch': 5.0}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.6930100321769714, 'eval_accuracy': 0.5084722222222222, 'eval_runtime': 223.9019, 'eval_samples_per_second': 32.157, 'eval_steps_per_second': 2.01, 'epoch': 5.0}
{'loss': 0.6987, 'grad_norm': 2.621695041656494, 'learning_rate': 2.361111111111111e-05, 'epoch': 5.28}
{'loss': 0.701, 'grad_norm': 2.4576234817504883, 'learning_rate': 2.2222222222222223e-05, 'epoch': 5.56}
{'loss': 0.6987, 'grad_norm': 5.230316162109375, 'learning_rate': 2.0833333333333336e-05, 'epoch': 5.83}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.6930035352706909, 'eval_accuracy': 0.5084722222222222, 'eval_runtime': 215.9958, 'eval_samples_per_second': 33.334, 'eval_steps_per_second': 2.083, 'epoch': 6.0}
{'loss': 0.6962, 'grad_norm': 3.423686981201172, 'learning_rate': 1.9444444444444445e-05, 'epoch': 6.11}
{'loss': 0.6978, 'grad_norm': 4.0530500411987305, 'learning_rate': 1.8055555555555555e-05, 'epoch': 6.39}
{'loss': 0.6984, 'grad_norm': 5.10493803024292, 'learning_rate': 1.6666666666666667e-05, 'epoch': 6.67}
{'loss': 0.6962, 'grad_norm': 1.4359593391418457, 'learning_rate': 1.527777777777778e-05, 'epoch': 6.94}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.6930024027824402, 'eval_accuracy': 0.5084722222222222, 'eval_runtime': 224.1352, 'eval_samples_per_second': 32.123, 'eval_steps_per_second': 2.008, 'epoch': 7.0}
{'loss': 0.6963, 'grad_norm': 1.278152346611023, 'learning_rate': 1.388888888888889e-05, 'epoch': 7.22}
{'loss': 0.6974, 'grad_norm': 1.5501155853271484, 'learning_rate': 1.25e-05, 'epoch': 7.5}
{'loss': 0.6979, 'grad_norm': 1.2144404649734497, 'learning_rate': 1.1111111111111112e-05, 'epoch': 7.78}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.6930044293403625, 'eval_accuracy': 0.5084722222222222, 'eval_runtime': 218.3198, 'eval_samples_per_second': 32.979, 'eval_steps_per_second': 2.061, 'epoch': 8.0}
{'loss': 0.6988, 'grad_norm': 3.6359405517578125, 'learning_rate': 9.722222222222223e-06, 'epoch': 8.06}
{'loss': 0.6982, 'grad_norm': 5.77197265625, 'learning_rate': 8.333333333333334e-06, 'epoch': 8.33}
{'loss': 0.6968, 'grad_norm': 2.116910696029663, 'learning_rate': 6.944444444444445e-06, 'epoch': 8.61}
{'loss': 0.6962, 'grad_norm': 2.8858444690704346, 'learning_rate': 5.555555555555556e-06, 'epoch': 8.89}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.6939027309417725, 'eval_accuracy': 0.5084722222222222, 'eval_runtime': 222.1538, 'eval_samples_per_second': 32.41, 'eval_steps_per_second': 2.026, 'epoch': 9.0}
{'loss': 0.6965, 'grad_norm': 3.3407113552093506, 'learning_rate': 4.166666666666667e-06, 'epoch': 9.17}
{'loss': 0.6969, 'grad_norm': 4.7902960777282715, 'learning_rate': 2.777777777777778e-06, 'epoch': 9.44}
{'loss': 0.6954, 'grad_norm': 3.3527462482452393, 'learning_rate': 1.388888888888889e-06, 'epoch': 9.72}
{'loss': 0.6939, 'grad_norm': 3.0110442638397217, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.6933375597000122, 'eval_accuracy': 0.4915277777777778, 'eval_runtime': 224.1168, 'eval_samples_per_second': 32.126, 'eval_steps_per_second': 2.008, 'epoch': 10.0}
{'train_runtime': 32603.2927, 'train_samples_per_second': 8.833, 'train_steps_per_second': 0.552, 'train_loss': 0.6980445624457465, 'epoch': 10.0}


TrainOutput(global_step=18000, training_loss=0.6980445624457465, metrics={'train_runtime': 32603.2927, 'train_samples_per_second': 8.833, 'train_steps_per_second': 0.552, 'total_flos': 1.894399598592e+16, 'train_loss': 0.6980445624457465, 'epoch': 10.0})

In [9]:
# Final evaluation on the held-out split.
results = trainer.evaluate()
print("Evaluation Results:", results)

# Single-example smoke test of the fine-tuned classifier.
sample_text = "I absolutely loved this product, it exceeded my expectations!"

# Switch to inference mode explicitly instead of relying on whatever mode an
# earlier cell left the model in (dropout must be off for a stable prediction).
model.eval()

# Encode with the same truncation limit as the training data and place the
# tensors on whichever device the model currently lives on.
encoded = baseline_tokenizer(
    sample_text,
    return_tensors="pt",
    truncation=True,
    max_length=128,
).to(model.device)

with torch.no_grad():
    output = model(**encoded)

# Class id 1 is reported as positive, anything else as negative.
pred = output.logits.argmax(dim=-1).item()
sentiment = "Positive" if pred == 1 else "Negative"
print(f"Review: {sample_text}\nPredicted Sentiment: {sentiment}")

  0%|          | 0/450 [00:00<?, ?it/s]

Evaluation Results: {'eval_loss': 0.6930024027824402, 'eval_accuracy': 0.5084722222222222, 'eval_runtime': 180.8636, 'eval_samples_per_second': 39.809, 'eval_steps_per_second': 2.488, 'epoch': 10.0}
Review: I absolutely loved this product, it exceeded my expectations!
Predicted Sentiment: Positive
