In [1]:
import os
import torch
from datasets import load_dataset, load_metric
from transformers import (
    PreTrainedTokenizerFast,
    Trainer, 
    TrainingArguments, 
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
import sentencepiece as spm

In [2]:
# Both corpora are restricted to the "train[:1%]" slice to keep the experiment small.
translation_dataset = load_dataset(path="wmt14", name="de-en", split="train[:1%]")
# English-only stand-in for the sentiment task; swap in a multilingual corpus later.
sentiment_dataset = load_dataset(path="amazon_polarity", split="train[:1%]")

In [3]:
#Phase 2: Experiments (Fine-tuning Models)

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM  # For translation tasks, e.g., MarianMT

def load_custom_tokenizer(tokenizer_type="bpe"):
    """Return the fast tokenizer registered under ``tokenizer_type``.

    The short name is mapped to a tokenizer location which is handed to
    ``AutoTokenizer.from_pretrained(..., use_fast=True)``. This is a
    simplified loader; in practice the tokenizer files would be wrapped
    with ``PreTrainedTokenizerFast``.

    Args:
        tokenizer_type: One of ``"bpe"``, ``"sp"`` or ``"wp"``.

    Returns:
        Whatever ``AutoTokenizer.from_pretrained`` returns for that location.

    Raises:
        ValueError: If ``tokenizer_type`` is not one of the supported names.
    """
    # Ordered (name, location) pairs, compared in the same order as before.
    known_locations = (
        ("bpe", "tokenizers/bpe"),
        ("sp", "tokenizers/sp_unigram_hf"),
        ("wp", "tokenizers/wp"),
    )
    for short_name, location in known_locations:
        if tokenizer_type == short_name:
            return AutoTokenizer.from_pretrained(location, use_fast=True)
    raise ValueError("Unsupported tokenizer type")

# Tokenizer shared by preprocessing, the Trainer and the sample prediction;
# "sp" selects the "tokenizers/sp_unigram_hf" entry of load_custom_tokenizer.
baseline_tokenizer = load_custom_tokenizer("sp")

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [4]:
# Hold out 20% of the sentiment data for evaluation; the fixed seed keeps the split repeatable.
split_dataset = sentiment_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, eval_dataset = split_dataset["train"], split_dataset["test"]

def preprocess_function_sentiment(examples):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        examples: Mapping with a ``"content"`` entry (the texts) and a
            ``"label"`` entry (the targets).

    Returns:
        The output of ``baseline_tokenizer`` for the texts (truncated and
        padded to 128 tokens) with the targets stored under ``"labels"``.
    """
    review_bodies, polarity = examples["content"], examples["label"]
    # Fixed-length encoding: every example is cut or padded to 128 tokens.
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    batch = baseline_tokenizer(review_bodies, **encoding_options)
    batch["labels"] = polarity
    return batch

def _tokenize_and_format(split):
    """Tokenize one split, drop its raw text/label columns and emit torch tensors."""
    prepared = split.map(preprocess_function_sentiment, batched=True)
    prepared = prepared.remove_columns(["content", "label"])
    prepared.set_format("torch")
    return prepared


# Same three-step pipeline for both splits.
tokenized_train = _tokenize_and_format(train_dataset)
tokenized_eval = _tokenize_and_format(eval_dataset)


Map:   0%|          | 0/28800 [00:00<?, ? examples/s]

Map:   0%|          | 0/7200 [00:00<?, ? examples/s]

In [5]:
# For translation tasks, you might use a MarianMT model or mBART, for sentiment XLM-R or mBERT.

# Pick the best available accelerator: Apple MPS first, then CUDA, otherwise CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# baseline_tokenizer is a custom tokenizer, not the one shipped with
# `model_name`, so its vocabulary size need not match the checkpoint's
# embedding matrix. Resize the embeddings so that every id the tokenizer can
# emit maps to a valid row (an out-of-range id would otherwise index past the
# end of the embedding table).
# NOTE(review): the pretrained embedding rows still do not correspond to this
# tokenizer's tokens, so expect the embeddings to need retraining - confirm
# that this pairing is what the experiment intends.
tokenizer_vocab_size = len(baseline_tokenizer)
if model.get_input_embeddings().num_embeddings != tokenizer_vocab_size:
    model.resize_token_embeddings(tokenizer_vocab_size)

model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Mixed precision is requested only when the selected device is CUDA.
use_half_precision = device.type == "cuda"

train_args = TrainingArguments(
    # Output locations.
    output_dir="checkpoints/sentiment_sp",
    logging_dir="logs",
    push_to_hub=False,
    # Training schedule and batch sizes.
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=use_half_precision,
    # Evaluate and checkpoint once per epoch, keep at most two checkpoints,
    # and reload the best one when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Accuracy metric object used when scoring evaluation predictions.
metric_accuracy = load_metric("accuracy")

  metric_accuracy = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [7]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    The accuracy is computed directly from the arrays, so this function does
    not depend on a metric object created in another cell.

    Args:
        eval_pred: A pair ``(logits, labels)``. ``logits`` is an array whose
            last axis indexes the classes; if it is a tuple of outputs, the
            first element is taken as the logits. ``labels`` holds the
            reference class ids.

    Returns:
        dict: ``{"accuracy": float}``, the fraction of examples whose
        arg-max class equals the reference label.
    """
    import numpy as np  # local import keeps this cell self-contained

    logits, labels = eval_pred
    # Some models return several outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]

    predictions = np.asarray(logits).argmax(axis=-1)
    references = np.asarray(labels)
    return {"accuracy": float((predictions == references).mean())}

# Gather the Trainer inputs in one place, then build the Trainer from them.
trainer_inputs = {
    "model": model,
    "args": train_args,
    "train_dataset": tokenized_train,
    "eval_dataset": tokenized_eval,
    "tokenizer": baseline_tokenizer,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_inputs)

  trainer = Trainer(


In [8]:
# Run the fine-tuning loop configured in train_args (evaluation and
# checkpointing happen once per epoch); the returned TrainOutput is displayed.
trainer.train()

  0%|          | 0/18000 [00:00<?, ?it/s]

{'loss': 0.6997, 'grad_norm': 1.7761789560317993, 'learning_rate': 4.8611111111111115e-05, 'epoch': 0.28}
{'loss': 0.6975, 'grad_norm': 1.7528256177902222, 'learning_rate': 4.722222222222222e-05, 'epoch': 0.56}
{'loss': 0.6958, 'grad_norm': 5.4670729637146, 'learning_rate': 4.5833333333333334e-05, 'epoch': 0.83}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.697838544845581, 'eval_accuracy': 0.4915277777777778, 'eval_runtime': 239.2729, 'eval_samples_per_second': 30.091, 'eval_steps_per_second': 1.881, 'epoch': 1.0}
{'loss': 0.6952, 'grad_norm': 1.9082987308502197, 'learning_rate': 4.4444444444444447e-05, 'epoch': 1.11}
{'loss': 0.6981, 'grad_norm': 1.2722952365875244, 'learning_rate': 4.305555555555556e-05, 'epoch': 1.39}
{'loss': 0.6961, 'grad_norm': 2.096461296081543, 'learning_rate': 4.166666666666667e-05, 'epoch': 1.67}
{'loss': 0.696, 'grad_norm': 1.2440341711044312, 'learning_rate': 4.027777777777778e-05, 'epoch': 1.94}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.6950634121894836, 'eval_accuracy': 0.4915277777777778, 'eval_runtime': 223.3938, 'eval_samples_per_second': 32.23, 'eval_steps_per_second': 2.014, 'epoch': 2.0}
{'loss': 0.6953, 'grad_norm': 2.9187896251678467, 'learning_rate': 3.888888888888889e-05, 'epoch': 2.22}
{'loss': 0.6947, 'grad_norm': 1.654943585395813, 'learning_rate': 3.7500000000000003e-05, 'epoch': 2.5}
{'loss': 0.6948, 'grad_norm': 1.7665257453918457, 'learning_rate': 3.611111111111111e-05, 'epoch': 2.78}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.6930907368659973, 'eval_accuracy': 0.5084722222222222, 'eval_runtime': 242.7198, 'eval_samples_per_second': 29.664, 'eval_steps_per_second': 1.854, 'epoch': 3.0}
{'loss': 0.6954, 'grad_norm': 1.2543911933898926, 'learning_rate': 3.472222222222222e-05, 'epoch': 3.06}
{'loss': 0.6939, 'grad_norm': 0.9448590874671936, 'learning_rate': 3.3333333333333335e-05, 'epoch': 3.33}
{'loss': 0.6946, 'grad_norm': 0.6667232513427734, 'learning_rate': 3.194444444444444e-05, 'epoch': 3.61}
{'loss': 0.6948, 'grad_norm': 0.7234439849853516, 'learning_rate': 3.055555555555556e-05, 'epoch': 3.89}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.6938490271568298, 'eval_accuracy': 0.5084722222222222, 'eval_runtime': 225.1059, 'eval_samples_per_second': 31.985, 'eval_steps_per_second': 1.999, 'epoch': 4.0}
{'loss': 0.6943, 'grad_norm': 1.3128876686096191, 'learning_rate': 2.916666666666667e-05, 'epoch': 4.17}
{'loss': 0.6952, 'grad_norm': 1.4523448944091797, 'learning_rate': 2.777777777777778e-05, 'epoch': 4.44}
{'loss': 0.6944, 'grad_norm': 1.024256944656372, 'learning_rate': 2.6388888888888892e-05, 'epoch': 4.72}
{'loss': 0.6945, 'grad_norm': 0.9965711832046509, 'learning_rate': 2.5e-05, 'epoch': 5.0}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.6930047869682312, 'eval_accuracy': 0.5084722222222222, 'eval_runtime': 222.7672, 'eval_samples_per_second': 32.321, 'eval_steps_per_second': 2.02, 'epoch': 5.0}
{'loss': 0.6945, 'grad_norm': 0.9632492065429688, 'learning_rate': 2.361111111111111e-05, 'epoch': 5.28}
{'loss': 0.6946, 'grad_norm': 0.8889995813369751, 'learning_rate': 2.2222222222222223e-05, 'epoch': 5.56}
{'loss': 0.6941, 'grad_norm': 1.8074257373809814, 'learning_rate': 2.0833333333333336e-05, 'epoch': 5.83}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.6942467093467712, 'eval_accuracy': 0.4915277777777778, 'eval_runtime': 212.3755, 'eval_samples_per_second': 33.902, 'eval_steps_per_second': 2.119, 'epoch': 6.0}
{'loss': 0.6938, 'grad_norm': 1.536942720413208, 'learning_rate': 1.9444444444444445e-05, 'epoch': 6.11}
{'loss': 0.6938, 'grad_norm': 1.7304617166519165, 'learning_rate': 1.8055555555555555e-05, 'epoch': 6.39}
{'loss': 0.6946, 'grad_norm': 1.508159875869751, 'learning_rate': 1.6666666666666667e-05, 'epoch': 6.67}
{'loss': 0.6934, 'grad_norm': 0.845543622970581, 'learning_rate': 1.527777777777778e-05, 'epoch': 6.94}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.6931324601173401, 'eval_accuracy': 0.5084722222222222, 'eval_runtime': 225.0046, 'eval_samples_per_second': 31.999, 'eval_steps_per_second': 2.0, 'epoch': 7.0}
{'loss': 0.6939, 'grad_norm': 0.7339354753494263, 'learning_rate': 1.388888888888889e-05, 'epoch': 7.22}
{'loss': 0.6936, 'grad_norm': 0.6099230647087097, 'learning_rate': 1.25e-05, 'epoch': 7.5}
{'loss': 0.6936, 'grad_norm': 0.5457590222358704, 'learning_rate': 1.1111111111111112e-05, 'epoch': 7.78}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.6931387186050415, 'eval_accuracy': 0.5084722222222222, 'eval_runtime': 219.5334, 'eval_samples_per_second': 32.797, 'eval_steps_per_second': 2.05, 'epoch': 8.0}
{'loss': 0.6937, 'grad_norm': 1.352515459060669, 'learning_rate': 9.722222222222223e-06, 'epoch': 8.06}
{'loss': 0.6938, 'grad_norm': 1.9672133922576904, 'learning_rate': 8.333333333333334e-06, 'epoch': 8.33}
{'loss': 0.6935, 'grad_norm': 0.9132937788963318, 'learning_rate': 6.944444444444445e-06, 'epoch': 8.61}
{'loss': 0.6933, 'grad_norm': 1.405375361442566, 'learning_rate': 5.555555555555556e-06, 'epoch': 8.89}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.693058967590332, 'eval_accuracy': 0.5084722222222222, 'eval_runtime': 236.6355, 'eval_samples_per_second': 30.427, 'eval_steps_per_second': 1.902, 'epoch': 9.0}
{'loss': 0.694, 'grad_norm': 1.4425933361053467, 'learning_rate': 4.166666666666667e-06, 'epoch': 9.17}
{'loss': 0.6934, 'grad_norm': 2.410526990890503, 'learning_rate': 2.777777777777778e-06, 'epoch': 9.44}
{'loss': 0.6928, 'grad_norm': 1.5421983003616333, 'learning_rate': 1.388888888888889e-06, 'epoch': 9.72}
{'loss': 0.6936, 'grad_norm': 1.473081350326538, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.6930209398269653, 'eval_accuracy': 0.5084722222222222, 'eval_runtime': 118.9082, 'eval_samples_per_second': 60.551, 'eval_steps_per_second': 3.784, 'epoch': 10.0}
{'train_runtime': 33435.6222, 'train_samples_per_second': 8.614, 'train_steps_per_second': 0.538, 'train_loss': 0.6946745435926649, 'epoch': 10.0}


TrainOutput(global_step=18000, training_loss=0.6946745435926649, metrics={'train_runtime': 33435.6222, 'train_samples_per_second': 8.614, 'train_steps_per_second': 0.538, 'total_flos': 1.894399598592e+16, 'train_loss': 0.6946745435926649, 'epoch': 10.0})

In [9]:
# Final evaluation on the held-out split.
results = trainer.evaluate()
print("Evaluation Results:", results)


# Single-example sanity check of the fine-tuned classifier.
sample_text = "I absolutely loved this product, it exceeded my expectations!"

# Encode with the same truncation limit used when preparing the training data
# (max_length=128) and place the tensors on whatever device the model is on.
encoded = baseline_tokenizer(
    sample_text,
    truncation=True,
    max_length=128,
    return_tensors="pt",
).to(model.device)

# Make inference mode explicit instead of relying on trainer.evaluate() having
# left the model in eval mode.
model.eval()
with torch.no_grad():
    output = model(**encoded)

# Class id 1 is reported as "Positive", anything else as "Negative".
pred = output.logits.argmax(dim=-1).item()
sentiment = "Positive" if pred == 1 else "Negative"
print(f"Review: {sample_text}\nPredicted Sentiment: {sentiment}")

  0%|          | 0/450 [00:00<?, ?it/s]

Evaluation Results: {'eval_loss': 0.6930047869682312, 'eval_accuracy': 0.5084722222222222, 'eval_runtime': 180.6422, 'eval_samples_per_second': 39.858, 'eval_steps_per_second': 2.491, 'epoch': 10.0}
Review: I absolutely loved this product, it exceeded my expectations!
Predicted Sentiment: Positive
