## **Importing Dependencies**

In [1]:
import torch
print(torch.cuda.is_available())  
print(torch.cuda.device_count()) 
print(torch.cuda.get_device_name(0)) 


True
1
NVIDIA GeForce RTX 2060


In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from sacrebleu.metrics import BLEU

import warnings
warnings.filterwarnings('ignore')

if torch.cuda.is_available():
    print("CUDA is available!  Training on GPU ...")
else:
    print("CUDA is not available.  Training on CPU ...")

  from .autonotebook import tqdm as notebook_tqdm


CUDA is available!  Training on GPU ...


## **Loading Dataset**

In [3]:
dataset = load_dataset("opus_books","en-es")

In [5]:
print(dataset["train"][8])

{'id': '8', 'translation': {'en': "In the society of his nephew and niece, and their children, the old Gentleman's days were comfortably spent.", 'es': 'En compañía de su sobrino y sobrina, y de los hijos de ambos, la vida transcurrió confortablemente para el anciano caballero.'}}


In [6]:
## Splitting the dataset into train and test

dataset = dataset["train"].train_test_split(test_size=0.2)
train_dataset = dataset["train"]
val_dataset = dataset["test"]

## **Tokenization**

`Tokenization is the process of breaking down text into smaller units, like words or subwords, which our transformer model can understand and process.`

**Why Tokenization Matters**

`Transformer models don't work directly with raw text. They require numerical representations of the text, and tokenization is the bridge between the two. Each word or subword is assigned a unique numerical ID, allowing the model to manipulate these IDs for translation.`

In [7]:
# Model checkpoint for English-to-Spanish translation
model_checkpoint = "Helsinki-NLP/opus-mt-en-es"

# Load the tokenizer
tokenizer  = AutoTokenizer.from_pretrained(model_checkpoint)

In [8]:
def tokenize_function(examples):
    # Check if 'translation' is a dictionary or a list of dictionaries
    if isinstance(examples["translation"], dict):
        # Single example: Create a list with the single dictionary
        examples["translation"] = [examples["translation"]]

    return tokenizer(
        [x["en"] for x in examples["translation"]],
        text_target=[x["es"] for x in examples["translation"]],
        padding="max_length",
        truncation=True,
    )


In [9]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 74776/74776 [00:42<00:00, 1754.56 examples/s]
Map: 100%|██████████| 18694/18694 [00:10<00:00, 1857.85 examples/s]


In [10]:
print(tokenized_dataset['train'][2])
print(tokenized_dataset['test'][2])

{'id': '73854', 'translation': {'en': '"Master is well aware," Conseil replied, "that I\'m not seasoned in practical application.', 'es': '-El señor sabe muy bien que la práctica no es mi dominio.'}, 'input_ids': [52, 45941, 31, 255, 4217, 1896, 13307, 110, 1610, 17412, 2, 52, 9764, 33, 20, 92, 64, 6341, 118, 16, 4101, 1177, 3, 0, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65

**Dataloader**

In [11]:
from torch.utils.data import DataLoader

# Prepare for dataloader
train_dataset = tokenized_dataset["train"].shuffle(seed=42).with_format("torch")
val_dataset = tokenized_dataset["test"].shuffle(seed=42).with_format("torch")

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(val_dataset, batch_size=8)

**Loading Pre-trained model**

* Model: MarianMT
* Checkpoint: Helsinki-NLP/opus-mt-en-es

In [12]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

## **Fine-Tuning**

In [13]:
from transformers import (
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

# Data Collator to prepare batches
data_collator = DataCollatorForSeq2Seq(
    tokenizer, model=model, label_pad_token_id=tokenizer.pad_token_id
)


# Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True # Use mixed precision if your GPU supports it
)


## Create the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)


## Start Training
trainer.train()

  0%|          | 1/14022 [00:41<160:49:34, 41.29s/it]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.98 GiB. GPU 