## install libraries

In [None]:
# Install the transformers library
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K  

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## import libraries

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
#from datasets import load_metric
import random
import pandas as pd
import os

## load data

In [None]:
file_path = "/content/drive/MyDrive/dev.csv"
df = pd.read_csv(file_path)

In [None]:
english_sentences=df['en_text']
persian_sentences=df['fa_text']

## define Data Tokenization function for dataset

In [None]:

# Define a custom Dataset class
class TranslationDataset(Dataset):
    def __init__(self, tokenizer, source_texts, target_texts, max_length=128):
        self.tokenizer = tokenizer
        self.source_texts = source_texts
        self.target_texts = target_texts
        self.max_length = max_length

    def __len__(self):
        return len(self.source_texts)

    def __getitem__(self, index):
        source_text = self.source_texts[index]
        target_text = self.target_texts[index]

        # Tokenize the source and target texts
        source_encoding = self.tokenizer(
            source_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        target_encoding = self.tokenizer(
            target_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        labels = target_encoding["input_ids"]
        labels[labels == self.tokenizer.pad_token_id] = -100  # Replace padding token id's in labels by -100
        return {
            "input_ids": source_encoding["input_ids"].squeeze(),
            "attention_mask": source_encoding["attention_mask"].squeeze(),
            "labels": labels.squeeze(),
        }


## Initialize the model and tokenizer

In [None]:

model_name = "persiannlp/mt5-small-parsinlu-translation_en_fa"  # Use t5-small to save resources
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


In [None]:
# List of English sentences and corresponding Persian sentences for training
source_texts = english_sentences

target_texts = persian_sentences
# Prepare the dataset
train_dataset = TranslationDataset(tokenizer, source_texts, target_texts)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

## model training

In [None]:


# Training Loop
num_epochs = 4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        optimizer.step()

    print(f"Epoch {epoch + 1}/{num_epochs} completed. Loss: {loss.item()}")







Epoch 1/4 completed. Loss: 3.2383179664611816
Epoch 2/4 completed. Loss: 2.3874683380126953
Epoch 3/4 completed. Loss: 2.077216625213623
Epoch 4/4 completed. Loss: 2.5593278408050537


In [None]:
#torch.save(model, '/content/drive/MyDrive/t5_translator')

In [None]:
for param in model.parameters():
    param.data = param.data.contiguous()

model.save_pretrained('/content/drive/MyDrive/t5_translator1')

## Test the translation

In [None]:
model = T5ForConditionalGeneration.from_pretrained('/content/drive/MyDrive/t5_translator1').to(device)
def translate_text(text):
    model.eval()
    input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
    outputs = model.generate(input_ids)
    decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded_text

# Example test
test_sentence = "How is the weather today?"
translated_sentence = translate_text(test_sentence)
print(f"English: {test_sentence}")
print(f"Persian: {translated_sentence}")



# Optionally, zip and download the model
!zip -r t5_english_persian_model.zip t5_english_persian_model



English: How is the weather today?
Persian: هوا چطوره؟

zip error: Nothing to do! (try: zip -r t5_english_persian_model.zip . -i t5_english_persian_model)


# t5 output without finetuning

In [None]:
from transformers import pipeline

In [None]:
translator = pipeline("translation_en_to_fa", model="persiannlp/mt5-small-parsinlu-translation_en_fa")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
