In [3]:
import torch
from transformers import T5Tokenizer, MT5ForConditionalGeneration, AdamW
from nltk.translate.bleu_score import sentence_bleu
from torch.optim.lr_scheduler import ReduceLROnPlateau
import random


In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [7]:
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')
t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [8]:
input_text = "Translate the given text from English to French: why do we run for future rather than living the present"

In [9]:
input_ids=t5_tokenizer.encode(input_text, return_tensors='pt')

In [10]:
input_ids

tensor([[30355,    15,     8,   787,  1499,    45,  1566,    12,  2379,    10,
           572,   103,    62,   661,    21,   647,  1066,   145,   840,     8,
           915,     1]])

In [11]:
outputs = t5_model.generate(input_ids)
output_text = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)



In [12]:
output_text

"Pourquoi allons-nous courir pour l'avenir plutôt que pour vivre le"

In [4]:
# Load the mT5 model and tokenizer for multilingual support
model_name = "google/mt5-base"  # Use mT5 for multilingual support
tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

In [1]:

# Preprocess the data
def preprocess_data(data_file):
    english_sentences = []
    tamil_sentences = []
    with open(data_file, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                en, ta, _ = line.split('\t')
                english_sentences.append(f"translate English to Tamil: {en.strip()}")
                tamil_sentences.append(ta.strip())
    return english_sentences, tamil_sentences

# Load the dataset
data_file = "tam.txt"
english_sentences, tamil_sentences = preprocess_data(data_file)

# Training loop with teacher forcing and batching
def train_model(model, optimizer, scheduler, english_sentences, tamil_sentences, num_epochs, batch_size, device):
    model.to(device)
    for epoch in range(num_epochs):
        total_loss = 0.0
        # Shuffle the data before each epoch for randomness
        data = list(zip(english_sentences, tamil_sentences))
        random.shuffle(data)
        english_sentences, tamil_sentences = zip(*data)

        for i in range(0, len(english_sentences), batch_size):
            batch_english_sentences = english_sentences[i:i+batch_size]
            batch_tamil_sentences = tamil_sentences[i:i+batch_size]

            if len(batch_english_sentences) == 0:  # Skip empty batches
                continue

            batch_english_input_ids = tokenizer(batch_english_sentences, padding=True, return_tensors="pt", max_length=512, truncation=True)["input_ids"].to(device)
            batch_tamil_input_ids = tokenizer(batch_tamil_sentences, padding=True, return_tensors="pt", max_length=512, truncation=True)["input_ids"].to(device)

            # Model forward pass with teacher forcing
            outputs = model(input_ids=batch_english_input_ids, labels=batch_tamil_input_ids)
            loss = outputs.loss
            total_loss += loss.item()

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        average_loss = total_loss / (len(english_sentences) // batch_size)
        print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {average_loss:.4f}")
        scheduler.step(average_loss)

# Set training parameters
num_epochs = 30  # Increase epochs for better learning
batch_size = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, no_deprecation_warning=True)
scheduler = ReduceLROnPlateau(optimizer, patience=3, factor=0.1)

# Train the model
train_model(model, optimizer, scheduler, english_sentences, tamil_sentences, num_epochs, batch_size, device)

# Save the fine-tuned model
def save_model_contiguously(model, save_path):
    for name, param in model.named_parameters():
        if not param.is_contiguous():
            param.data = param.data.contiguous()
    model.save_pretrained(save_path)

save_model_contiguously(model, "mt5-tamil")

# Evaluate the model (Optional)
def evaluate_model(model, english_sentences, tamil_sentences):
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        total_bleu_score = 0.0
        for english_sentence, tamil_sentence in zip(english_sentences, tamil_sentences):
            input_ids = tokenizer(english_sentence, return_tensors="pt", max_length=512, truncation=True).input_ids.to(device)
            outputs = model.generate(input_ids, max_length=512, num_beams=5, early_stopping=True)
            predicted_tamil = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Calculate BLEU score
            reference = tamil_sentence.split()  # Reference tokenized sentence
            candidate = predicted_tamil.split()  # Candidate tokenized sentence
            bleu_score = sentence_bleu([reference], candidate)
            total_bleu_score += bleu_score

            print(f"English: {english_sentence}")
            print(f"Actual Tamil: {tamil_sentence}")
            print(f"Predicted Tamil: {predicted_tamil}")
            print(f"BLEU Score: {bleu_score:.4f}")
            print()

        avg_bleu = total_bleu_score / len(english_sentences)
        print(f"Average BLEU Score: {avg_bleu:.4f}")

# Evaluate the model on a subset of examples
evaluate_model(model, english_sentences[:5], tamil_sentences[:5])

# Translation function
def translate_to_tamil(english_sentence):
    input_sentence = f"translate English to Tamil: {english_sentence}"
    input_ids = tokenizer(input_sentence, return_tensors="pt", max_length=512, truncation=True).input_ids.to(device)
    outputs = model.generate(input_ids=input_ids, max_length=512, num_beams=5, early_stopping=True)
    predicted_tamil = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return predicted_tamil

# Test translation
custom_english = "I am very happy now."
translated_tamil = translate_to_tamil(custom_english)
print(f"Translated Tamil: {translated_tamil}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Epoch 1/30, Average Loss: 29.1696
Epoch 2/30, Average Loss: 20.3454
Epoch 3/30, Average Loss: 15.0821
Epoch 4/30, Average Loss: 10.4251
Epoch 5/30, Average Loss: 9.3051
Epoch 6/30, Average Loss: 8.4694
Epoch 7/30, Average Loss: 7.5820
Epoch 8/30, Average Loss: 6.7872
Epoch 9/30, Average Loss: 6.8328
Epoch 10/30, Average Loss: 6.4486
Epoch 11/30, Average Loss: 6.0714
Epoch 12/30, Average Loss: 5.3969
Epoch 13/30, Average Loss: 5.3445
Epoch 14/30, Average Loss: 5.1336
Epoch 15/30, Average Loss: 4.7097
Epoch 16/30, Average Loss: 4.5061
Epoch 17/30, Average Loss: 4.0370
Epoch 18/30, Average Loss: 3.2352
Epoch 19/30, Average Loss: 2.1659
Epoch 20/30, Average Loss: 1.3603
Epoch 21/30, Average Loss: 1.0550
Epoch 22/30, Average Loss: 0.6056
Epoch 23/30, Average Loss: 0.5019
Epoch 24/30, Average Loss: 0.4185
Epoch 25/30, Average Loss: 0.3920
Epoch 26/30, Average Loss: 0.3409
Epoch 27/30, Average Loss: 0.2929
Epoch 28/30, Average Loss: 0.2980
Epoch 29/30, Average Loss: 0.2798
Epoch 30/30, Averag

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


English: translate English to Tamil: I slept.
Actual Tamil: நான் தூங்கினேன்.
Predicted Tamil: நான் தூங்கினேன்.
BLEU Score: 0.0000

English: translate English to Tamil: Calm down.
Actual Tamil: அமைதியாக இருங்கள்
Predicted Tamil: அமர அழியுங்கள்
BLEU Score: 0.0000



The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


English: translate English to Tamil: I'll walk.
Actual Tamil: நான் நடப்பேன்.
Predicted Tamil: நான் நடந்து செல்ல வேண்டுமென்று எண்ணுகிறேன்
BLEU Score: 0.0000

English: translate English to Tamil: Who is he?
Actual Tamil: அவன் யார்?
Predicted Tamil: யார் எங்களது யார்?
BLEU Score: 0.0000

English: translate English to Tamil: Who knows?
Actual Tamil: யாருக்குத் தெரியும்?
Predicted Tamil: யார் என்ன  ?
BLEU Score: 0.0000

Average BLEU Score: 0.0000
Translated Tamil: நான் தற்போது மகிழ்ச்சியாக இருக்கிறேன்
