<a href="https://colab.research.google.com/github/Parjival/Coding/blob/main/Challenge1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as py
!pip install transformers datasets



In [None]:
from datasets import load_dataset

# Download the transliteration dataset from the Hugging Face Hub.
dataset = load_dataset("SKNahin/bengali-transliteration-data")

# Hold out 20% of the 'train' split for validation. The seed is fixed so that
# the same rows land in each split on every run (Restart & Run All reproduces
# the same train/validation partition).
SPLIT_SEED = 42
train_test_split = dataset['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

Training samples: 4004
Validation samples: 1002


In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the pretrained checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="google/mt5-small",
)

def preprocess_function(examples):
    """Tokenize one batch of source/target text pairs for seq2seq training.

    Args:
        examples: Batch mapping of column name -> list of strings, as handed
            over by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the input texts (truncated/padded to 128
        tokens) with an extra ``"labels"`` entry holding the target token ids,
        where every padding position is replaced by -100.
    """
    # Columns are looked up by position on the global ``train_dataset``.
    # NOTE(review): this assumes column 0 holds the Banglish text and column 1
    # the Bangla text — confirm against the dataset schema.
    banglish_column = train_dataset.column_names[0]
    bangla_column = train_dataset.column_names[1]

    inputs = examples[banglish_column]
    targets = examples[bangla_column]

    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=128, truncation=True, padding='max_length')

    # Targets are padded to a fixed length, so most label positions are pad
    # tokens. Mark them with -100 (the index the loss ignores) so the model is
    # not trained to predict padding. Code that decodes these labels must map
    # -100 back to the pad id first.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenize both splits in batched mode; the tokenized datasets replace the
# raw ones under the same names.
train_dataset, val_dataset = [
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset)
]



Map:   0%|          | 0/4004 [00:00<?, ? examples/s]

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

In [None]:
from transformers import MT5ForConditionalGeneration

# Pretrained mT5 checkpoint used as the starting point for fine-tuning.
model = MT5ForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path="google/mt5-small",
)

In [None]:
from transformers import Seq2SeqTrainer,Seq2SeqTrainingArguments
training_args = Seq2SeqTrainingArguments(
    # Optimisation
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluation: run once per epoch and score generated sequences.
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # Logging
    logging_dir="./logs",
    logging_steps=50,
    # Checkpoints: written to output_dir every 200 steps, at most 2 kept.
    output_dir="./results",
    save_steps=200,
    save_total_limit=2,
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that assembles the tokenized examples into batches for the trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Wire model, hyper-parameters, data and collator into a single trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)



  trainer = Seq2SeqTrainer(


In [None]:
import torch

# Sanity check: push two training examples through the model once and look at
# the loss before starting the real training run.
sample_rows = train_dataset[0:2]

# Build tensors on the same device as the model (creating the trainer may have
# moved the model to an accelerator) and include the attention mask so the
# padded input positions are not attended to.
batch = {
    key: torch.tensor(sample_rows[key]).to(model.device)
    for key in ("input_ids", "attention_mask", "labels")
}

# No gradients are needed for a one-off check, so skip building the graph.
with torch.no_grad():
    model_output = model(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        labels=batch["labels"],
    )


print("Model Output Loss:", model_output.loss)

Model Output Loss: tensor(61.8264, grad_fn=<NllLossBackward0>)


In [None]:
# Run fine-tuning with the trainer built in the earlier cell. As the last
# expression of the cell, the value returned by train() is displayed.
trainer.train()

In [None]:
import numpy as np

# Generate predictions for the whole validation split.
predictions = trainer.predict(val_dataset)


def _decode_token_ids(token_ids):
    """Decode a batch of token-id sequences into a list of strings.

    Args:
        token_ids: 2-D array-like of token ids, or a tuple whose first item is
            such an array.

    Returns:
        One decoded string per row, with special tokens removed.
    """
    if isinstance(token_ids, tuple):
        token_ids = token_ids[0]
    ids = np.asarray(token_ids)
    # Negative fill values (such as -100) are not real vocabulary ids and make
    # decoding fail. Swap them for the pad id, which skip_special_tokens then
    # drops, instead of discarding the whole row as "[UKN]".
    ids = np.where(ids < 0, tokenizer.pad_token_id, ids)
    return tokenizer.batch_decode(ids, skip_special_tokens=True)


decoded_preds = _decode_token_ids(predictions.predictions)
decoded_labels = _decode_token_ids(predictions.label_ids)

# Show the column that preprocess_function tokenizes as the model input
# (resolved by position, exactly as it is there) rather than a hard-coded key.
input_column = val_dataset.column_names[0]

# Print a handful of examples; guard against splits with fewer than 5 rows.
for i in range(min(5, len(decoded_preds))):
    print(f"Input: {val_dataset[i][input_column]}")
    print(f"Prediction: {decoded_preds[i]}")
    print(f"Target: {decoded_labels[i]}")
    print("-" * 30)

In [None]:

# Write the fine-tuned weights and the tokenizer files into one directory so
# the pair can be reloaded together.
export_dir = "./banglish_to_bangla"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)