In [1]:
%pip install datasets transformers accelerate


Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset

dataset = load_dataset("SKNahin/bengali-transliteration-data")


In [3]:
print(dataset)
# e.g. DatasetDict({
#   train: Dataset({
#       features: ['banglish', 'bangla'],
#       num_rows: ...
#   })
#   test: Dataset({
#       features: ['banglish', 'bangla'],
#       num_rows: ...
#   })
# })


DatasetDict({
    train: Dataset({
        features: ['bn', 'rm'],
        num_rows: 5006
    })
})


In [4]:
print(dataset["train"][0])  # e.g. {'bn': 'আমি', 'rm': 'ami'}
print(dataset["train"][1])  # e.g. ...


{'bn': 'স্ক্রোল করে ২০/৩০ সেকেন্ড এর ভিডিও পান নাই???', 'rm': 'scroll kore 20/30 second er video pann nai???'}
{'bn': 'ও গুলা টরেন্ট সাইট এ পাবেন', 'rm': 'o gula Torrent site e paben'}


In [5]:
# Rename columns for clarity
dataset = dataset.rename_column("bn", "bangla")
dataset = dataset.rename_column("rm", "banglish")

# Now the DatasetDict has features: ["bangla", "banglish"].
print(dataset["train"][0])
# e.g. {"bangla": "আমি", "banglish": "ami"}


{'bangla': 'স্ক্রোল করে ২০/৩০ সেকেন্ড এর ভিডিও পান নাই???', 'banglish': 'scroll kore 20/30 second er video pann nai???'}


In [6]:
split_dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]

print(train_dataset)
print(val_dataset)
# Each is now a Dataset object with a portion of the data.


Dataset({
    features: ['bangla', 'banglish'],
    num_rows: 4505
})
Dataset({
    features: ['bangla', 'banglish'],
    num_rows: 501
})


In [7]:
def clean_function(example):
    example["banglish"] = example["banglish"].strip()
    example["bangla"]   = example["bangla"].strip()
    return example

train_dataset = train_dataset.map(clean_function)
val_dataset   = val_dataset.map(clean_function)

# Inspect after cleaning
print(train_dataset[0])


{'bangla': 'আপনার এফবি আইডি নেম বিশাল আর এইখানে মামুন কেন ?', 'banglish': 'Apnar fb id name Bishal ar Ekhane mamun keno ?'}


In [8]:
# Ensure sentencepiece is installed
%pip install sentencepiece


Note: you may need to restart the kernel to use updated packages.


In [9]:
%pip install --upgrade transformers huggingface_hub


Collecting transformers
  Using cached transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
Collecting huggingface_hub
  Using cached huggingface_hub-0.27.0-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Using cached transformers-4.47.1-py3-none-any.whl (10.1 MB)
Using cached huggingface_hub-0.27.0-py3-none-any.whl (450 kB)
Using cached tokenizers-0.21.0-cp39-abi3-win_amd64.whl (2.4 MB)
Installing collected packages: huggingface_hub, tokenizers, transformers
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.25.2
    Uninstalling huggingface-hub-0.25.2:
      Successfully uninstalled huggingface-hub-0.25.2
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.20.1
    Uninstalling tokenizers-0.20.1:
      Successfully uninstalled tokenizers-0.20.1
  Attempting uninstall: transformers
    Found existing in

In [10]:
# %pip install tensorflow


In [15]:
from transformers import MT5ForConditionalGeneration, T5Tokenizer

model_name = "google/mt5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)


pytorch_model.bin:  76%|#######5  | 912M/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [20]:
def preprocess_function(examples):
    inputs = examples["banglish"]
    targets = examples["bangla"]
    model_inputs = tokenizer(
        inputs,
        max_length=128,  # Adjust max_length based on your dataset
        truncation=True,
        padding="max_length"  # Ensures all sequences in a batch are of the same length
    )
    labels = tokenizer(
        targets,
        max_length=128,  # Ensure this matches the max_length of inputs
        truncation=True,
        padding="max_length"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [21]:
train_tokenized = train_dataset.map(preprocess_function, batched=True)
val_tokenized = val_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/4505 [00:00<?, ? examples/s]

Map:   0%|          | 0/501 [00:00<?, ? examples/s]

In [22]:
print(train_tokenized[0])


{'bangla': 'আপনার এফবি আইডি নেম বিশাল আর এইখানে মামুন কেন ?', 'banglish': 'Apnar fb id name Bishal ar Ekhane mamun keno ?', 'input_ids': [6220, 4480, 49895, 259, 525, 6535, 154205, 473, 798, 415, 182138, 6356, 604, 513, 505, 259, 291, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [10045, 8053, 259

In [31]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,  # Reduce batch size
    per_device_eval_batch_size=4,  # Reduce batch size
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
)



In [32]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
)


  trainer = Trainer(


In [None]:
trainer.train()