In [1]:
pip install datasets


Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration


In [3]:
tqdm.pandas()

In [4]:
from datasets import load_dataset

# Load the dataset
ds = load_dataset("SKNahin/bengali-transliteration-data")

# Inspect the dataset
print(ds)


README.md:   0%|          | 0.00/300 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/333k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5006 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['bn', 'rm'],
        num_rows: 5006
    })
})


In [5]:
df = pd.DataFrame({
    "Banglish" : ds["train"]["rm"],
    "Bangla" : ds["train"]["bn"]
})

In [6]:
df

Unnamed: 0,Banglish,Bangla
0,scroll kore 20/30 second er video pann nai???,স্ক্রোল করে ২০/৩০ সেকেন্ড এর ভিডিও পান নাই???
1,o gula Torrent site e paben,ও গুলা টরেন্ট সাইট এ পাবেন
2,vokkor chokkor post akta korlei approved…. nis...,ভক্কর চক্কর পোস্ট একটা করলেই এপ্রুভড.… নিশ্চই ...
3,ami test koreii code disi…,আমি টেস্ট করেই কোড দিছি…
4,"eto koster ki ache shakibwap.tk,shakibwap.ml&a...","এতো কষ্টের কি আছে সাকিবওয়াপ.টক,সাকিবওয়াপ.মল&এআ..."
...,...,...
5001,Eder ke mati chapa deya dorkar,এদের কে মাটি চাপা দেয়া দরকার
5002,Oder ekdin bichar hobe vai,ওদের একদিন বিচার হবে ভাই
5003,Opekkha koren,অপেক্ষা করেন
5004,Shomoy ashbe,সময় আসবে


## Cleaning Data

In [7]:
def preprocess(text):
  if not isinstance(text, (str, bytes)):
        return ""  # Return empty string if text is not a string or bytes-like object
  text = text.lower()
  return text

In [8]:
df['clean_text'] = df['Banglish'].progress_apply(lambda x: preprocess(x))
df.sample(5)

100%|██████████| 5006/5006 [00:00<00:00, 389397.19it/s]


Unnamed: 0,Banglish,Bangla,clean_text
2504,"google ei paben, search kore dekhun…","গুগল এই পাবেন, সার্চ করে দেখুন…","google ei paben, search kore dekhun…"
726,shesher ta valo lageni,শেষেব় টা ভাল লাগেনি,shesher ta valo lageni
4730,dekhe to mone mone hosse sob phn er system,দেখে তো মনে মনে হচ্ছে সব ফোন এর সিস্টেম,dekhe to mone mone hosse sob phn er system
4079,apni post korsen,আপনি পোস্ট করছেন,apni post korsen
2122,Ekhon ki paypal add korar option ache?,এখন কি পে পাল এড করার অপ্সন আছে?,ekhon ki paypal add korar option ache?


In [9]:
df = pd.DataFrame({
    "Banglish" : df["clean_text"],
    "Bangla" : df["Bangla"]
})

In [10]:
df_train = df.sample(frac=0.8, random_state=42)
df_val = df.drop(df_train.index)


df_train.reset_index(drop=True, inplace=True)
df_val.reset_index(drop=True, inplace=True)

In [11]:
df_train.shape

(4005, 2)

In [12]:
df_val.shape

(1001, 2)

In [13]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset

raw_datasets = Dataset.from_dict(df).train_test_split(test_size=0.1)

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

def preprocess_function(example):
    source = ["transliterate: " + text for text in example["Banglish"]]
    target = example["Bangla"]
    model_inputs = tokenizer(source, max_length=250, truncation=True)
    labels = tokenizer(target, max_length=250, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)


data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=10,
    predict_with_generate=True,
    report_to="none",
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
)

# Train the Model
trainer.train()


In [None]:
from transformers import MBartForConditionalGeneration, MBart50Tokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset

# Split your dataset into training and testing
raw_datasets = Dataset.from_dict(df).train_test_split(test_size=0.1)

# Load the mBART model and tokenizer
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50Tokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Set the source and target language codes
tokenizer.src_lang = "en_XX"  # Source language (Banglish-like, treated as English script)
tokenizer.tgt_lang = "bn_IN"  # Target language (Bengali)

# Preprocessing Function
def preprocess_function(example):
    # Add the mBART language prefix and tokenize
    source = ["transliterate: " + text for text in example["Banglish"]]
    target = example["Bangla"]
    model_inputs = tokenizer(source, max_length=50, truncation=True)
    labels = tokenizer(target, max_length=50, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the datasets
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

# Data Collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=30,
    predict_with_generate=True,
    report_to="none",  # Disable reporting (e.g., W&B)
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
)

# Train the Model
trainer.train()


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Map:   0%|          | 0/4505 [00:00<?, ? examples/s]

Map:   0%|          | 0/501 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,No log,1.314641
2,1.867400,0.986037


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


In [None]:
import torch

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set source and target language codes
tokenizer.src_lang = "en_XX"  # Source language (Banglish-like, treated as English script)
tokenizer.tgt_lang = "bn_IN"  # Target language (Bengali)

input_texts = [
    "ami",
    "valo"
]

# Tokenize inputs
tokenized_inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)
tokenized_inputs = {key: val.to(device) for key, val in tokenized_inputs.items()}

# Generate predictions
outputs = model.generate(
    input_ids=tokenized_inputs["input_ids"],
    attention_mask=tokenized_inputs["attention_mask"],
    max_length=50,
    num_beams=5,
    forced_bos_token_id=tokenizer.lang_code_to_id["bn_IN"]  # Ensure output in Bengali
)

# Decode predictions
predicted_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Print results
for i, (input_text, prediction) in enumerate(zip(input_texts, predicted_texts)):
    print(f"Input {i+1}: {input_text}")
    print(f"Prediction {i+1}: {prediction}")
    print("-" * 50)
