In [1]:
from datasets import load_dataset

# Pull the Bengali transliteration corpus from the Hugging Face Hub.
# The result is a DatasetDict keyed by split name.
dataset = load_dataset(path="SKNahin/bengali-transliteration-data")

# Show the available splits, their columns and their row counts.
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['bn', 'rm'],
        num_rows: 5006
    })
})


In [2]:
# Hold out 10% of the data for validation.
# A fixed seed makes the shuffle (and therefore every downstream result)
# reproducible across "Restart Kernel -> Run All"; without it each run
# produces a different train/validation partition.
SPLIT_SEED = 42
train_test_split = dataset["train"].train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_dataset = train_test_split["train"]
valid_dataset = train_test_split["test"]

# Invariant: the two splits together must cover every original row exactly once.
assert len(train_dataset) + len(valid_dataset) == len(dataset["train"])

# Print the sizes of the splits
print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(valid_dataset)}")


Training set size: 4505
Validation set size: 501


In [3]:
from transformers import AutoTokenizer

# A short romanized-Bengali sentence used to smoke-test the tokenizer.
sample_text = "ami banglay gan gai"

# Fetch the tokenizer that ships with the mBART cc25 checkpoint.
# NOTE(review): no source language is set here, so the tokenizer falls back
# to its default source-language code -- confirm that is what is wanted for
# romanized Bengali input.
tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-cc25")

# Encode the sample as PyTorch tensors and show the ids / attention mask.
tokenized = tokenizer(sample_text, return_tensors="pt")
print(tokenized)


{'input_ids': tensor([[  5263,  74126,     53,   1756,  15559,      2, 250004]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}


In [9]:
import warnings

# Set the target language
tokenizer.tgt_lang = "bn_IN"

# Guard against a language code the loaded tokenizer does not know.
# An unknown code is silently mapped to the <unk> id, so every label sequence
# would end with <unk> instead of a real language token. Warn loudly rather
# than let that go unnoticed into training.
if tokenizer.convert_tokens_to_ids(tokenizer.tgt_lang) == tokenizer.unk_token_id:
    warnings.warn(
        f"Target language code {tokenizer.tgt_lang!r} is not in this tokenizer's "
        "vocabulary; label sequences will end with the <unk> id instead of a "
        "language code. Use a tokenizer/model checkpoint whose vocabulary "
        "contains this code, or pick a supported code."
    )


def preprocess_function(examples):
    """Tokenize a batch of transliteration pairs for seq2seq training.

    Args:
        examples: A batch (dict of columns) with an ``"rm"`` column holding the
            romanized text and a ``"bn"`` column holding the Bengali-script
            text.

    Returns:
        The tokenizer output for the batch: the encoded ``"rm"`` inputs plus
        ``labels`` produced from the ``"bn"`` column via ``text_target``.
        Both sides are truncated to at most 128 tokens.
    """
    return tokenizer(
        examples["rm"],          # Romanized Banglish text
        text_target=examples["bn"],  # Bengali script
        truncation=True,         # Truncate to max length
        max_length=128           # Add a maximum token length
    )


In [10]:
# Run the batched preprocessing over both splits, training split first,
# then validation, so each gets input ids, an attention mask and labels.
train_tokenized, valid_tokenized = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, valid_dataset)
)

# Spot-check the first tokenized training example.
print(train_tokenized[0])


Map:   0%|          | 0/4505 [00:00<?, ? examples/s]

Map:   0%|          | 0/501 [00:00<?, ? examples/s]

{'bn': 'আমি ট্রাই করবো।কাজ করলে ভালো।থ্যাংকস।', 'rm': 'Ami try korbo.kaj korle valo.Thanks.', 'input_ids': [25472, 9790, 4181, 837, 5, 20103, 4181, 133, 32391, 5, 188774, 7, 5, 2, 250004], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [21145, 125931, 35312, 172954, 9445, 125, 13504, 8276, 112592, 53997, 125, 19293, 14329, 130180, 3458, 125, 2, 3]}
