In [16]:
from datasets import load_dataset

# Fetch the transliteration corpus from the Hugging Face Hub.
dataset_name = "SKNahin/bengali-transliteration-data"
dataset = load_dataset(dataset_name)

# Carve a reproducible 80/20 train/validation split out of the 'train' split
# (seed fixed so re-running the cell yields the same partition).
dataset_split = dataset['train'].train_test_split(test_size=0.2, seed=42)
train_data, val_data = dataset_split['train'], dataset_split['test']

# Sanity-check the sizes of the two subsets.
print("Training data size:", len(train_data))
print("Validation data size:", len(val_data))


Training data size: 4004
Validation data size: 1002


In [27]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint; the same identifier is reused below to load the model.
# NOTE(review): the tokenized sample printed further down shows the Bengali input encoded
# almost entirely as id 2 — presumably this tokenizer's unknown-token id. If so, the
# vocabulary does not cover Bengali script; confirm, and consider a multilingual checkpoint.
checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [28]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Load the tokenizer from the shared `checkpoint` name instead of a second hard-coded
# string, so the tokenizer and the model can never silently point at different checkpoints.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Function to tokenize the data
def tokenize_function(examples, max_length=128):
    """Tokenize a batch of examples for seq2seq training.

    Args:
        examples: A batch (dict of lists) with a 'bn' column used as the model
            input and an 'rm' column used as the target.
        max_length: Length every sequence is padded/truncated to (default 128).

    Returns:
        dict with 'input_ids' and 'attention_mask' for the 'bn' text and
        'labels' for the 'rm' text. Padding positions in 'labels' are set to
        -100 so that they are ignored by the loss instead of being trained on.
    """
    inputs = tokenizer(examples['bn'], padding="max_length", truncation=True, max_length=max_length)
    targets = tokenizer(examples['rm'], padding="max_length", truncation=True, max_length=max_length)

    # Because the targets are padded here (not by the collator), the pad ids
    # must be masked manually; -100 is the ignore index used for the loss.
    pad_id = tokenizer.pad_token_id
    labels = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in targets['input_ids']
    ]

    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': labels,
    }

# Run the batched tokenizer over both splits (training split first, then validation).
train_data, val_data = (
    split.map(tokenize_function, batched=True) for split in (train_data, val_data)
)

# Eyeball one tokenized record from each split.
print(train_data[3], val_data[0], sep="\n")


Map:   0%|          | 0/4004 [00:00<?, ? examples/s]

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

{'bn': 'ভাই...আর ২৪ আওয়ার ওয়েট করেন..আমার এফবি আইডি ব্যাক আসবে আমি রিকুয়েস্ট পাঠায়া দিছি...', 'rm': 'vai…ar 24 hour wait koren..amar fb id back ashbe ami request pathaia disi…', 'input_ids': [3, 2, 233, 2, 3, 2, 3, 2, 3, 2, 3, 2, 5, 5, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 233, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [409,

In [29]:
from transformers import DataCollatorForSeq2Seq

# Batches examples for the seq2seq trainer. The collator's optional `model` argument
# takes a model *object*, not a checkpoint name; the model is only created in a later
# cell, so it is left unset here (pass `model=model` if this cell is moved below it).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [30]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained seq2seq model for the same `checkpoint` the tokenizer came from;
# this `model` object is what gets passed to Seq2SeqTrainer below.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
from evaluate import load

# Load the "sacrebleu" metric via `evaluate`; compute_metrics reads the "score"
# entry of its result and reports it as BLEU.
metric = load("sacrebleu")

def compute_metrics(pred):
    """
    Compute the BLEU score for a batch of seq2seq evaluation predictions.

    Args:
        pred: Evaluation output exposing `predictions` (generated token ids,
            optionally wrapped in a tuple whose first element holds the ids)
            and `label_ids` (reference token ids). Either may contain -100
            at ignored/padded positions.
    Returns:
        dict: {"bleu": <sacrebleu score>}.
    """
    # Local import: the notebook has no top-level numpy import.
    import numpy as np

    predictions = pred.predictions
    # When extra outputs are returned alongside the sequences, the ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(pred.label_ids)

    # -100 is an ignore marker, not a vocabulary id, so it cannot be decoded.
    # Swap it for the pad id on BOTH sides; skip_special_tokens then drops it.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Convert token IDs to strings
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # sacrebleu expects a list of reference lists (one list per prediction).
    result = metric.compute(
        predictions=decoded_preds,
        references=[[ref] for ref in decoded_labels],
    )

    return {"bleu": result["score"]}


In [None]:
# Configuration for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    # --- output / checkpoint bookkeeping ---
    output_dir="dhongi",
    save_total_limit=3,
    push_to_hub=False,
    # --- optimisation ---
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,  # change to bf16=True for XPU
    # --- evaluation ---
    eval_strategy="epoch",
    predict_with_generate=True,
)

# Wire the model, data, collator and metric function into the trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
)

# Kick off fine-tuning.
trainer.train()

In [77]:
from transformers import pipeline

# Build the inference pipeline around the in-memory `model` that was handed to the
# trainer (with its tokenizer), rather than loading the untouched base checkpoint by
# name. `device` keeps the pipeline on whatever device the model already lives on.
translator = pipeline(
    "translation_inputs_to_targets",
    model=model,
    tokenizer=tokenizer,
    device=model.device,
)

In [81]:
# Sample input. NOTE(review): despite the variable name, this string is Bengali script,
# which matches the 'bn' column used as the model input in tokenize_function.
banglish_text = "আমি টেস্ট করেই কোড দিছি…"

# Perform translation
translated_text = translator(banglish_text)

In [88]:
# Show the pipeline output itself (the variable, not the literal text "translated_text").
print(translated_text)

