In [1]:
from datasets import load_dataset
from nltk.tokenize import word_tokenize
import re

# Load dataset
dataset = load_dataset("SKNahin/bengali-transliteration-data")

# Split dataset
train_test_split = dataset["train"].train_test_split(test_size=0.2)
train_data = train_test_split["train"]
val_data = train_test_split["test"]
print(train_data.column_names)

['bn', 'rm']


In [2]:
def clean_text(text, lang="banglish"):
    try:
        if lang == "banglish":
            text = re.sub(r"[^a-zA-Z0-9\s,.!?']", "", text)  # Clean non-Banglish characters
        elif lang == "bangla":
            text = re.sub(r"[^\u0980-\u09FF\s,.!?']", "", text)  # Clean non-Bangla characters
        return text.strip()
    except Exception as e:
        print(f"Error in clean_text: {e}")
        return ""  # Return empty string if an error occurs


def tokenize_text(text, lang="banglish"):
    try:
        if lang == "banglish":
            return word_tokenize(text)  # Tokenize Banglish using word_tokenize
        elif lang == "bangla":
            return text.split()  # Simple split for Bangla
        return text.split()  # Default behavior for unsupported languages
    except Exception as e:
        print(f"Error in tokenize_text: {e}")
        return []  # Return an empty list if an error occurs


def filter_sentence(sentence, min_len=3, max_len=50):
    word_count = len(sentence.split())
    return min_len <= word_count <= max_len


def preprocess_data(example, idx):
    try:
        if idx == 999:  # Skip the 999th row (index 999)
            return None

        # Clean and tokenize Banglish (column 'rm')
        example["rm"] = clean_text(example["rm"], lang="banglish")
        example["rm_tokens"] = tokenize_text(example["rm"], lang="banglish")
        
        # Clean and tokenize Bangla (column 'bn')
        example["bn"] = clean_text(example["bn"], lang="bangla")
        example["bn_tokens"] = tokenize_text(example["bn"], lang="bangla")
        
        # Filter based on sentence length
        if not filter_sentence(example["rm"]) or not filter_sentence(example["bn"]):
            return None  # Skip this example if it's too short or too long
        
        return example

    except Exception as e:
        print(f"Error processing example at index {idx}: {e}")
        return None  # Return None if an error occurs


# Assuming 'train_data' and 'val_data' are your datasets (from datasets library)

def process_dataset(dataset):
    try:
        # Apply preprocessing, skip examples where None is returned, and pass indices
        dataset = dataset.map(preprocess_data, remove_columns=dataset.column_names, with_indices=True, batched=False)
        
        # Filter out None values (if any) after mapping
        dataset = dataset.filter(lambda x: x is not None)

        return dataset

    except Exception as e:
        print(f"Error processing dataset: {e}")
        return dataset  # Return dataset in case of error to prevent failure

# Process the training and validation datasets
train_data = process_dataset(train_data)
val_data = process_dataset(val_data)

# Save preprocessed datasets
train_data.save_to_disk("preprocessed_train_data")
val_data.save_to_disk("preprocessed_val_data")

print("Preprocessing complete!")


Map:   0%|          | 0/4004 [00:00<?, ? examples/s]

Error processing dataset: 'NoneType' object is not subscriptable


Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

Error processing dataset: 'NoneType' object is not subscriptable


Saving the dataset (0/1 shards):   0%|          | 0/4004 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1002 [00:00<?, ? examples/s]

Preprocessing complete!


In [3]:
print(train_data[0])
print(val_data[0])

{'bn': 'আচ্ছা ভাইয়া চেষ্টা করবো', 'rm': 'Assa vaiya chesta korbo'}
{'bn': 'অ্যাপ এর নাম ঠিক করে লিখতে শিখেন না', 'rm': 'App er nam thik kore likhte shikhen na'}


In [6]:
from transformers import MBartForConditionalGeneration, MBartTokenizer

# Load mBART model and tokenizer
model_name = "facebook/mbart-large-50-many-to-many-mmt"
model = MBartForConditionalGeneration.from_pretrained(model_name)
tokenizer = MBartTokenizer.from_pretrained(model_name)
# Set the tokenizer's source and target languages (Banglish to Bangla)
tokenizer.src_lang = "en_XX"  # Assuming Banglish is similar to English in its Romanized form
tokenizer.tgt_lang = "bn_BD"  # Target language is Bangla

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
def tokenize_function(examples):
    # Tokenize both 'rm' (Banglish) and 'bn' (Bangla) columns
    inputs = tokenizer(examples["rm"], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(examples["bn"], padding="max_length", truncation=True, max_length=128)
    inputs["labels"] = targets["input_ids"]
    return inputs

# Apply tokenization to the train and validation datasets
train_data = train_data.map(tokenize_function, batched=True)
val_data = val_data.map(tokenize_function, batched=True)


Map:   0%|          | 0/4004 [00:00<?, ? examples/s]

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",              # Where to save the model
    evaluation_strategy="epoch",         # Evaluate after each epoch
    learning_rate=5e-5,                  # Learning rate
    per_device_train_batch_size=8,       # Batch size for training
    per_device_eval_batch_size=8,        # Batch size for evaluation
    num_train_epochs=3,                  # Number of training epochs
    weight_decay=0.01,                   # Weight decay for regularization
    logging_dir="./logs",                # Where to save logs
    logging_steps=10,                    # Log every 10 steps
    save_steps=500,                      # Save checkpoint every 500 steps
    save_total_limit=2,                  # Keep only the last 2 checkpoints
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
)

# Start fine-tuning
trainer.train()
