In [None]:
!pip install datasets

from datasets import load_dataset

# Load the Banglish-to-Bengali dataset
dataset = load_dataset("SKNahin/bengali-transliteration-data")

# Show a sample of the dataset
print(dataset['train'][0])


In [8]:
from transformers import AutoTokenizer

# Choose a pre-trained tokenizer, for example, mBART
tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_418M")

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['rm'], examples['bn'], truncation=True, padding=True)

# Apply tokenization to the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Filter out overly short or long sentences
tokenized_datasets = tokenized_datasets.filter(lambda x: len(x['input_ids']) > 5 and len(x['input_ids']) < 256)


Map:   0%|          | 0/5006 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5006 [00:00<?, ? examples/s]

In [11]:
# Use the existing 'train' split for training and validation
train_dataset = tokenized_datasets['train']


In [12]:
from transformers import AutoModelForSeq2SeqLM

# Load a pre-trained mBART model
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/m2m100_418M")


pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

In [15]:
# First, tokenize the dataset
tokenized_datasets = tokenized_datasets.map(lambda x: tokenizer(x['bn'], padding="max_length", truncation=True), batched=True)

# Split the dataset into training and validation (80/20 split)
split_dataset = tokenized_datasets['train'].train_test_split(test_size=0.2)

# Access the train and validation datasets
train_dataset = split_dataset['train']
valid_dataset = split_dataset['test']

# Check the columns of the train and validation datasets to ensure they're correct
print(train_dataset.column_names)  # Ensure it has 'input_ids', 'attention_mask'
print(valid_dataset.column_names)  # Ensure it has 'input_ids', 'attention_mask'


Map:   0%|          | 0/5006 [00:00<?, ? examples/s]

['bn', 'rm', 'input_ids', 'attention_mask']
['bn', 'rm', 'input_ids', 'attention_mask']


In [17]:
def preprocess_function(examples):
    # Tokenizing the source and target sequences
    inputs = tokenizer(examples['bn'], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(examples['rm'], padding="max_length", truncation=True, max_length=128)

    # Adding the target as labels (shifted input for decoder)
    inputs['labels'] = targets['input_ids']
    return inputs

# Apply the preprocessing to the dataset
tokenized_datasets = tokenized_datasets.map(preprocess_function, batched=True)

# Split the dataset into training and validation
split_dataset = tokenized_datasets['train'].train_test_split(test_size=0.2)
train_dataset = split_dataset['train']
valid_dataset = split_dataset['test']


Map:   0%|          | 0/5006 [00:00<?, ? examples/s]

In [19]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Evaluate every epoch
    learning_rate=2e-5,           # Learning rate
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,   # Batch size for evaluation
    num_train_epochs=3,             # Number of epochs
    weight_decay=0.01,              # Weight decay
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
)

trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0461,0.089062
2,0.0306,0.08701
3,0.0295,0.085249


TrainOutput(global_step=1503, training_loss=0.03541647553126652, metrics={'train_runtime': 2230.6436, 'train_samples_per_second': 5.385, 'train_steps_per_second': 0.674, 'total_flos': 3253907560071168.0, 'train_loss': 0.03541647553126652, 'epoch': 3.0})

In [20]:
# Evaluate the model
trainer.evaluate(valid_dataset)


{'eval_loss': 0.08524927496910095,
 'eval_runtime': 35.6846,
 'eval_samples_per_second': 28.079,
 'eval_steps_per_second': 3.531,
 'epoch': 3.0}

In [21]:
trainer.save_model("./banglish_to_bengali_model")


In [25]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the saved model and tokenizer from the specified directory
model = AutoModelForSeq2SeqLM.from_pretrained('./banglish_to_bengali_model')  # Path to your saved model
tokenizer = AutoTokenizer.from_pretrained('./banglish_to_bengali_model')  # Path to the saved tokenizer

# Move the model to the GPU or CPU
model.to('cuda')  # If you want to use GPU, else use 'cpu' for CPU


M2M100ForConditionalGeneration(
  (model): M2M100Model(
    (shared): M2M100ScaledWordEmbedding(128112, 1024, padding_idx=1)
    (encoder): M2M100Encoder(
      (embed_tokens): M2M100ScaledWordEmbedding(128112, 1024, padding_idx=1)
      (embed_positions): M2M100SinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0-11): 12 x M2M100EncoderLayer(
          (self_attn): M2M100SdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
   

In [26]:
# Example Banglish input text
input_text = "Amar naam Iqbal"

# Tokenize the input text
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

# Move the input tensors to the same device as the model
inputs = {key: value.to('cuda') for key, value in inputs.items()}  # Move to GPU if using GPU

# Generate the translation (Bengali text)
output = model.generate(**inputs)

# Decode the output to Bengali text
translated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(f"Input: {input_text}")
print(f"Output: {translated_text}")


Input: Amar naam Iqbal
Output: Amar nam Iqbal


In [27]:
test_sentences = [
    "Amar naam Iqbal",
    "Ami bhalo achi",
    "Kemon achho?"
]

for sentence in test_sentences:
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    inputs = {key: value.to('cuda') for key, value in inputs.items()}  # Move to GPU if using GPU
    output = model.generate(**inputs)
    translated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"Input: {sentence}")
    print(f"Output: {translated_text}")


Input: Amar naam Iqbal
Output: Amar nam Iqbal
Input: Ami bhalo achi
Output: Ami bhalo achi
Input: Kemon achho?
Output: Kemon achho?


In [31]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# Load the saved model and tokenizer from the specified directory
model = AutoModelForSeq2SeqLM.from_pretrained('./banglish_to_bengali_model')  # Path to your saved model
tokenizer = AutoTokenizer.from_pretrained('./banglish_to_bengali_model')  # Path to the saved tokenizer

# Move the model to the GPU or CPU
model.to('cuda')  # If you want to use GPU, else use 'cpu' for CPU

# Example test sentences
test_sentences = [
    "Ami bhalo achi",
    "Tumi kemon achho?",
    "Amar naam Iqbal",
    "Ami Bangladesh theke aschi",
    "Tumi kobe asbe?",
    "Ami chai bhalo result paite",
    "Ami chhobi tulte bhalobashi",
    "Tumi kothay jao?",
    "Ami boro ekta project korte jachhi",
    "Tumi ki amar sathe ajke berate chao?",
    "Ami jani tumi bhalo ache",
    "Ami tomar sathe chhobi tulbo",
    "Tumi kemon bhabe kaj kore?",
    "Ami tomar sathe chinta kore dekhi",
    "Ami chai je ami bhalo performance dekhabo",
    "Tomar boro bhai kothay ache?",
    "Ami tomar moner kotha bujhte parchi",
    "Ami ekta movie dekhbo ajke",
    "Tumi amar shathe shopping e chole asbe?",
    "Ami tomar kache kichu proshno ache",
    "Ami chhobi tulte bhalobashi",
    "Ami jani tumi amar kotha shunte bhalo basho",
    "Tumi jodi amar sathe thako tahole bhalo hobe",
    "Ami tomar sathe kisu kotha bolbo",
    "Tumi amar sathe bhalo thako"
]

# Translate each sentence
translated_sentences = []
for sentence in test_sentences:
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to('cuda')
    outputs = model.generate(**inputs)
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    translated_sentences.append(translated_text)

# Print translated sentences
for input_sentence, translated_sentence in zip(test_sentences, translated_sentences):
    print(f"Input: {input_sentence}")
    print(f"Output: {translated_sentence}\n")


Input: Ami bhalo achi
Output: Ami bhalo achi

Input: Tumi kemon achho?
Output: Tumi kemon achho?

Input: Amar naam Iqbal
Output: Amar nam Iqbal

Input: Ami Bangladesh theke aschi
Output: Ami bangladesh theke aschi

Input: Tumi kobe asbe?
Output: Tumi kobe asbe?

Input: Ami chai bhalo result paite
Output: Ami chai bhalo result paite

Input: Ami chhobi tulte bhalobashi
Output: Ami chobi tule bhalobashi

Input: Tumi kothay jao?
Output: Tumi kothay jao?

Input: Ami boro ekta project korte jachhi
Output: Ami boro ekta project korte jachhi

Input: Tumi ki amar sathe ajke berate chao?
Output: Tumi ki amar sathe ajke berate chao?

Input: Ami jani tumi bhalo ache
Output: Ami jani tumi bhalo ache

Input: Ami tomar sathe chhobi tulbo
Output: Ami tobe sathe chobi tulbo

Input: Tumi kemon bhabe kaj kore?
Output: Tumi kemon bhabe kaj kore?

Input: Ami tomar sathe chinta kore dekhi
Output: Ami tobe sathe chinta kore dekhi

Input: Ami chai je ami bhalo performance dekhabo
Output: Ami chai je ami bhalo