# Let's start model building

In [1]:
!pip install datasets transformers

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [41]:
dataset = load_dataset("SKNahin/bengali-transliteration-data")

In [42]:
dataset = dataset["train"].train_test_split(test_size=0.1)

In [43]:
print("Training examples:", len(dataset["train"]))
print("Validation examples:", len(dataset["test"]))

Training examples: 4505
Validation examples: 501


In [44]:
print("Columns:", dataset["train"].column_names)

# Print the first row of data for the train dataset
print("First row:", {col: dataset["train"][col][0] for col in dataset["train"].column_names[:5]})


Columns: ['bn', 'rm']
First row: {'bn': 'আপনার এফবি আইডি নেম বিশাল আর এইখানে মামুন কেন ?', 'rm': 'Apnar fb id name Bishal ar Ekhane mamun keno ?'}


In [7]:
print("All columns:", dataset["train"].column_names)

All columns: ['bn', 'rm']


In [45]:
from transformers import AutoTokenizer

# Load tokenizer for the selected model (e.g., T5)
tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")



In [47]:
def preprocess_function(examples):
    inputs = examples["rm"]
    targets = examples["bn"]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs

# Apply tokenization
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [26]:
print("Before Tokenization:")
print(dataset["train"][0])  # First sample of the training dataset

# Apply tokenization
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Print a few samples after tokenization
print("\nAfter Tokenization:")
print(tokenized_dataset["train"][0])  # First sample of the tokenized training dataset

Before Tokenization:
{'bn': 'আপনার এফবি আইডি নেম বিশাল আর এইখানে মামুন কেন ?', 'rm': 'Apnar fb id name Bishal ar Ekhane mamun keno ?'}


Map:   0%|          | 0/4505 [00:00<?, ? examples/s]


After Tokenization:
{'bn': 'আপনার এফবি আইডি নেম বিশাল আর এইখানে মামুন কেন ?', 'rm': 'Apnar fb id name Bishal ar Ekhane mamun keno ?', 'input_ids': [6220, 4480, 49895, 259, 525, 6535, 154205, 473, 798, 415, 182138, 6356, 604, 513, 505, 259, 291, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [10045

In [27]:
def filter_function(example):
    return 5 <= len(example["rm"]) <= 100 and 5 <= len(example["bn"]) <= 100

filtered_dataset = tokenized_dataset.filter(filter_function)


Filter:   0%|          | 0/4505 [00:00<?, ? examples/s]

Filter:   0%|          | 0/501 [00:00<?, ? examples/s]

In [28]:
# Check the size of the dataset before filtering
print("Before Filtering:")
print(f"Training examples: {len(tokenized_dataset['train'])}")
print(f"Validation examples: {len(tokenized_dataset['test'])}")
print(f"First example: {tokenized_dataset['train'][0]}")

# Apply the filter
filtered_dataset = tokenized_dataset.filter(filter_function)

# Check the size of the dataset after filtering
print("\nAfter Filtering:")
print(f"Training examples: {len(filtered_dataset['train'])}")
print(f"Validation examples: {len(filtered_dataset['test'])}")
print(f"First example: {filtered_dataset['train'][0]}")

Before Filtering:
Training examples: 4505
Validation examples: 501
First example: {'bn': 'আপনার এফবি আইডি নেম বিশাল আর এইখানে মামুন কেন ?', 'rm': 'Apnar fb id name Bishal ar Ekhane mamun keno ?', 'input_ids': [6220, 4480, 49895, 259, 525, 6535, 154205, 473, 798, 415, 182138, 6356, 604, 513, 505, 259, 291, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [29]:
from transformers import MT5ForConditionalGeneration

# Load the mT5 model
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")


In [14]:
from transformers import MT5ForConditionalGeneration

# Load the mT5 model
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")

In [30]:
import os
import torch
from transformers import MT5ForConditionalGeneration, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Suppress tokenizer parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Use MPS for Metal on Mac M1 or fallback to CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

dataset = load_dataset("SKNahin/bengali-transliteration-data")
dataset = dataset["train"].train_test_split(test_size=0.1)

train_subset = dataset["train"]
test_subset = dataset["test"]

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small").to(device)

# Preprocess the dataset
def preprocess_function(examples):
    inputs = tokenizer(examples["rm"], max_length=128, truncation=True, padding="max_length")
    targets = tokenizer(examples["bn"], max_length=128, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_train = train_subset.map(preprocess_function, batched=True)
tokenized_test = test_subset.map(preprocess_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",          # Directory to save model checkpoints
    evaluation_strategy="epoch",    # Evaluate at the end of each epoch
    learning_rate=5e-5,             # Standard learning rate for fine-tuning
    per_device_train_batch_size=4,  # Smaller batch size for M1 memory constraints
    per_device_eval_batch_size=4,
    num_train_epochs=1,             # Reduced epochs for faster training
    save_strategy="epoch",
    logging_dir="./logs",
    fp16=False,                     # Mixed precision not supported on MPS
    optim="adamw_torch",
    report_to="none"
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./banglish-to-bengali-model")
tokenizer.save_pretrained("./banglish-to-bengali-model")
print("Model training complete and saved.")

Using device: cpu


Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,61.321918


Model training complete and saved.


In [32]:
!pip install huggingface_hub



In [33]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [35]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load your saved model and tokenizer
model = MT5ForConditionalGeneration.from_pretrained("./banglish-to-bengali-model")
tokenizer = AutoTokenizer.from_pretrained("./banglish-to-bengali-model")

# Push to Hugging Face Hub
model.push_to_hub("Shifa1301/banglish-to-bengali-model")
tokenizer.push_to_hub("Shifa1301/banglish-to-bengali-model")

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Shifa1301/banglish-to-bengali-model/commit/392ef8c4d8d1359364c70d7453ad6b9c034eb1cc', commit_message='Upload tokenizer', commit_description='', oid='392ef8c4d8d1359364c70d7453ad6b9c034eb1cc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Shifa1301/banglish-to-bengali-model', endpoint='https://huggingface.co', repo_type='model', repo_id='Shifa1301/banglish-to-bengali-model'), pr_revision=None, pr_num=None)

In [40]:
from google.colab import files
import zipfile

def zipdir(path, ziph):
    # ziph is zipfile handle
    for root, dirs, files in os.walk(path):
        for file in files:
            ziph.write(os.path.join(root, file))

import os
if not os.path.exists("./banglish-to-bengali-model"):
    print("Directory ./banglish-to-bengali-model. Please run the training code first.")
else:
  zipf = zipfile.ZipFile('banglish-to-bengali-model.zip', 'w', zipfile.ZIP_DEFLATED)
  zipdir('./banglish-to-bengali-model', zipf)
  zipf.close()
  files.download('banglish-to-bengali-model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>