In [1]:
!pip install huggingface_hub
!pip install datasets
!pip install transformers
!pip install sacrebleu
!pip insall accelerate
!pip install evaluate

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.0.0-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.0.0 sacrebleu-2.4.3
ERROR: unknown command "insall" - maybe you meant "install"
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling colle

# **Necessary Imports**

In [2]:
import torch
from torch.utils.data import random_split
import datasets
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy as np
import gc
import torch
from transformers import AutoTokenizer, GenerationConfig
from transformers import (
    MBartForConditionalGeneration, MBartTokenizer, 
    Seq2SeqTrainingArguments, Seq2SeqTrainer
)
from datasets import load_dataset
import evaluate
from transformers import MBartForConditionalGeneration

# **Dataset Loading and Preprocessing**

In [3]:
ds = load_dataset("SKNahin/bengali-transliteration-data")

split_datasets = ds['train'].train_test_split(test_size=0.2)

train_dataset = split_datasets['train']
valid_dataset = split_datasets['test']

print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(valid_dataset)}")

max_input_length = max([len(ex['bn']) for ex in train_dataset])
max_output_length = max([len(ex['rm']) for ex in train_dataset])

print(f"Maximum input length (Bengali): {max_input_length}")
print(f"Maximum output length (Romanized Bengali): {max_output_length}")

train_dataset[:5]

README.md:   0%|          | 0.00/300 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/333k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5006 [00:00<?, ? examples/s]

Training set size: 4004
Validation set size: 1002
Maximum input length (Bengali): 246
Maximum output length (Romanized Bengali): 253


{'bn': ['লিঙ্ক কাজ করে না ',
  'রুট করার কোনো ওয়ে আছে ? ',
  'হুম গুড পোস্ট ব্রো',
  'ভাই,,,,,,ইউটিউব ভিপিএন মডেড করার একটা ভিডিওস দেন,,,,,,',
  'নাম্বার কই মিয়া '],
 'rm': ['link kaj kore na',
  'root korar kono way ase ?',
  'Hmmm gd post bro',
  'Vai,,,,,,youtube vpn moded korar ekta videos den,,,,,,',
  'Number koi mia']}

# **Initializing the Model**

In [4]:
checkpoint = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sentences = ["আগে থেকেই জানি। থ্যাংকস ফর শেয়ার।" , "আমার ফোন স্যামসাং"]
tokenized_output = tokenizer(sentences)


print(tokenized_output)


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]



{'input_ids': [[250004, 31636, 197562, 218577, 125, 6, 19293, 14329, 130180, 3458, 22741, 999, 61414, 125, 2], [250004, 29388, 59354, 12945, 14329, 4198, 16984, 17031, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [5]:
model = MBartForConditionalGeneration.from_pretrained(checkpoint)

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

In [6]:
max_input_length = 256
max_target_length = 256
source_lang = "rm"  
target_lang = "bn"  
tokenizer.src_lang = source_lang
tokenizer.tgt_lang = target_lang

def preprocess_function(examples):
    inputs = examples[source_lang]  
    targets = examples[target_lang]  


    model_inputs = tokenizer(
        inputs, max_length=max_input_length, truncation=True, padding="max_length"
    )
    labels = tokenizer(
        targets, max_length=max_target_length, truncation=True, padding="max_length"
    )


    sanitized_input_ids = [[id for id in seq if id >= 0] for seq in model_inputs["input_ids"]]
    sanitized_labels = [[id for id in seq if id >= 0] for seq in labels["input_ids"]]

    model_inputs["input_ids"] = sanitized_input_ids
    model_inputs["labels"] = sanitized_labels

    return model_inputs



In [7]:
tokenized_train_set = train_dataset.map(preprocess_function, batched=True)
tokenized_valid_set = valid_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/4004 [00:00<?, ? examples/s]

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

In [8]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

metric = evaluate.load("sacrebleu")

print("Sample tokenized example:")
print(tokenized_train_set[0])

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Sample tokenized example:
{'bn': 'লিঙ্ক কাজ করে না ', 'rm': 'link kaj kore na', 'input_ids': [3, 3126, 719, 20867, 24, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

# **Train the Model**

In [9]:
!pip install --upgrade transformers accelerate

  pid, fd = os.forkpty()


Collecting transformers
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-1.2.1-py3-none-any.whl.metadata (19 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.47.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0mm
[?25hDownloading accelerate-1.2.1-py3-none-any.whl (336 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.4/336.4 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [10]:
import torch

if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name(0)
    print(f"GPU Available: {device_name}")
else:
    print("No GPU available.")

GPU Available: Tesla P100-PCIE-16GB


In [11]:
from accelerate import Accelerator


accelerator = Accelerator()

model, tokenizer, data_collator = accelerator.prepare(model, tokenizer, data_collator)
model.config.max_length = 200
model.config.early_stopping = True
model.config.num_beams = 5
model.config.forced_eos_token_id = 2

In [12]:
import torch
from tqdm.auto import tqdm
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"



training_args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/results",
    run_name="BanglishBartSiuu",
    eval_strategy="epoch",
    logging_dir="/kaggle/working/logs",
    logging_steps=50,
    learning_rate=2e-5,
    per_device_train_batch_size=2,  
    per_device_eval_batch_size=2,  
    weight_decay=0.01,
    num_train_epochs=1, 
    predict_with_generate=True,
    save_steps=500,
    save_total_limit=3,
    fp16=True,
    report_to="none",
    disable_tqdm=False,
    gradient_checkpointing=True,  
)


training_args.gradient_accumulation_steps = 2


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_set,
    eval_dataset=tokenized_valid_set,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


# Manually clear CUDA cache
torch.cuda.empty_cache()

# Optionally, clear cache during training
trainer.train()

# Clear cache after training
torch.cuda.empty_cache()


  ):
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


RuntimeError: [enforce fail at inline_container.cc:603] . unexpected pos 3753363712 vs 3753363600