# Import the necessary packages

In [12]:
#!pip install sacremoses

In [2]:
import torch
from torch import nn
from transformers import AutoModel
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSeq2SeqLM
from nltk.translate.bleu_score import corpus_bleu
import random
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


# Load the dataset

In [14]:
dataset = load_dataset("cfilt/iitb-english-hindi")
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})

# Load the Tokenizar and Pre trained Model

[opus-mt-en-hi](https://huggingface.co/Helsinki-NLP/opus-mt-en-hi)

In [15]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# TrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, EarlyStoppingCallback

model_name = "Helsinki-NLP/opus-mt-hi-en"
Tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



In [16]:
# Define the function for translation
def translate(texts, max_length=512):
    # Tokenize the input texts
    inputs = Tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=max_length)

    # Generate the translations
    outputs = model.generate(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask)

    # Decode the translations
    translations = [Tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

    return translations

In [17]:
# Get the total number of rows in the 'train' dataset
num_rows = dataset['train'].num_rows

# Get 5 random unique indices
index = random.sample(range(num_rows), 3)

# Extract the translations for the randomly selected indices
english_translations = [dataset['train']['translation'][i]['en'] for i in index]
hindi_translations = [dataset['train']['translation'][i]['hi'] for i in index]

In [18]:
# Hindi To English
Hindi_English = translate(hindi_translations)

for Hindi, English, translated in zip(hindi_translations,english_translations, Hindi_English):
    print(f"Hindi: {Hindi}")
    print(f"English: {English}")
    print(f"Translated: {translated}")
    # Calculate BLEU score
    bleu_score=corpus_bleu([English], [translated])
    print(f"BLEU score: {bleu_score:.2f}\n")

bleu_score=corpus_bleu(english_translations,Hindi_English)
print(f"BLEU Score:{bleu_score}")

Hindi: योजना में शेष 1329 करोड़ रु. का राज्य/संघ राज्यय क्षेत्र का हिस्सा है जिसका प्रावधान व्यय विभाग, भारत सरकार द्वारा अतिरिक्त केंद्रीय सहायता (ए सी ए) के तहत किया गया है। 
English: The Scheme has a State / UT share of balance Rs. 1329 crores, which has been provisioned by the Department of Expenditure, Govt. of India, under Additional Central Assistance (ACA).
Translated: In the plan is a part of the state of 1329 million R.C., which has been under the Department of Exploitation, India's government, beyond central aid (AB) by India.
BLEU score: 0.00

Hindi: तो तुम अपने परवरदिगार की तारीफ़ के साथ तसबीह करना और उसी से मग़फेरत की दुआ माँगना वह बेशक बड़ा माफ़ करने वाला है
English: Celebrate the praises of thy Lord, and pray for His Forgiveness: For He is Oft - Returning (in Grace and Mercy).
Translated: Then hallow the praise of thy Lord, and ask forgiveness of Him. Verily He is ever relenting.
BLEU score: 0.00

Hindi: ये वही लोग तो है जिन्होंने इनकार किया और तुम्हें मस्जिदे हराम (काब

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


### Let's Finetune the model for better accuracy.

# Preprocessing

In [19]:
#dataset["train"]["translation"]['en']
En = dataset['train']["translation"][0]['en']
Hn = dataset['train']["translation"][0]['hi']

print(En)
Hindi = Tokenizer(En)
print(Hindi)
print('\n',Hn)
English = Tokenizer(Hn)
print(English)

Give your application an accessibility workout
{'input_ids': [2476, 3559, 78, 4315, 138, 50, 35623, 1420, 23, 654, 40280, 5991, 333, 4373, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

 अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें
{'input_ids': [62, 4414, 21, 8765, 13268, 488, 22086, 30, 2036, 824, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


### Convert the text data to vector using bytes-pair encoding

In [20]:
# Convert Hindi sentence to bytes and tokenize into individual tokens
Hindi_encoded_sentence = bytes(Hn, 'utf-8')
hindi_tokens = bytearray(Hindi_encoded_sentence)

# Convert Hindi tokens to a NumPy array
hindi_vector = torch.tensor(hindi_tokens)

# Print the vectors
print('Hindi :',Hindi_encoded_sentence)
print('Hindi tokens:',hindi_tokens)
print('Hindi vector:', hindi_vector)

# Convert English sentence to bytes and tokenize into individual tokens
English_encoded_sentence = bytes(En, 'utf-8')
print(English_encoded_sentence)
english_tokens = bytearray(English_encoded_sentence)

# Convert English tokens to a NumPy array
english_vector = torch.tensor(english_tokens)

# English
print('English :',English_encoded_sentence)
print('English Tokens:', english_tokens)
print('English vector:', english_vector)

Hindi : b'\xe0\xa4\x85\xe0\xa4\xaa\xe0\xa4\xa8\xe0\xa5\x87 \xe0\xa4\x85\xe0\xa4\xa8\xe0\xa5\x81\xe0\xa4\xaa\xe0\xa5\x8d\xe0\xa4\xb0\xe0\xa4\xaf\xe0\xa5\x8b\xe0\xa4\x97 \xe0\xa4\x95\xe0\xa5\x8b \xe0\xa4\xaa\xe0\xa4\xb9\xe0\xa5\x81\xe0\xa4\x82\xe0\xa4\x9a\xe0\xa4\xa8\xe0\xa5\x80\xe0\xa4\xaf\xe0\xa4\xa4\xe0\xa4\xbe \xe0\xa4\xb5\xe0\xa5\x8d\xe0\xa4\xaf\xe0\xa4\xbe\xe0\xa4\xaf\xe0\xa4\xbe\xe0\xa4\xae \xe0\xa4\x95\xe0\xa4\xbe \xe0\xa4\xb2\xe0\xa4\xbe\xe0\xa4\xad \xe0\xa4\xa6\xe0\xa5\x87\xe0\xa4\x82'
Hindi tokens: bytearray(b'\xe0\xa4\x85\xe0\xa4\xaa\xe0\xa4\xa8\xe0\xa5\x87 \xe0\xa4\x85\xe0\xa4\xa8\xe0\xa5\x81\xe0\xa4\xaa\xe0\xa5\x8d\xe0\xa4\xb0\xe0\xa4\xaf\xe0\xa5\x8b\xe0\xa4\x97 \xe0\xa4\x95\xe0\xa5\x8b \xe0\xa4\xaa\xe0\xa4\xb9\xe0\xa5\x81\xe0\xa4\x82\xe0\xa4\x9a\xe0\xa4\xa8\xe0\xa5\x80\xe0\xa4\xaf\xe0\xa4\xa4\xe0\xa4\xbe \xe0\xa4\xb5\xe0\xa5\x8d\xe0\xa4\xaf\xe0\xa4\xbe\xe0\xa4\xaf\xe0\xa4\xbe\xe0\xa4\xae \xe0\xa4\x95\xe0\xa4\xbe \xe0\xa4\xb2\xe0\xa4\xbe\xe0\xa4\xad \xe0\xa4\xa6\xe0\xa5\x87

In [21]:
# Define custom dataset class for machine translation
class TranslationDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Get input and target sentences
        input_text = self.data[idx]['hi']
        target_text = self.data[idx]['en']

        # Tokenize input and target sentences
        input_tokens = Tokenizer(input_text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        target_tokens = Tokenizer(target_text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')

        # Define decoder input tokens by removing the last token from the target sequence
        decoder_input_tokens = {'input_ids': target_tokens['input_ids'][:, :-1].squeeze(),
                                'attention_mask': target_tokens['attention_mask'][:, :-1].squeeze()}

        # Define labels as the original target sequence, shifted by one position
        labels = {'input_ids': target_tokens['input_ids'][:, 1:].squeeze(),
                  'attention_mask': target_tokens['attention_mask'][:, 1:].squeeze()}

        # Return input, decoder input, and label tokens as PyTorch tensors
        return {'input_ids': input_tokens['input_ids'].squeeze(),
                'attention_mask': input_tokens['attention_mask'].squeeze(),
                'decoder_input_ids': decoder_input_tokens['input_ids'],
                'decoder_attention_mask': decoder_input_tokens['attention_mask'],
                'labels': labels['input_ids']}


# Load training data
train_dataset = TranslationDataset(dataset['train']['translation'])
valid_dataset = TranslationDataset(dataset['validation']['translation'])
test_dataset  = TranslationDataset(dataset['test']['translation'])

train = next(iter(train_dataset))
train

{'input_ids': tensor([   62,  4414,    21,  8765, 13268,   488, 22086,    30,  2036,   824,
             0, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126,
         61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126,
         61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126,
         61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126,
         61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126,
         61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126,
         61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126,
         61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126,
         61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126,
         61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126,
         61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126,
         61126, 61126, 61126, 61126, 61

In [22]:
train.keys()

dict_keys(['input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask', 'labels'])

### Check GPU available momory

In [23]:
! nvidia-smi

zsh:1: command not found: nvidia-smi


# Train the model
### Define loss function and optimizer for the model

In [8]:
# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Move model to GPU (if available)
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print(device)
model = model.to(device)
model


mps


MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(61127, 512, padding_idx=61126)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(61127, 512, padding_idx=61126)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [26]:
# Define dataloaders for training and validation data
train_dataloader = DataLoader(train_dataset, batch_size=23, shuffle=True)
validation_dataloader = DataLoader(valid_dataset, batch_size=23, shuffle=True)
num_epochs = 1

for epoch in range(num_epochs):
    train_loss = 0.0
    valid_loss = 0.0

    # Training loop
    model.train()
    for batch_idx, batch in enumerate(tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}/{num_epochs}")):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        decoder_input_ids = batch["decoder_input_ids"].to(device)
        decoder_attention_mask = batch["decoder_attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        # Save the model every 100 batches
        if batch_idx % 100 == 0:
            torch.save(model.state_dict(), f"model_epoch_{epoch+1}_batch_{batch_idx}.pth")


    # Free up unused memory on the GPU
    torch.mps.empty_cache()

    # Validation loop
    model.eval()
    with torch.no_grad():
        for batch in tqdm(validation_dataloader, desc=f"Validation Epoch {epoch+1}/{num_epochs}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            decoder_input_ids = batch["decoder_input_ids"].to(device)
            decoder_attention_mask = batch["decoder_attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs =model(input_ids, attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, labels=labels)
            loss = outputs.loss
            valid_loss += loss.item()

    # Calculate average loss for training and validation sets
    train_loss = train_loss / len(train_dataloader)
    valid_loss = valid_loss / len(validation_dataloader)

    # Print loss for the current epoch
    print(f"Epoch {epoch+1} Train Loss: {train_loss:.3f} | Valid Loss: {valid_loss:.3f}")

Training Epoch 1/1: 100%|██████████| 72135/72135 [22:03:38<00:00,  1.10s/it]        
Validation Epoch 1/1: 100%|██████████| 23/23 [00:05<00:00,  4.48it/s]

Epoch 1 Train Loss: 0.498 | Valid Loss: 0.585





In [None]:
# Define dataloaders for training and validation data
train_dataloader = DataLoader(train_dataset, batch_size=23, shuffle=True)
validation_dataloader = DataLoader(valid_dataset, batch_size=23, shuffle=True)
num_epochs = 1

# Initialize the starting epoch and batch index
start_epoch = 0
start_batch_idx = 0

# Load the saved state if available
if os.path.exists("saved_state.pth"):
    saved_state = torch.load("saved_state.pth")
    model.load_state_dict(saved_state["model_state_dict"])
    optimizer.load_state_dict(saved_state["optimizer_state_dict"])
    start_epoch = saved_state["epoch"]
    start_batch_idx = saved_state["batch_idx"]

for epoch in range(start_epoch, num_epochs):
    train_loss = 0.0
    valid_loss = 0.0

    # Training loop
    model.train()
    for batch_idx, batch in enumerate(tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}/{num_epochs}"), start=start_batch_idx):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        decoder_input_ids = batch["decoder_input_ids"].to(device)
        decoder_attention_mask = batch["decoder_attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

        # Save the model and optimizer states every 100 batches
        if batch_idx % 100 == 0:
            saved_state = {
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "epoch": epoch,
                "batch_idx": batch_idx
            }
            torch.save(saved_state, "saved_state.pth")

        # Pause training if desired
        if batch_idx == 500:  # pause training at batch index 500
            print("Pausing training...")
            break

    # Free up unused memory on the GPU
    torch.mps.empty_cache()

    # Resume training from the last saved state
    if batch_idx == 500:
        print("Resuming training...")
        start_batch_idx = batch_idx + 1
        continue

    # Validation loop
    model.eval()
    with torch.no_grad():
        for batch in tqdm(validation_dataloader, desc=f"Validation Epoch {epoch+1}/{num_epochs}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            decoder_input_ids = batch["decoder_input_ids"].to(device)
            decoder_attention_mask = batch["decoder_attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, labels=labels)
            loss = outputs.loss
            valid_loss += loss.item()

    # Calculate average loss for training and validation sets
    train_loss = train_loss / len(train_dataloader)
    valid_loss = valid_loss / len(validation_dataloader)

    # Print loss for the current epoch
    print(f"Epoch {epoch+1} Train Loss: {train_loss:.3f} | Valid Loss: {valid_loss:.3f}")

# Translation

In [11]:
def generate_translation(model, input_text):
    input_tokens = Tokenizer(input_text, padding='max_length', truncation=True, max_length=128, return_tensors='pt').to(device)
    input_ids = input_tokens['input_ids'].to(device).to(device)
    attention_mask = input_tokens['attention_mask'].to(device).to(device)

    # Generate translation using the model
    generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128, num_beams=4, early_stopping=True)

    # Decode the generated ids and return the translation
    translation = Tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return translation

generate_translation(model, 'मेरा नाम श्रेयांश जयसवाल है, मैं क्राइस्ट यूनिवर्सिटी में पढ़ता हूं और मेरा रोल नंबर 2348558 और क्लास MSAIML है')

'My name is Gandhiswal, I read at Christian University and my Roll No. 234558 and the class is MSAIML'

# Evaluations

In [28]:
# Define dataloader for test data
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Generate translations for the test set
generated_translations = []
for batch in test_dataloader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    generated = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128, num_beams=4, early_stopping=True)
    generated_translations.extend([Tokenizer.decode(g, skip_special_tokens=True) for g in generated])

# Get reference translations for the test set
reference_translations = [[d['hi']] for d in dataset['test']['translation']]

# Calculate BLEU score
bleu_score = corpus_bleu(reference_translations, generated_translations)
print(f"BLEU score: {bleu_score:.2f}")

BLEU score: 0.02


In [30]:
model.save_pretrained('opus-mt-hi-en-model')
Tokenizer.save_pretrained('opus-mt-hi-en-tokenizer')

Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[61126]], 'forced_eos_token_id': 0}


('opus-mt-hi-en-tokenizer/tokenizer_config.json',
 'opus-mt-hi-en-tokenizer/special_tokens_map.json',
 'opus-mt-hi-en-tokenizer/vocab.json',
 'opus-mt-hi-en-tokenizer/source.spm',
 'opus-mt-hi-en-tokenizer/target.spm',
 'opus-mt-hi-en-tokenizer/added_tokens.json')

# Inference


In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

def Hindi_to_English(model, input_text):
    model = AutoModelForSeq2SeqLM.from_pretrained('opus-mt-hi-en-model')
    tokenizer = AutoTokenizer.from_pretrained('opus-mt-hi-en-tokenizer')
    inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True)
    
    # Move model and input tensors to MPS device
    device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')    
    model.to(device)
    inputs = inputs.to(device)
    
    outputs = model.generate(**inputs)
    translation = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return translation



In [3]:
Input = input("Enter your Hindi text: ")
print(f"Translation: {Hindi_to_English(model, Input)}")

Translation: ['Hello, how are you?']
