# Import the necessary packages

In [1]:
import torch
from torch import nn
from transformers import AutoModel
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSeq2SeqLM
from nltk.translate.bleu_score import corpus_bleu
import random
from tqdm import tqdm



# Load the dataset

In [2]:
dataset = load_dataset("cfilt/iitb-english-hindi")
dataset

Downloading:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading and preparing dataset json/default (download: 181.38 MiB, generated: 427.93 MiB, post-processed: Unknown size, total: 609.31 MiB) to /root/.cache/huggingface/datasets/parquet/cfilt--iitb-english-hindi-2cfae92395f2614b/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/190M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/85.7k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/cfilt--iitb-english-hindi-2cfae92395f2614b/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
})

# Load the Tokenizar and Pre trained Model

[opus-mt-en-hi](https://huggingface.co/Helsinki-NLP/opus-mt-en-hi)

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# TrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, EarlyStoppingCallback

model_name = "Helsinki-NLP/opus-mt-hi-en"
Tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Downloading tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading source.spm:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Downloading target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/304M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [4]:
# Define the function for translation
def translate(texts, max_length=512):
    # Tokenize the input texts
    inputs = Tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    
    # Generate the translations
    outputs = model.generate(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask)
    
    # Decode the translations
    translations = [Tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    
    return translations

In [5]:
# Get the total number of rows in the 'train' dataset
num_rows = dataset['train'].num_rows

# Get 5 random unique indices
index = random.sample(range(num_rows), 3)

# Extract the translations for the randomly selected indices
english_translations = [dataset['train']['translation'][i]['en'] for i in index]
hindi_translations = [dataset['train']['translation'][i]['hi'] for i in index]

In [6]:
# Hindi To English
Hindi_English = translate(hindi_translations)

for Hindi, English, translated in zip(hindi_translations,english_translations, Hindi_English):
    print(f"Hindi: {Hindi}")
    print(f"English: {English}")
    print(f"Translated: {translated}")
    # Calculate BLEU score
    bleu_score=corpus_bleu([English], [translated])
    print(f"BLEU score: {bleu_score:.2f}\n")

bleu_score=corpus_bleu(english_translations,Hindi_English)
print(f"BLEU Score:{bleu_score}")



Hindi: और तुम्हारे परवरदिगार की तरफ से तुम्हारे पास जो '' वही '' की जाती है (बस) उसी की पैरवी करो तुम लोग जो कुछ कर रहे हो खुदा उससे यक़ीनी अच्छा तरह आगाह है। 
English: And follow that which is revealed to you from your Lord. Indeed Allah is ever, with what you do, Acquainted.
Translated: And follow what is revealed to you from your Lord. Indeed, Allah is Acquainted with what you do.
BLEU score: 0.71

Hindi: किले के अंदर दो कमरों को गिरजाघरों में तब्दील कर दिया गया था, एक एंगलीकन धर्मोपासना के लिए और दूसरा रोमन कैथोलिक उपासना के लिए। 
English: Two rooms inside the fort had been converted into chapels, one for Anglican worship, the other for Roman Catholic.
Translated: Two rooms inside the castle had been transformed into rooms, one for the Andrenasa and the other for Roman Catholic worship.
BLEU score: 0.66

Hindi: Anjuta घर पेज (_ H) 
English: Anjuta _ Home Page
Translated: Anjuta _Home Page
BLEU score: 0.95

BLEU Score:0.7180390288467947


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


### Let's Finetune the model for better accuracy.

# Preprocessing

In [7]:
#dataset["train"]["translation"]['en']
En = dataset['train']["translation"][0]['en']
Hn = dataset['train']["translation"][0]['hi']

print(En)
Hindi = Tokenizer(En)
print(Hindi)
print('\n',Hn)
English = Tokenizer(Hn)
print(English)

Give your application an accessibility workout
{'input_ids': [2476, 3559, 78, 4315, 138, 50, 35623, 1420, 23, 654, 40280, 5991, 333, 4373, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

 अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें
{'input_ids': [62, 4414, 21, 8765, 13268, 488, 22086, 30, 2036, 824, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


### Convert the text data to vector using bytes-pair encoding

In [8]:
# Convert Hindi sentence to bytes and tokenize into individual tokens
Hindi_encoded_sentence = bytes(Hn, 'utf-8')
hindi_tokens = bytearray(Hindi_encoded_sentence)

# Convert Hindi tokens to a NumPy array
hindi_vector = torch.tensor(hindi_tokens)

# Print the vectors
print('Hindi :',Hindi_encoded_sentence)
print('Hindi tokens:',hindi_tokens)
print('Hindi vector:', hindi_vector)

# Convert English sentence to bytes and tokenize into individual tokens
English_encoded_sentence = bytes(En, 'utf-8')
print(English_encoded_sentence)
english_tokens = bytearray(English_encoded_sentence)

# Convert English tokens to a NumPy array
english_vector = torch.tensor(english_tokens)

# English
print('English :',English_encoded_sentence)
print('English Tokens:', english_tokens)
print('English vector:', english_vector)

Hindi : b'\xe0\xa4\x85\xe0\xa4\xaa\xe0\xa4\xa8\xe0\xa5\x87 \xe0\xa4\x85\xe0\xa4\xa8\xe0\xa5\x81\xe0\xa4\xaa\xe0\xa5\x8d\xe0\xa4\xb0\xe0\xa4\xaf\xe0\xa5\x8b\xe0\xa4\x97 \xe0\xa4\x95\xe0\xa5\x8b \xe0\xa4\xaa\xe0\xa4\xb9\xe0\xa5\x81\xe0\xa4\x82\xe0\xa4\x9a\xe0\xa4\xa8\xe0\xa5\x80\xe0\xa4\xaf\xe0\xa4\xa4\xe0\xa4\xbe \xe0\xa4\xb5\xe0\xa5\x8d\xe0\xa4\xaf\xe0\xa4\xbe\xe0\xa4\xaf\xe0\xa4\xbe\xe0\xa4\xae \xe0\xa4\x95\xe0\xa4\xbe \xe0\xa4\xb2\xe0\xa4\xbe\xe0\xa4\xad \xe0\xa4\xa6\xe0\xa5\x87\xe0\xa4\x82'
Hindi tokens: bytearray(b'\xe0\xa4\x85\xe0\xa4\xaa\xe0\xa4\xa8\xe0\xa5\x87 \xe0\xa4\x85\xe0\xa4\xa8\xe0\xa5\x81\xe0\xa4\xaa\xe0\xa5\x8d\xe0\xa4\xb0\xe0\xa4\xaf\xe0\xa5\x8b\xe0\xa4\x97 \xe0\xa4\x95\xe0\xa5\x8b \xe0\xa4\xaa\xe0\xa4\xb9\xe0\xa5\x81\xe0\xa4\x82\xe0\xa4\x9a\xe0\xa4\xa8\xe0\xa5\x80\xe0\xa4\xaf\xe0\xa4\xa4\xe0\xa4\xbe \xe0\xa4\xb5\xe0\xa5\x8d\xe0\xa4\xaf\xe0\xa4\xbe\xe0\xa4\xaf\xe0\xa4\xbe\xe0\xa4\xae \xe0\xa4\x95\xe0\xa4\xbe \xe0\xa4\xb2\xe0\xa4\xbe\xe0\xa4\xad \xe0\xa4\xa6\xe0\xa5\x87

In [9]:
# Define custom dataset class for machine translation
class TranslationDataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        # Get input and target sentences
        input_text = self.data[idx]['hi']
        target_text = self.data[idx]['en']
        
        # Tokenize input and target sentences
        input_tokens = Tokenizer(input_text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        target_tokens = Tokenizer(target_text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        
        # Define decoder input tokens by removing the last token from the target sequence
        decoder_input_tokens = {'input_ids': target_tokens['input_ids'][:, :-1].squeeze(),
                                'attention_mask': target_tokens['attention_mask'][:, :-1].squeeze()}
        
        # Define labels as the original target sequence, shifted by one position
        labels = {'input_ids': target_tokens['input_ids'][:, 1:].squeeze(),
                  'attention_mask': target_tokens['attention_mask'][:, 1:].squeeze()}
        
        # Return input, decoder input, and label tokens as PyTorch tensors
        return {'input_ids': input_tokens['input_ids'].squeeze(),
                'attention_mask': input_tokens['attention_mask'].squeeze(),
                'decoder_input_ids': decoder_input_tokens['input_ids'],
                'decoder_attention_mask': decoder_input_tokens['attention_mask'],
                'labels': labels['input_ids']}


# Load training data
train_dataset = TranslationDataset(dataset['train']['translation'])
valid_dataset = TranslationDataset(dataset['validation']['translation'])
test_dataset  = TranslationDataset(dataset['test']['translation'])

train = next(iter(train_dataset))
train

{'input_ids': tensor([   62,  4414,    21,  8765, 13268,   488, 22086,    30,  2036,   824,
             0, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126,
         61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126,
         61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126,
         61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126,
         61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126,
         61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126,
         61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126,
         61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126,
         61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126,
         61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126,
         61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126,
         61126, 61126, 61126, 61126, 61

In [10]:
train.keys()

dict_keys(['input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask', 'labels'])

### Check GPU available momory

In [11]:
! nvidia-smi

Thu Jun 13 16:24:55 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0              29W / 250W |      0MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                         

# Train the model 
### Define loss function and optimizer for the model

In [12]:
# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Move model to GPU (if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model = model.to(device)
model

cuda


MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(61127, 512, padding_idx=61126)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(61127, 512, padding_idx=61126)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLUActivation()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,),

In [13]:
# Define dataloaders for training and validation data
train_dataloader = DataLoader(train_dataset, batch_size=23, shuffle=True)
validation_dataloader = DataLoader(valid_dataset, batch_size=23, shuffle=True)
num_epochs = 1

for epoch in range(num_epochs):
    train_loss = 0.0
    valid_loss = 0.0
    
    # Training loop
    model.train()
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}/{num_epochs}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        decoder_input_ids = batch["decoder_input_ids"].to(device)
        decoder_attention_mask = batch["decoder_attention_mask"].to(device)
        labels = batch["labels"].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Free up unused memory on the GPU
    torch.cuda.empty_cache()
        
    # Validation loop
    model.eval()
    with torch.no_grad():
        for batch in tqdm(validation_dataloader, desc=f"Validation Epoch {epoch+1}/{num_epochs}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            decoder_input_ids = batch["decoder_input_ids"].to(device)
            decoder_attention_mask = batch["decoder_attention_mask"].to(device)
            labels = batch["labels"].to(device)
            
            outputs =model(input_ids, attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, labels=labels)
            loss = outputs.loss
            valid_loss += loss.item()
    
    # Calculate average loss for training and validation sets
    train_loss = train_loss / len(train_dataloader)
    valid_loss = valid_loss / len(validation_dataloader)
    
    # Print loss for the current epoch
    print(f"Epoch {epoch+1} Train Loss: {train_loss:.3f} | Valid Loss: {valid_loss:.3f}")

Training Epoch 1/1: 100%|██████████| 72135/72135 [5:19:53<00:00,  3.76it/s]
Validation Epoch 1/1: 100%|██████████| 23/23 [00:02<00:00,  9.65it/s]

Epoch 1 Train Loss: 0.529 | Valid Loss: 0.593





# Translation

In [14]:
def generate_translation(model, input_text):
    input_tokens = Tokenizer(input_text, padding='max_length', truncation=True, max_length=128, return_tensors='pt').to(device)
    input_ids = input_tokens['input_ids'].to(device).to(device)
    attention_mask = input_tokens['attention_mask'].to(device).to(device)
    
    # Generate translation using the model
    generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128, num_beams=4, early_stopping=True)
    
    # Decode the generated ids and return the translation
    translation = Tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return translation

generate_translation(model, 'कृत्रिम बुद्धिमत्ता मशीनों में मानव बुद्धिमत्ता का अनुकरण होता है, जिससे वे सीखना, तर्क करना, समस्या का समाधान करना, संवेदनशीलता और निर्णय जैसे मानव मनोवैज्ञानिक कार्य कर सकें।')

'Artificial▁intelligence in the artificial▁intelligence of human▁intelligence is an example of human▁intellectual activities, to learn, argue, solve the problem, to solve sensitiveness and decision.'

# Evaluations

In [15]:
# Define dataloader for test data
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Generate translations for the test set
generated_translations = []
for batch in test_dataloader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    generated = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128, num_beams=4, early_stopping=True)
    generated_translations.extend([Tokenizer.decode(g, skip_special_tokens=True) for g in generated])

# Get reference translations for the test set
reference_translations = [[d['hi']] for d in dataset['test']['translation']]

# Calculate BLEU score
bleu_score = corpus_bleu(reference_translations, generated_translations)
print(f"BLEU score: {bleu_score:.2f}")

BLEU score: 0.02


In [16]:
save_directory='/kaggle/working/'
model.save_pretrained(save_directory + 'opus-mt-hi-en')
Tokenizer.save_pretrained(save_directory + 'opus-mt-hi-en')

print(f"Models and tokenizers saved to {save_directory}")

Models and tokenizers saved to /kaggle/working/
