In [None]:
## Make sure to change path to the directory where you have cloned IndicTransTokenizer

# %pip install accelerate
# %pip install nltk
# %pip install mosestokenizer
# !pip install evaluate
# !pip install rouge_score
# % pip install  nltk
# !git clone https://github.com/VarunGumma/IndicTransTokenizer
# %cd IndicTransTokenizer
# !pip install --editable /path_to/IndicTransTokenizer 


In [3]:
def openRandomDataset(filepath):
    with open(filepath,'r',encoding='utf-8') as f:
        text = f.readlines()
        text = [x.strip().replace('\u200d', '') for x in text]
        
    return text

## Loading Hindi,English and Marathi Dataset

In [4]:
import os
hindi_1000 = openRandomDataset('../data/hindi_sentences.txt')
eng_1000 = openRandomDataset('../data/eng_sentences.txt')
mar_1000 = openRandomDataset('../data/marathi_sentences.txt')

In [5]:
en_indic = "ai4bharat/indictrans2-en-indic-dist-200M"  # ai4bharat/indictrans2-en-indic-dist-200M
indic_en = "ai4bharat/indictrans2-indic-en-dist-200M"  # ai4bharat/indictrans2-indic-en-dist-200M
indic_indic = "ai4bharat/indictrans2-indic-indic-dist-320M"  # ai4bharat/indictrans2-indic-indic-dist-320M

## Checking for GPU

In [6]:
import torch
print(torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

True


## Loading tokenizer , processor and model

In [7]:
import torch
from IndicTransTokenizer import IndicTransTokenizer, IndicProcessor
from transformers import AutoModelForSeq2SeqLM
def tokenizer_model_generate(address,direction):
    tokenizer = IndicTransTokenizer(direction=direction)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        address,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )
    model.to(device)
    return tokenizer,model

  from .autonotebook import tqdm as notebook_tqdm


## Generation different tokenizer for different models

In [8]:
indi_en_tokenizer,indi_en_model = tokenizer_model_generate(indic_en,'indic-en')
en_indi_tokenizer,en_indi_model = tokenizer_model_generate(en_indic,'en-indic')
indi_indi_tokenizer,indi_indi_model = tokenizer_model_generate(indic_indic,'indic-indic')

  return self.fget.__get__(instance, owner)()


In [9]:
ip = IndicProcessor(inference=True)


## Batching the data

In [10]:
def process_by_10(text_list,srclang,tgtlang,model,tokenizer):
    batch_size = 10
    num_batches = (len(text_list) + batch_size - 1) // batch_size
    processed_texts = []
    ip = IndicProcessor()

    for i in range(num_batches):
        batch_start = i * batch_size
        batch_end = min((i + 1) * batch_size, len(text_list))
        batch_texts = text_list[batch_start:batch_end]

        batch = ip.preprocess_batch(batch_texts, src_lang=srclang, tgt_lang=tgtlang)
        batch = tokenizer(batch, src=True, return_tensors="pt").to('cuda')

        with torch.inference_mode():
            outputs = model.generate(**batch, num_beams=5, num_return_sequences=1, max_length=256)

        outputs = tokenizer.batch_decode(outputs, src=False)
        outputs = ip.postprocess_batch(outputs, lang=srclang)
        processed_texts.extend(outputs)

        # Free up memory if needed
        del batch, outputs

    return processed_texts


## Translating the randomly selected data

In [11]:
import time
curr_time = time.time()
translations_hi_eng = process_by_10(hindi_1000,srclang="hin_Deva",tgtlang="eng_Latn",model=indi_en_model,tokenizer=indi_en_tokenizer)
print("Time taken for 1000 translations: ", time.time()-curr_time)
curr_time = time.time()
translations_eng_hi = process_by_10(eng_1000,srclang="eng_Latn",tgtlang="hin_Deva",model=en_indi_model,tokenizer=en_indi_tokenizer)
print("Time taken for 1000 translations: ", time.time()-curr_time)
curr_time = time.time()
translations_mar_hi = process_by_10(mar_1000,srclang="mar_Deva",tgtlang="hin_Deva",model=indi_indi_model,tokenizer=indi_indi_tokenizer)
print("Time taken for 1000 translations: ", time.time()-curr_time)
curr_time = time.time()
translations_hi_mar = process_by_10(hindi_1000,srclang="hin_Deva",tgtlang="mar_Deva",model=indi_indi_model,tokenizer=indi_indi_tokenizer)
print("Time taken for 1000 translations: ", time.time()-curr_time)


Time taken for 1000 translations:  168.82941794395447
Time taken for 1000 translations:  255.84121775627136
Time taken for 1000 translations:  380.68901014328003
Time taken for 1000 translations:  521.7725546360016


## Saving these files

In [19]:
with open("../IndiTrans/indi_eng_hindi.txt", "w",encoding='utf-8') as txt_file:
    for line in translations_eng_hi:
        txt_file.write(line+'\n')
with open("../IndiTrans/indi_hindi_eng.txt", "w",encoding='utf-8') as txt_file:
    for line in translations_hi_eng:
        txt_file.write(line+'\n')
with open("../IndiTrans/indi_hindi_mar.txt", "w",encoding='utf-8') as txt_file:
    for line in translations_hi_mar:
        txt_file.write(line+'\n')
with open("../IndiTrans/indi_mar_hindi.txt", "w",encoding='utf-8') as txt_file:
    for line in translations_mar_hi:
        txt_file.write(line+'\n')

## BLEU and Rouge Score calculation

In [27]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
def bleu_score(ref,pred):
    score = 0
    cc = SmoothingFunction()
    for i in range(len(ref)):
       references = ref[i].split()
       candidate = pred[i].split() 
       score += sentence_bleu([references], candidate,smoothing_function=cc.method2)      
    return score/len(ref)

def rouge_score(ref,pred):
    rougeS = evaluate.load('rouge')
    rouge = rougeS.compute(references=ref, predictions=pred)
    return rouge

print("Marahti to Hindi")
print("BLEU:",bleu_score(hindi_1000,translations_mar_hi))
print(rouge_score(hindi_1000,translations_mar_hi))

print("Hindi to Marathi")
print("BLEU:",bleu_score(mar_1000,translations_hi_mar))
print(rouge_score(mar_1000,translations_hi_mar))

print("Hindi to English")
print("BLEU:",bleu_score(eng_1000,translations_hi_eng))
print(rouge_score(eng_1000,translations_hi_eng))

print("English to Hindi")
print("BLEU:",bleu_score(hindi_1000,translations_eng_hi))
print(rouge_score(hindi_1000,translations_eng_hi))

Marahti to Hindi
BLEU: 0.275443319206238
{'rouge1': 0.10458888888888888, 'rouge2': 0.022007142857142857, 'rougeL': 0.10369047619047622, 'rougeLsum': 0.10337976190476192}
Hindi to Marathi
BLEU: 0.20845461363921827
{'rouge1': 0.08528809523809525, 'rouge2': 0.018333333333333333, 'rougeL': 0.08428214285714285, 'rougeLsum': 0.08398809523809524}
Hindi to English
BLEU: 0.4004007746985462
{'rouge1': 0.7224173762505202, 'rouge2': 0.5128040839047556, 'rougeL': 0.676571144630105, 'rougeLsum': 0.6762574177693983}
English to Hindi
BLEU: 0.3284391709890271
{'rouge1': 0.11264029304029305, 'rouge2': 0.022548484848484845, 'rougeL': 0.11104047619047622, 'rougeLsum': 0.11068690476190479}
