In [28]:
from transformers import pipeline

pipe = pipeline("text2text-generation", model="facebook/m2m100_418M")

In [29]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_418M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/m2m100_418M").to('cuda')

In [30]:
en_texts = [
    "Life is like a chocolate box.",
    "The sun rises in the east.",
    "Knowledge is power.",
]

# translate English to Hindi
for en_text in en_texts:
    encoded_en = tokenizer(en_text, return_tensors="pt", padding=True, truncation=True).to('cuda')
    generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.get_lang_id("hi")).to('cuda')
    translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    print(f"English: {en_text}\nHindi: {translation}\n")


English: Life is like a chocolate box.
Hindi: जीवन एक चॉकलेट बॉक्स की तरह है।

English: The sun rises in the east.
Hindi: सूरज पूर्व में उगता है।

English: Knowledge is power.
Hindi: ज्ञान शक्ति है।



In [31]:
import pandas as pd

In [32]:
df=pd.read_csv('./Bhagwad_Gita.csv')

In [33]:
df.head()

Unnamed: 0,ID,Chapter,Verse,Shloka,Transliteration,HinMeaning,EngMeaning,WordMeaning
0,BG1.1,1,1,धृतराष्ट्र उवाच |\nधर्मक्षेत्रे कुरुक्षेत्रे स...,dhṛtarāṣṭra uvāca .\ndharmakṣetre kurukṣetre s...,।।1.1।।धृतराष्ट्र ने कहा -- हे संजय ! धर्मभूमि...,1.1 Dhritarashtra said What did my people and...,1.1 धर्मक्षेत्रे on the holy plain? कुरुक्षेत्...
1,BG1.2,1,2,सञ्जय उवाच |\nदृष्ट्वा तु पाण्डवानीकं व्यूढं द...,sañjaya uvāca .\ndṛṣṭvā tu pāṇḍavānīkaṃ vyūḍha...,।।1.2।।संजय ने कहा -- पाण्डव-सैन्य की व्यूह रच...,1.2. Sanjaya said Having seen the army of the...,1.2 दृष्ट्वा having seen? तु indeed? पाण्डवानी...
2,BG1.3,1,3,पश्यैतां पाण्डुपुत्राणामाचार्य महतीं चमूम् |\n...,paśyaitāṃ pāṇḍuputrāṇāmācārya mahatīṃ camūm .\...,।।1.3।।हे आचार्य ! आपके बुद्धिमान शिष्य द्रुपद...,"1.3. ""Behold, O Teacher! this mighty army of t...",1.3 पश्य behold? एताम् this? पाण्डुपुत्राणाम् ...
3,BG1.4,1,4,अत्र शूरा महेष्वासा भीमार्जुनसमा युधि |\nयुयुध...,atra śūrā maheṣvāsā bhīmārjunasamā yudhi .\nyu...,।।1.4।।इस सेना में महान् धनुर्धारी शूर योद्धा ...,"1.4. Here are heroes, mighty archers, eal in b...",1.4 अत्र here? शूराः heroes? महेष्वासाः mighty...
4,BG1.5,1,5,धृष्टकेतुश्चेकितानः काशिराजश्च वीर्यवान् |\nपु...,dhṛṣṭaketuścekitānaḥ kāśirājaśca vīryavān .\np...,"।।1.5।।धृष्टकेतु, चेकितान, बलवान काशिराज, पुर...","1.5. ""Dhrishtaketu, chekitana and the valiant ...",1.5 धृष्टकेतुः Dhrishtaketu? चेकितानः Chekitan...


In [34]:
Shlokas=df['Shloka'].values
Hindi=df['HinMeaning'].values
English=df['EngMeaning'].values

In [35]:
import numpy as np
import csv


In [36]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import nltk
import logging

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Install necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Define preprocessing and similarity functions
def preprocess(text):
    try:
        tokenizer = RegexpTokenizer(r'\w+')
        lemmatizer = WordNetLemmatizer()
        stop_words = set(stopwords.words('english'))

        words = tokenizer.tokenize(str(text))
        words = [lemmatizer.lemmatize(word.lower()) for word in words if word.isalpha()]
        words = [word for word in words if word not in stop_words]

        return ' '.join(words)
    except Exception as e:
        logging.error(f"Error in preprocessing text: {e}")
        return ''

def sentence_similarity(sentence1, sentence2):
    preprocessed_sentence1 = preprocess(sentence1)
    preprocessed_sentence2 = preprocess(sentence2)

    if not preprocessed_sentence1 or not preprocessed_sentence2:
        return 0.0  # Return 0 similarity for empty vocabulary

    try:
        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform([preprocessed_sentence1, preprocessed_sentence2])

        similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
        return similarity if not (similarity != similarity) else 0.0  # Check for NaN similarity
    except Exception as e:
        logging.error(f"Error processing record: {e}")
        return None  # Return None for problematic records

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [37]:
import time

start_time = time.time()
translatedEngHin = []
sts_scores = []

for (hin, shloka) in zip(Hindi[:100],English[:100]):
    encoded_en = tokenizer(shloka, return_tensors="pt", padding=True, truncation=True).to('cuda')
    generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.get_lang_id("hi")).to('cuda')
    translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    print(f"English: {shloka}\nHindi: {translation}\n")
    translatedEngHin.append(translation)
    sts = sentence_similarity(hin, translation)
    sts_scores.append(sts)

end_time = time.time()
total_time = end_time - start_time

print(f"Total time taken for translating first 100 lines: {total_time:.2f} seconds")


English: 1.1 Dhritarashtra said  What did my people and the sons of Pandu do when they had assembled
together eager for battle on the holy plain of Kurukshetra, O Sanjaya.
Hindi: 1. Dhritarashtra ने कहा कि मेरे लोगों और पंडू के पुत्रों ने क्या किया जब वे एक साथ इकट्ठा हुए थे और कुरक्सा के पवित्र मंजिल पर लड़ाई के लिए उत्सुक थे, O Sanjaya।

English: 1.2. Sanjaya said  Having seen the army of the Pandavas drawn up in battle-array,
King Duryodhana then approached his teacher (Drona) and spoke these words.
Hindi: सानियाया ने कहा कि पांडावस की सेना को लड़ाकू रस्सी में उठाकर देखा, राजा डुरीओडाना फिर अपने शिक्षक (ड्रोना) के पास आया और इन शब्दों को बोला।

English: 1.3. "Behold, O Teacher! this mighty army of the sons of Pandu,
arrayed by the son of Drupada, thy wise disciple.
Hindi: 1:3 "देखो, हे शिक्षक, पंडू के पुत्रों की यह शक्तिशाली सेना, ड्रूपादा के पुत्र, तेरे बुद्धिमान अनुयायियों द्वारा तैयार किया गया है।

English: 1.4. Here are heroes, mighty archers, eal in battle to Bhima
and Arjuna, 

In [38]:
file_path = '/content/data1.csv'
with open(file_path, 'w', newline="") as file:
    writer = csv.writer(file)
    writer.writerows(translatedEngHin)

In [39]:
translatedEngHin2 = []
cnt = 100
for (hin, shloka) in zip(Hindi[100:500],English[100:500]):
    encoded_en = tokenizer(shloka, return_tensors="pt", padding=True, truncation=True).to('cuda')
    generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.get_lang_id("hi")).to('cuda')
    translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    cnt = cnt + 1
    print(f"{cnt}\nEnglish: {shloka}\nHindi: {translation}\n")
    translatedEngHin2.append(translation)
    sts = sentence_similarity(hin, translation)
    sts_scores.append(sts)

101
English: 2.54 Arjuna said  What, O Krishna, is the description of him who has steady wisdom, and is merged in the superconscious state? How does one of steady wisdom speak, how does he sit, how does he walk?
Hindi: 54 आर्जोना ने कहा, हे क्रिश्ना, वह व्यक्ति का क्या वर्णन है जिसके पास स्थिर ज्ञान है, और वह सुपर-जागरूकता की स्थिति में मिश्रित है?

102
English: 2.55 The Blessed Lord said  When a man completely casts off, O Arjuna, all the desires of the mind and is satisfied in the Self by the Self, then is he said to be one of steady wisdom.
Hindi: 55 परमेश्वर ने कहा, “जब कोई आदमी, आर्जुना, मन की सभी इच्छाओं को पूरी तरह से खारिज कर देता है और स्वयं के द्वारा स्वयं में संतुष्ट हो जाता है, तो यह कहा जाता है कि वह स्थिर ज्ञान में से एक है।

103
English: 2.56 He whose mind is not shaken by adversity, who does not hanker after pleasures, and is free from attachment, fear and anger, is called a sage of steady wisdom.
Hindi: ﴾ 56 ﴿ और जिसका मन दुःख से घबरा हुआ नहीं है, और जिसका मन खुशी की त

In [40]:
file_path = '/content/data2.csv'
with open(file_path, 'w', newline="") as file:
    writer = csv.writer(file)
    writer.writerows(translatedEngHin2)

In [41]:
translatedEngHin3 = []
cnt = 500
for (hin, shloka) in zip(Hindi[500:], English[500:]):
    encoded_en = tokenizer(shloka, return_tensors="pt", padding=True, truncation=True).to('cuda')
    generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.get_lang_id("hi")).to('cuda')
    translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    cnt = cnt + 1
    print(f"{cnt}\nEnglish: {shloka}\nHindi: {translation}\n")
    translatedEngHin3.append(translation)
    sts = sentence_similarity(hin, translation)
    sts_scores.append(sts)

501
English: 13.12 Constancy in Self-knowledge, perception of the end of true knowledge  this is declared to be knowledge, and what is opposed to it is ignorance.
Hindi: 13.12 आत्म-ज्ञान में स्थिरता, सच्चे ज्ञान के अंत की धारणा यह ज्ञान के रूप में घोषित की जाती है, और इसके विपरीत अज्ञानता है।

502
English: 13.13 I will declare that which has to be known, knowing which one attains to immortality, the beginningless supreme Brahman, called neither being nor non-being.
Hindi: 13.13 मैं यह घोषणा करूँगा कि यह ज्ञात होना चाहिए, यह जानकर कि कौन अनमृत्युता तक पहुंचता है, शुरुआतहीन सर्वोच्च ब्रहमैन, जिसे न तो होना और न ही होना कहा जाता है।

503
English: 13.14 With hands and feet everywhere, with eyes, heads and mouths everywhere, with ears everywhere, He exists in the worlds enveloping all.
Hindi: 13:14 हर जगह हाथों और पैरों के साथ, हर जगह आँखों, सिरों और मुंहों के साथ, हर जगह कानों के साथ, वह सभी को घेरने की दुनिया में मौजूद है।

504
English: 13.15 Shining by the functions of all the senses, ye

In [42]:
file_path = '/content/data3.csv'
with open(file_path, 'w', newline="") as file:
    writer = csv.writer(file)
    writer.writerows(translatedEngHin3)

In [43]:
import csv
translatedEngHin = []

with open('/content/data1.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        translatedEngHin.append(row)

with open('/content/data2.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        translatedEngHin.append(row)

with open('/content/data3.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        translatedEngHin.append(row)

In [44]:
!pip install nltk



In [45]:
import nltk
from nltk.translate.bleu_score import corpus_bleu


In [46]:
references = [[hindi] for hindi in Hindi]  # Create reference list of lists
candidates = translatedEngHin  # Model translations

In [47]:
import numpy as np
from nltk.translate.bleu_score import sentence_bleu

bleu_scores = []
for i in range(len(references)):
    bleu_scores.append(sentence_bleu(references[i], candidates[i], weights=(0.25, 0.25, 0.25, 0.25)))

bleu_score = np.mean(bleu_scores)
print(f"BLEU score: {bleu_score:.4f}")


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU score: 0.2963


In [48]:
# Calculate the overall STS score
results ={"STS_Score":sts_scores}
df = pd.DataFrame(results)

overall_score = pd.Series(sts_scores).mean()
print(f"Overall STS Score: {overall_score}")

Overall STS Score: 0.18015356318127096
