In [2]:
from transformers import pipeline

pipe = pipeline("translation", model="facebook/nllb-200-distilled-600M")

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M").to('cuda')

In [7]:
translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang="eng_Latn", tgt_lang='hin_Deva', max_length = 400)

In [8]:
import pandas as pd

In [15]:
df=pd.read_csv('./Bhagwad_Gita.csv')

In [16]:
df.head()

Unnamed: 0,ID,Chapter,Verse,Shloka,Transliteration,HinMeaning,EngMeaning,WordMeaning
0,BG1.1,1,1,धृतराष्ट्र उवाच |\nधर्मक्षेत्रे कुरुक्षेत्रे स...,dhṛtarāṣṭra uvāca .\ndharmakṣetre kurukṣetre s...,।।1.1।।धृतराष्ट्र ने कहा -- हे संजय ! धर्मभूमि...,1.1 Dhritarashtra said What did my people and...,1.1 धर्मक्षेत्रे on the holy plain? कुरुक्षेत्...
1,BG1.2,1,2,सञ्जय उवाच |\nदृष्ट्वा तु पाण्डवानीकं व्यूढं द...,sañjaya uvāca .\ndṛṣṭvā tu pāṇḍavānīkaṃ vyūḍha...,।।1.2।।संजय ने कहा -- पाण्डव-सैन्य की व्यूह रच...,1.2. Sanjaya said Having seen the army of the...,1.2 दृष्ट्वा having seen? तु indeed? पाण्डवानी...
2,BG1.3,1,3,पश्यैतां पाण्डुपुत्राणामाचार्य महतीं चमूम् |\n...,paśyaitāṃ pāṇḍuputrāṇāmācārya mahatīṃ camūm .\...,।।1.3।।हे आचार्य ! आपके बुद्धिमान शिष्य द्रुपद...,"1.3. ""Behold, O Teacher! this mighty army of t...",1.3 पश्य behold? एताम् this? पाण्डुपुत्राणाम् ...
3,BG1.4,1,4,अत्र शूरा महेष्वासा भीमार्जुनसमा युधि |\nयुयुध...,atra śūrā maheṣvāsā bhīmārjunasamā yudhi .\nyu...,।।1.4।।इस सेना में महान् धनुर्धारी शूर योद्धा ...,"1.4. Here are heroes, mighty archers, eal in b...",1.4 अत्र here? शूराः heroes? महेष्वासाः mighty...
4,BG1.5,1,5,धृष्टकेतुश्चेकितानः काशिराजश्च वीर्यवान् |\nपु...,dhṛṣṭaketuścekitānaḥ kāśirājaśca vīryavān .\np...,"।।1.5।।धृष्टकेतु, चेकितान, बलवान काशिराज, पुर...","1.5. ""Dhrishtaketu, chekitana and the valiant ...",1.5 धृष्टकेतुः Dhrishtaketu? चेकितानः Chekitan...


In [17]:
Shlokas=df['Shloka'].values
Hindi=df['HinMeaning'].values
English=df['EngMeaning'].values

In [18]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import nltk
import logging

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Install necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Define preprocessing and similarity functions
def preprocess(text):
    try:
        tokenizer = RegexpTokenizer(r'\w+')
        lemmatizer = WordNetLemmatizer()
        stop_words = set(stopwords.words('english'))

        words = tokenizer.tokenize(str(text))
        words = [lemmatizer.lemmatize(word.lower()) for word in words if word.isalpha()]
        words = [word for word in words if word not in stop_words]

        return ' '.join(words)
    except Exception as e:
        logging.error(f"Error in preprocessing text: {e}")
        return ''

def sentence_similarity(sentence1, sentence2):
    preprocessed_sentence1 = preprocess(sentence1)
    preprocessed_sentence2 = preprocess(sentence2)

    if not preprocessed_sentence1 or not preprocessed_sentence2:
        return 0.0  # Return 0 similarity for empty vocabulary

    try:
        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform([preprocessed_sentence1, preprocessed_sentence2])

        similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
        return similarity if not (similarity != similarity) else 0.0  # Check for NaN similarity
    except Exception as e:
        logging.error(f"Error processing record: {e}")
        return None  # Return None for problematic records

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [19]:
translatedEngHin=[]
sts_scores = []

for (shloka, hin) in zip(English, Hindi):
  prediction = translator(shloka)
  translation = prediction[0]['translation_text']
  print(f"English: {shloka}\nHindi: {translation}\n")
  translatedEngHin.append(translation)
  sts = sentence_similarity(hin, translation)
  sts_scores.append(sts)

English: 1.1 Dhritarashtra said  What did my people and the sons of Pandu do when they had assembled
together eager for battle on the holy plain of Kurukshetra, O Sanjaya.
Hindi: 1.1 धृतराष्ट्र ने कहा कि मेरे लोगों और पांडु के पुत्रों ने क्या किया जब वे कुरूक्षक्षेत्र के पवित्र मैदान पर युद्ध के लिए इकट्ठे हुए थे, हे संजय।

English: 1.2. Sanjaya said  Having seen the army of the Pandavas drawn up in battle-array,
King Duryodhana then approached his teacher (Drona) and spoke these words.
Hindi: संजय ने कहा कि युद्ध-शैली में पंडवों की सेना को तैयार करते हुए राजा दुर्योधन ने अपने गुरु (ड्रोना) के पास जाकर ये शब्द बोले।

English: 1.3. "Behold, O Teacher! this mighty army of the sons of Pandu,
arrayed by the son of Drupada, thy wise disciple.
Hindi: 1.3. "हे गुरु, देख, यह पांडु के पुत्रों की यह शक्तिशाली सेना, जो तेरे बुद्धिमान शिष्य द्रूपदा के पुत्र द्वारा तैयार है।

English: 1.4. Here are heroes, mighty archers, eal in battle to Bhima
and Arjuna, Yoyudhana (Satyaki), Virata and Drupada, o

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


English: 1.10. "This army of ours marshalled by Bhishma is insufficient,
whereas that army of theirs marshelled by Bhima is sufficient.
Hindi: 1.10. "भीष्मा के नेतृत्व में हमारी यह सेना अपर्याप्त है, जबकि भीमा के नेतृत्व में उनकी सेना अपर्याप्त है।

English: 1.11. "Therefore do ye all, stationed in your respective positions,
in the several divisions of the army, protect Bhishma alone."
Hindi: 1.11. "इसलिए आप सभी, अपनी-अपनी स्थिति में तैनात, सेना के विभिन्न विभागों में, केवल भीष्म की रक्षा करें। "

English: 1.12. His glorious grandsire (Bhishma), the oldest of the Kauravas,
in order to cheer Duryodhana, now roared like a lion, and blew his conch.
Hindi: 1.12. उनके गौरवशाली पोते (भीष्मा), कौरवों में सबसे पुराने, दुर्योधन को जयकार करने के लिए, अब शेर की तरह रोए और अपनी खाल उड़ा दी।

English: 1.13. Then (following Bhishma), conches and kettledrums, tabors,
drums and cow horns blared forth ite suddenly (from the Kaurava side)
and the sound was tremendous.
Hindi: 1.13। फिर (भीष्मा के बाद) अच

In [20]:
!pip install nltk



In [21]:
import nltk
from nltk.translate.bleu_score import corpus_bleu


In [22]:
references = [[hindi] for hindi in Hindi]  # Create reference list of lists
candidates = translatedEngHin  # Model translations


In [23]:
bleu_score = corpus_bleu(references, candidates)
print(f"BLEU score: {bleu_score:.4f}")

BLEU score: 0.3971


In [24]:
# Calculate the overall STS score
results ={"STS_Score":sts_scores}
df = pd.DataFrame(results)

overall_score = pd.Series(sts_scores).mean()
print(f"Overall STS Score: {overall_score}")

Overall STS Score: 0.2569502969535173
