In [4]:
#Loading the devtest files of Flores-200 datasets

import os
import pandas as pd

def load_devtest_files(devtest_directory, specific_files):
    all_data = {}

    for file in specific_files:
        file_path = os.path.join(devtest_directory, file)
        
        if os.path.exists(file_path):
            try:
 
                with open(file_path, 'r', encoding='utf-8') as f:
                    sentences = f.readlines()
                sentences = [sentence.strip() for sentence in sentences if sentence.strip()] 

                all_data[file] = sentences
                print(f"Loaded: {file} with {len(sentences)} sentences")
            except Exception as e:
                print(f"Error loading {file}: {e}")
        else:
            print(f"File not found: {file}")

    return all_data


devtest_directory = r'D:\Machine translation model\flores200_dataset\flores200_dataset\devtest'
specific_files = [
    'eng_Latn.devtest',
    'hin_Deva.devtest',
    'mar_Deva.devtest',
    'guj_Gujr.devtest',
    'ben_Beng.devtest',
    'tel_Telu.devtest',
    'tam_Taml.devtest',
]


loaded_data = load_devtest_files(devtest_directory, specific_files)

Loaded: eng_Latn.devtest with 1012 sentences
Loaded: hin_Deva.devtest with 1012 sentences
Loaded: mar_Deva.devtest with 1012 sentences
Loaded: guj_Gujr.devtest with 1012 sentences
Loaded: ben_Beng.devtest with 1012 sentences
Loaded: tel_Telu.devtest with 1012 sentences
Loaded: tam_Taml.devtest with 1012 sentences


In [5]:
#Machine translation from English to 5 Indian languages['hin', 'mar', 'guj', 'ben', 'tel', 'tam'] of 150 random samples
#Saving the data into excel files by parallel_dataset_eng_lang

def save_random_samples_to_excel(loaded_data, sample_size=150):
    english_sentences = loaded_data['eng_Latn.devtest']
    
    output_directory = r'D:\Machine translation model\Machine Translation Model\OUTPUT MT'

    sample_size = min(sample_size, len(english_sentences))

    for lang_file, lang_sentences in loaded_data.items():
        if lang_file != 'eng_Latn.devtest':
    
            min_len = min(len(english_sentences), len(lang_sentences))
            if min_len < sample_size:
                print(f"Not enough sentences to sample for {lang_file}. Available: {min_len}")
                continue
            
            english_sample = pd.Series(english_sentences).sample(n=sample_size, random_state=42).reset_index(drop=True)
            lang_sample = pd.Series(lang_sentences).sample(n=sample_size, random_state=42).reset_index(drop=True)

            parallel_df = pd.DataFrame({
                'English': english_sample,
                'Translation': lang_sample
            })

            lang_code = lang_file.split('_')[0]
            excel_filename = os.path.join(output_directory, f'parallel_dataset_eng_{lang_code}.xlsx')
            parallel_df.to_excel(excel_filename, index=False)
            print(f'Saved: {excel_filename} with {len(parallel_df)} entries.')
save_random_samples_to_excel(loaded_data)


Saved: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_hin.xlsx with 150 entries.
Saved: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_mar.xlsx with 150 entries.
Saved: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_guj.xlsx with 150 entries.
Saved: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_ben.xlsx with 150 entries.
Saved: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_tel.xlsx with 150 entries.
Saved: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_tam.xlsx with 150 entries.


In [8]:
#loading the saved excel file of Machine translation

import pandas as pd
import os

def load_parallel_samples(output_directory, languages):
    sampled_data = {}
    for lang_code in languages:
        excel_filename = os.path.join(output_directory, f'parallel_dataset_eng_{lang_code}.xlsx')
        if os.path.exists(excel_filename):
            df = pd.read_excel(excel_filename)
            sampled_data[lang_code] = df
            print(f"Loaded: {excel_filename} with {len(df)} entries.")
        else:
            print(f"File not found: {excel_filename}")
    return sampled_data

output_directory = r'D:\Machine translation model\Machine Translation Model\OUTPUT MT'
languages = ['hin', 'mar', 'guj', 'ben', 'tel', 'tam'] 
sampled_data = load_parallel_samples(output_directory, languages)

Loaded: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_hin.xlsx with 150 entries.
Loaded: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_mar.xlsx with 150 entries.
Loaded: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_guj.xlsx with 150 entries.
Loaded: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_ben.xlsx with 150 entries.
Loaded: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_tel.xlsx with 150 entries.
Loaded: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_tam.xlsx with 150 entries.


In [48]:
import cohere
from nltk.translate.bleu_score import sentence_bleu

cohere_api_key = 'cohere api key'

co = cohere.Client(cohere_api_key)

def evaluate_translation(english_sentence, actual_translation, target_language):
    prompt = f"Please provide a fluent and natural translation of the following English sentence into {target_language}. Only use the following languages: Hindi, Marathi, Bengali, Telugu, Gujarati, and Tamil:\n\n{english_sentence}\n\nTranslation:"

    
    try:
        response = co.generate(
            model='command-r-plus',  
            prompt=prompt,
            max_tokens=100, 
            temperature=0.7, 
        )
        cohere_translation = response.generations[0].text.strip()
        bleu_score = sentence_bleu([actual_translation.split()], cohere_translation.split())
        return bleu_score, cohere_translation
    
    except Exception as e:
        print(f"Error evaluating translation: {e}")
        return None, None


def evaluate_samples_and_save_to_excel(sampled_data, output_directory):
    for lang_code, data in sampled_data.items():
        target_language = lang_code.split('_')[0]
        english_sentences = data['English']
        actual_translations = data['Translation']

        bleu_scores = []
        cohere_translations = []

        for english_sentence, actual_translation in zip(english_sentences, actual_translations):
            bleu_score, cohere_translation = evaluate_translation(english_sentence, actual_translation, target_language)
            bleu_scores.append(bleu_score)
            cohere_translations.append(cohere_translation)

            print(f"Evaluated: {english_sentence} -> {cohere_translation} | BLEU Score: {bleu_score}")

        evaluation_df = pd.DataFrame({
            'English': english_sentences,
            'Actual_Translation': actual_translations,
            'Cohere_Translation': cohere_translations,
            'BLEU_Score': bleu_scores
        })

        excel_filename = os.path.join(output_directory, f'evaluation_eng_{lang_code}.xlsx')
        evaluation_df.to_excel(excel_filename, index=False)
        print(f"Saved evaluation results to: {excel_filename}")


evaluate_samples_and_save_to_excel(sampled_data, output_directory)

Evaluated: Vatican City's population is around 800. It is the smallest independent country in the world and the country with the lowest population. -> वेटिकन सिटी की आबादी लगभग 800 है। यह दुनिया का सबसे छोटा स्वतंत्र देश है और आबादी के हिसाब से सबसे छोटा देश है। | BLEU Score: 3.9435663395006287e-78
Evaluated: All citizens of Vatican City are Roman Catholic. -> वेटिकन सिटी के सभी नागरिक रोमन कैथोलिक हैं। | BLEU Score: 0.8408964152537145
Evaluated: It has a notably wide variety of plant communities, due to its range of microclimates, differing soils and varying levels of altitude. -> यहाँ माइक्रोक्लाइमेट्स, अलग-अलग मिट्टी और बदलती ऊंचाई के स्तर के कारण पाए जाने वाले विविध प्रकार के पादप समुदाय हैं। | BLEU Score: 4.777946850498661e-155
Evaluated: The Amazon River is the second longest and the biggest river on Earth. It carries more than 8 times as much water as the second biggest river. -> अमेज़न नदी पृथ्वी पर दूसरी सबसे लंबी और सबसे बड़ी नदी है। यह दूसरी सबसे बड़ी नदी से 8 गुना ज़्यादा प