In [11]:
#Loading the devtest files for Flores-200 datasets

import os
import pandas as pd

def load_devtest_files(devtest_directory, specific_files):
    """Read the requested devtest files into a ``{file name: sentences}`` dict.

    Args:
        devtest_directory: Directory that holds the devtest files.
        specific_files: File names (relative to ``devtest_directory``) to read.

    Returns:
        dict mapping each successfully read file name to its list of lines,
        with surrounding whitespace stripped and blank lines dropped.
        Missing or unreadable files are reported on stdout and left out.
    """
    sentences_by_file = {}

    for name in specific_files:
        path = os.path.join(devtest_directory, name)

        # Guard clause: report the missing file and move on to the next one.
        if not os.path.exists(path):
            print(f"File not found: {name}")
            continue

        try:
            with open(path, 'r', encoding='utf-8') as handle:
                raw_lines = handle.readlines()

            # Strip first, then keep only the lines that still have content.
            stripped_lines = (line.strip() for line in raw_lines)
            kept = [text for text in stripped_lines if text]

            sentences_by_file[name] = kept
            print(f"Loaded: {name} with {len(kept)} sentences")
        except Exception as err:
            print(f"Error loading {name}: {err}")

    return sentences_by_file


# Absolute local path to the FLORES-200 devtest split (Windows path, raw string).
devtest_directory = r'D:\Machine translation model\flores200_dataset\flores200_dataset\devtest'
# English source file plus the six target-language files to load.
specific_files = [
    'eng_Latn.devtest',
    'hin_Deva.devtest',
    'mar_Deva.devtest',
    'guj_Gujr.devtest',
    'ben_Beng.devtest',
    'tel_Telu.devtest',
    'tam_Taml.devtest',
]


# Maps each file name to its list of stripped, non-empty lines; later cells read this dict.
loaded_data = load_devtest_files(devtest_directory, specific_files)

Loaded: eng_Latn.devtest with 1012 sentences
Loaded: hin_Deva.devtest with 1012 sentences
Loaded: mar_Deva.devtest with 1012 sentences
Loaded: guj_Gujr.devtest with 1012 sentences
Loaded: ben_Beng.devtest with 1012 sentences
Loaded: tel_Telu.devtest with 1012 sentences
Loaded: tam_Taml.devtest with 1012 sentences


In [13]:
#Building parallel English -> Indian-language samples for 6 languages ['hin', 'mar', 'guj', 'ben', 'tel', 'tam'], 150 random sentence pairs each
#Saving each language pair to an Excel file named parallel_dataset_eng_<lang>.xlsx

def save_random_samples_to_excel(
    loaded_data,
    sample_size=150,
    output_directory=r'D:\Machine translation model\Machine Translation Model\OUTPUT MT',
):
    """Write one Excel file of randomly sampled sentence pairs per target language.

    The English sentences are read from ``loaded_data['eng_Latn.devtest']``; every
    other entry of ``loaded_data`` is treated as a translation of those sentences,
    paired by list position.

    Args:
        loaded_data: dict mapping file names to lists of sentences.
        sample_size: Number of pairs to sample; capped at the number of English
            sentences.
        output_directory: Directory that receives the
            ``parallel_dataset_eng_<lang>.xlsx`` files. It is created if missing.

    Raises:
        KeyError: if ``loaded_data`` has no ``'eng_Latn.devtest'`` entry.
    """
    english_sentences = loaded_data['eng_Latn.devtest']
    sample_size = min(sample_size, len(english_sentences))

    os.makedirs(output_directory, exist_ok=True)

    # Draw the row positions ONCE (seed kept at 42) and reuse them for every
    # language. Sampling the English and translation lists separately only pairs
    # the right rows when both lists happen to have the same length.
    sampled_positions = (
        pd.Series(range(len(english_sentences)))
        .sample(n=sample_size, random_state=42)
        .tolist()
    )
    english_sample = [english_sentences[pos] for pos in sampled_positions]

    for lang_file, lang_sentences in loaded_data.items():
        if lang_file == 'eng_Latn.devtest':
            continue

        min_len = min(len(english_sentences), len(lang_sentences))
        if min_len < sample_size:
            print(f"Not enough sentences to sample for {lang_file}. Available: {min_len}")
            continue

        # Pairing is by position, so a length mismatch means at least one line is
        # missing somewhere and the rows can no longer be trusted to line up.
        if len(lang_sentences) != len(english_sentences):
            print(
                f"Skipping {lang_file}: {len(lang_sentences)} sentences but English has "
                f"{len(english_sentences)}; the files are not line-aligned."
            )
            continue

        parallel_df = pd.DataFrame({
            'English': english_sample,
            'Translation': [lang_sentences[pos] for pos in sampled_positions],
        })

        lang_code = lang_file.split('_')[0]
        excel_filename = os.path.join(output_directory, f'parallel_dataset_eng_{lang_code}.xlsx')
        parallel_df.to_excel(excel_filename, index=False)
        print(f'Saved: {excel_filename} with {len(parallel_df)} entries.')
# Export one sample workbook per non-English entry of loaded_data, using the default sample_size.
save_random_samples_to_excel(loaded_data)


Saved: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_hin.xlsx with 150 entries.
Saved: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_mar.xlsx with 150 entries.
Saved: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_guj.xlsx with 150 entries.
Saved: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_ben.xlsx with 150 entries.
Saved: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_tel.xlsx with 150 entries.
Saved: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_tam.xlsx with 150 entries.


In [15]:
#Loading the saved parallel-sample Excel files (one per target language)

import pandas as pd
import os

def load_parallel_samples(output_directory, languages):
    """Load the ``parallel_dataset_eng_<code>.xlsx`` workbook for each language code.

    Args:
        output_directory: Directory that holds the workbooks.
        languages: Iterable of language codes used in the workbook file names.

    Returns:
        dict mapping each language code whose workbook exists to the DataFrame
        read from it. Codes without a workbook are reported on stdout and skipped.
    """
    frames_by_language = {}

    for code in languages:
        workbook_path = os.path.join(output_directory, f'parallel_dataset_eng_{code}.xlsx')

        if not os.path.exists(workbook_path):
            print(f"File not found: {workbook_path}")
            continue

        frame = pd.read_excel(workbook_path)
        frames_by_language[code] = frame
        print(f"Loaded: {workbook_path} with {len(frame)} entries.")

    return frames_by_language

# Directory holding the parallel_dataset_eng_<code>.xlsx workbooks (absolute local Windows path).
output_directory = r'D:\Machine translation model\Machine Translation Model\OUTPUT MT'
# Language codes as they appear in the workbook file names.
languages = ['hin', 'mar', 'guj', 'ben', 'tel', 'tam'] 

# {language code: DataFrame}; codes whose workbook is missing are absent from the dict.
sampled_data = load_parallel_samples(output_directory, languages)

Loaded: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_hin.xlsx with 150 entries.
Loaded: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_mar.xlsx with 150 entries.
Loaded: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_guj.xlsx with 150 entries.
Loaded: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_ben.xlsx with 150 entries.
Loaded: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_tel.xlsx with 150 entries.
Loaded: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_tam.xlsx with 150 entries.


In [17]:
import openai
import backoff
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import pandas as pd
import os
import time

# Prefer the OPENAI_API_KEY environment variable so the real key never has to be
# typed into (and saved with) the notebook; the original placeholder stays as the
# fallback when the variable is unset.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'API KEY')

def evaluate_translation(english_sentence, actual_translation, target_language):
    """Translate one English sentence with the model and score it against a reference.

    Args:
        english_sentence: Source sentence to translate.
        actual_translation: Reference translation; whitespace-tokenised for BLEU.
        target_language: Language named in the prompt.

    Returns:
        ``(bleu_score, gpt_translation)``. On any error the function prints a
        message and returns ``(0.0, "")`` instead of raising. When the model
        returns text but no usable reference exists, the translation is still
        returned with a score of ``0.0``.
    """
    prompt = (
        f"Translate the following English sentence into **{target_language} only**, ensuring the translation is natural and fluent. "
        "Please only use one of these languages: Hindi, Marathi, Bengali, Telugu, Gujarati, or Tamil. "
        # Trailing space added: without it the prompt read "...translation.Make sure...".
        "Avoid mixing any other languages or dialects in the translation. "
        "Make sure that the translation is grammatically correct and contextually appropriate.\n\n"
        f"English sentence: {english_sentence}\n"
        f"Translation in {target_language}:"
    )

    try:
        # generate_translation sends the same request (model, messages, max_tokens,
        # temperature) and additionally retries on rate-limit errors.
        gpt_translation = generate_translation(prompt)

        if not gpt_translation:
            return 0.0, gpt_translation

        # An empty Excel cell is read back as a float NaN, which has no .split();
        # keep the translation that was already fetched rather than discarding it.
        if not isinstance(actual_translation, str):
            print(f"No reference translation for sentence '{english_sentence}'; BLEU score set to 0.")
            return 0.0, gpt_translation

        # Apply smoothing for BLEU score calculation
        smoothie = SmoothingFunction().method4
        bleu_score = sentence_bleu(
            [actual_translation.split()],
            gpt_translation.split(),
            smoothing_function=smoothie,
        )
        return bleu_score, gpt_translation

    except Exception as e:
        print(f"Error evaluating translation for sentence '{english_sentence}': {e}")
        return 0.0, ""

@backoff.on_exception(backoff.expo, openai.error.RateLimitError, max_tries=5)
def generate_translation(prompt):
    """Send ``prompt`` to the chat model and return the stripped reply text.

    The decorator retries the call with exponential backoff when
    ``openai.error.RateLimitError`` is raised, for at most 5 tries.
    """
    conversation = [
        {"role": "system", "content": "You are a multilingual translator."},
        {"role": "user", "content": prompt},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=conversation,
        max_tokens=200,
        temperature=0.3,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content'].strip()


from concurrent.futures import ThreadPoolExecutor

def evaluate_translations_concurrently(english_sentences, actual_translations, target_language):
    """Run ``evaluate_translation`` over sentence pairs on a pool of 5 threads.

    Returns:
        A ``zip`` iterator over the transposed results: first the BLEU scores,
        then the translations, each in input order.
    """
    def _score(pair):
        source, reference = pair
        return evaluate_translation(source, reference, target_language)

    with ThreadPoolExecutor(max_workers=5) as pool:
        # Collect inside the ``with`` so every task has finished before returning.
        outcomes = list(pool.map(_score, zip(english_sentences, actual_translations)))

    return zip(*outcomes)  # Returns BLEU scores and translations


def evaluate_translation_until_improved(english_sentence, actual_translation, target_language, attempts=3):
    """Re-run ``evaluate_translation`` until it yields a BLEU score above zero.

    Args:
        english_sentence: Source sentence to translate.
        actual_translation: Reference translation used for scoring.
        target_language: Language named in the prompt.
        attempts: Maximum number of evaluations to run.

    Returns:
        ``(bleu_score, gpt_translation)`` of the first attempt that scored above
        zero, otherwise of the last attempt. With ``attempts <= 0`` nothing is
        evaluated and ``(0.0, "")`` is returned.
    """
    # Defaults cover attempts <= 0: the loop body never runs, and without them the
    # final return raised UnboundLocalError.
    bleu_score, gpt_translation = 0.0, ""

    for _ in range(attempts):
        bleu_score, gpt_translation = evaluate_translation(english_sentence, actual_translation, target_language)
        if bleu_score > 0:
            break

    return bleu_score, gpt_translation



def evaluate_samples_and_save_to_excel(sampled_data, output_directory):
    """Translate every sampled sentence, score it, and save one workbook per language.

    Args:
        sampled_data: dict mapping a language code to a DataFrame with
            ``'English'`` and ``'Translation'`` columns.
        output_directory: Directory that receives the
            ``evaluation_eng_<code>_gpt4ominiw.xlsx`` files.
    """
    # The prompt names languages in full ("Hindi", "Marathi", ...), so translate the
    # three-letter codes used as dict keys before passing them on. Codes not listed
    # here are passed through unchanged.
    language_names = {
        'hin': 'Hindi',
        'mar': 'Marathi',
        'guj': 'Gujarati',
        'ben': 'Bengali',
        'tel': 'Telugu',
        'tam': 'Tamil',
    }

    for lang_code, data in sampled_data.items():
        short_code = lang_code.split('_')[0]
        target_language = language_names.get(short_code, short_code)
        english_sentences = data['English']
        actual_translations = data['Translation']

        bleu_scores = []
        gpt_translations = []

        for english_sentence, actual_translation in zip(english_sentences, actual_translations):
            bleu_score, gpt_translation = evaluate_translation(english_sentence, actual_translation, target_language)
            bleu_scores.append(bleu_score)
            gpt_translations.append(gpt_translation)

            print(f"Evaluated: {english_sentence} -> {gpt_translation} | BLEU Score: {bleu_score}")
            # Pause between requests to stay clear of API rate limits.
            time.sleep(1)

        evaluation_df = pd.DataFrame({
            'English': english_sentences,
            'Actual_Translation': actual_translations,
            'GPT_Translation': gpt_translations,
            'BLEU_Score': bleu_scores
        })

        excel_filename = os.path.join(output_directory, f'evaluation_eng_{lang_code}_gpt4ominiw.xlsx')

        with pd.ExcelWriter(excel_filename, engine='xlsxwriter') as writer:
            evaluation_df.to_excel(writer, index=False, sheet_name='Evaluation')

        print(f"Saved evaluation results to: {excel_filename}")

# Translate and score every loaded sample set; the function sleeps 1 s after each sentence.
evaluate_samples_and_save_to_excel(sampled_data, output_directory)

Evaluated: Vatican City's population is around 800. It is the smallest independent country in the world and the country with the lowest population. -> वेटिकन सिटी की जनसंख्या लगभग 800 है। यह दुनिया का सबसे छोटा स्वतंत्र देश है और सबसे कम जनसंख्या वाला देश भी है। | BLEU Score: 0.3839817133079349
Evaluated: All citizens of Vatican City are Roman Catholic. -> वेटिकन सिटी के सभी नागरिक रोमन कैथोलिक हैं। | BLEU Score: 0.8408964152537145
Evaluated: It has a notably wide variety of plant communities, due to its range of microclimates, differing soils and varying levels of altitude. -> यहाँ विभिन्न सूक्ष्म जलवायु, भिन्न मिट्टियों और ऊँचाई के विभिन्न स्तरों के कारण पौधों के समुदायों की उल्लेखनीय विविधता है। | BLEU Score: 0.44441837360913394
Evaluated: The Amazon River is the second longest and the biggest river on Earth. It carries more than 8 times as much water as the second biggest river. -> अमेज़न नदी पृथ्वी की दूसरी सबसे लंबी और सबसे बड़ी नदी है। यह दूसरी सबसे बड़ी नदी से 8 गुना अधिक पानी 

In [28]:
import pandas as pd
import os

def load_specific_excel_files(input_directory, specific_files):
    """Load ``<name>.xlsx`` from ``input_directory`` for each requested name.

    Args:
        input_directory: Directory that holds the workbooks.
        specific_files: Workbook names without the ``.xlsx`` extension.

    Returns:
        dict mapping each name that was read successfully to its DataFrame.
        Missing or unreadable workbooks are reported on stdout and skipped.
    """
    frames = {}

    for stem in specific_files:
        workbook_path = os.path.join(input_directory, f'{stem}.xlsx')

        if not os.path.exists(workbook_path):
            print(f"File not found: {workbook_path}")
            continue

        try:
            frames[stem] = pd.read_excel(workbook_path)
            print(f"Loaded data from: {workbook_path}")
        except Exception as err:
            print(f"Error loading {workbook_path}: {err}")

    return frames

# Raw string: in an ordinary literal "\M" and "\O" are invalid escape sequences,
# which Python only tolerates with a warning; the path value itself is unchanged.
input_directory = r'D:\Machine translation model\Machine Translation Model\OUTPUT MT'
specific_files = ['evaluation_eng_ben_gpt4ominiw', 'evaluation_eng_mar_gpt4ominiw', 'evaluation_eng_tel_gpt4ominiw']
# NOTE: this rebinds `specific_files` and `loaded_data`, replacing the devtest
# values assigned by the first cell of the notebook.
loaded_data = load_specific_excel_files(input_directory, specific_files)

Loaded data from: D:\Machine translation model\Machine Translation Model\OUTPUT MT\evaluation_eng_ben_gpt4ominiw.xlsx
Loaded data from: D:\Machine translation model\Machine Translation Model\OUTPUT MT\evaluation_eng_mar_gpt4ominiw.xlsx
Loaded data from: D:\Machine translation model\Machine Translation Model\OUTPUT MT\evaluation_eng_tel_gpt4ominiw.xlsx


In [42]:
import pandas as pd
import openai
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Set your OpenAI API key through the OPENAI_API_KEY environment variable so it is
# not stored in the notebook; the original placeholder is the fallback when unset.
import os

openai.api_key = os.environ.get('OPENAI_API_KEY', 'API KEY')

def update_translations_with_zero_bleu(input_file, output_file):
    """Regenerate, once, every translation whose BLEU score is exactly 0.

    The target language is inferred from the evaluation file name contained in
    ``input_file`` (Bengali, Marathi or Telugu). Each zero-score row gets one new
    translation and a recomputed BLEU score, and the whole table is then written
    to ``output_file``. Nothing is written when a required column is missing, no
    row has a score of 0, or the file name is not recognised.

    NOTE(review): relies on ``regenerate_translation`` and ``calculate_bleu_score``,
    which this notebook defines in a later cell; that cell must have been run first.
    """
    df = pd.read_excel(input_file)

    # Show the header so a schema mismatch is easy to spot
    print("Columns in the Excel file:", df.columns)

    # Report the first required column that is absent, if any
    first_missing = next(
        (
            name
            for name in ('English', 'Actual_Translation', 'GPT_Translation', 'BLEU_Score')
            if name not in df.columns
        ),
        None,
    )
    if first_missing is not None:
        print(f"Error: Required column '{first_missing}' not found in the input file.")
        return

    zero_bleu_rows = df[df['BLEU_Score'] == 0]
    if zero_bleu_rows.empty:
        print("No rows with a BLEU score of 0 found.")
        return

    # File-name marker -> language; checked in this order, first match wins
    language_by_marker = {
        'evaluation_eng_ben_gpt4ominiw': 'Bengali',
        'evaluation_eng_mar_gpt4ominiw': 'Marathi',
        'evaluation_eng_tel_gpt4ominiw': 'Telugu',
    }
    target_language = next(
        (language for marker, language in language_by_marker.items() if marker in input_file),
        None,
    )
    if target_language is None:
        print("Error: Unrecognized input file for setting target language.")
        return

    for index, row in zero_bleu_rows.iterrows():
        english_sentence = row['English']
        gpt_translation = regenerate_translation(english_sentence, target_language)

        if not gpt_translation:
            print(f"Failed to regenerate translation for: {english_sentence}")
            continue

        new_bleu_score = calculate_bleu_score(row['Actual_Translation'], gpt_translation)

        # Write the new values back into the full table, not the filtered copy
        df.at[index, 'GPT_Translation'] = gpt_translation
        df.at[index, 'BLEU_Score'] = new_bleu_score
        print(f"Regenerated: {english_sentence} -> {gpt_translation} | New BLEU Score: {new_bleu_score}")

    df.to_excel(output_file, index=False)
    print(f"Updated data saved to: {output_file}")

# Example usage for multiple files: each entry pairs an evaluation workbook with the
# path its updated copy is written to (Bengali, Marathi and Telugu results).
files = [
    {
        'input_file': r'D:\Machine translation model\Machine Translation Model\OUTPUT MT\evaluation_eng_ben_gpt4ominiw.xlsx',
        'output_file': r'D:\Machine translation model\Machine Translation Model\OUTPUT MT\evaluation_eng_ben_gpt4ominiw_updated.xlsx'
    },
    {
        'input_file': r'D:\Machine translation model\Machine Translation Model\OUTPUT MT\evaluation_eng_mar_gpt4ominiw.xlsx',
        'output_file': r'D:\Machine translation model\Machine Translation Model\OUTPUT MT\evaluation_eng_mar_gpt4ominiw_updated.xlsx'
    },
    {
        'input_file': r'D:\Machine translation model\Machine Translation Model\OUTPUT MT\evaluation_eng_tel_gpt4ominiw.xlsx',
        'output_file': r'D:\Machine translation model\Machine Translation Model\OUTPUT MT\evaluation_eng_tel_gpt4ominiw_updated.xlsx'
    }
]

# The function infers the target language from each input file name.
for file in files:
    update_translations_with_zero_bleu(file['input_file'], file['output_file'])

Columns in the Excel file: Index(['English', 'Actual_Translation', 'GPT_Translation', 'BLEU_Score'], dtype='object')
Regenerated: During his trip, Iwasaki ran into trouble on many occasions. -> তার সফরের সময়, ইওয়াসাকি অনেকবার সমস্যায় পড়েছিল। | New BLEU Score: 0
Updated data saved to: D:\Machine translation model\Machine Translation Model\OUTPUT MT\evaluation_eng_ben_gpt4ominiw_updated.xlsx
Columns in the Excel file: Index(['English', 'Actual_Translation', 'GPT_Translation', 'BLEU_Score'], dtype='object')
Regenerated: Cuomo, 53, began his governorship earlier this year and signed a bill last month legalizing same-sex marriage. -> कुओमो, ५३, याने या वर्षाच्या सुरुवातीला आपल्या राज्यपालपदाची शपथ घेतली आणि गेल्या महिन्यात समलिंगी विवाहाला कायदेशीर मानणारा विधेयकावर स्वाक्षरी केली. | New BLEU Score: 0.2205432851826186
Updated data saved to: D:\Machine translation model\Machine Translation Model\OUTPUT MT\evaluation_eng_mar_gpt4ominiw_updated.xlsx
Columns in the Excel file: Index(['Engli

In [50]:
import pandas as pd
import openai
import random
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Read the key from the OPENAI_API_KEY environment variable so it is not stored in
# the notebook; the original placeholder is the fallback when the variable is unset.
import os

openai.api_key = os.environ.get('OPENAI_API_KEY', 'API KEY')

def regenerate_translation(english_sentence, target_language, previous_translation=None):
    """Ask the model for a new translation of ``english_sentence``.

    Args:
        english_sentence: Source sentence to translate.
        target_language: Language named in the prompt.
        previous_translation: Optional earlier attempt. When given, the model is
            asked to improve on it instead of translating from scratch.

    Returns:
        The stripped reply text, or ``""`` if the request raised (the error is
        printed).
    """
    if previous_translation:
        prompt = (
            f"Improve the following translation from English to **{target_language}**. "
            f"The previous translation was: '{previous_translation}'. "
            "Please ensure that the new translation is more fluent and accurate. "
            # Without this instruction the reply repeated the previous translation
            # and added an "Improved Translation in ...:" label, and that whole
            # text was stored and scored as the translation.
            "Reply with the new translation only: do not repeat the previous translation "
            "and do not add labels or commentary.\n\n"
            f"English sentence: {english_sentence}\n"
            f"New Translation in {target_language}:"
        )
    else:
        prompt = (
            f"Translate the following English sentence into **{target_language} only**, ensuring the translation is natural and fluent. "
            "Please only use one of these languages: Marathi, Bengali, Telugu. "
            "Make sure that the translation is grammatically correct and contextually appropriate.\n\n"
            f"English sentence: {english_sentence}\n"
            f"Translation in {target_language}:"
        )

    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a multilingual translator."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=200,
            # Small random jitter so repeated attempts are not sent with identical settings
            temperature=random.uniform(0.2, 0.3),
        )
        return response['choices'][0]['message']['content'].strip()
    except Exception as e:
        print(f"Error in translation: {e}")
        return ""

def calculate_bleu_score(actual_translation, gpt_translation):
    """Return the smoothed sentence-level BLEU of ``gpt_translation`` against the reference.

    Both texts are tokenised on whitespace. If either argument is not a string
    (for example the float NaN that pandas yields for an empty Excel cell), the
    score is ``0.0`` instead of raising ``AttributeError``.
    """
    if not isinstance(actual_translation, str) or not isinstance(gpt_translation, str):
        return 0.0

    smoothie = SmoothingFunction().method4
    return sentence_bleu([actual_translation.split()], gpt_translation.split(), smoothing_function=smoothie)

def update_translations_with_zero_bleu(input_file, output_file, target_language='Bengali', max_attempts=5):
    """Retry zero-BLEU translations until one scores above zero or attempts run out.

    NOTE: this redefines the function of the same name from an earlier cell of
    the notebook; once this cell has run, the earlier one-shot version is shadowed.

    Args:
        input_file: Evaluation workbook to read.
        output_file: Path the updated table is written to.
        target_language: Language passed to ``regenerate_translation``. Defaults
            to ``'Bengali'``, the value that used to be fixed inside the function.
        max_attempts: Upper bound on regeneration attempts per row (default 5).

    Returns:
        None. Nothing is written when a required column is missing or no row has
        a BLEU score of 0.
    """
    # Load the Excel file
    df = pd.read_excel(input_file)

    # Check the columns' names to adapt if needed
    print("Columns in the Excel file:", df.columns)

    # Ensure required columns exist
    required_columns = ['English', 'Actual_Translation', 'GPT_Translation', 'BLEU_Score']
    for col in required_columns:
        if col not in df.columns:
            print(f"Error: Required column '{col}' not found in the input file.")
            return

    # Identify rows with a BLEU score of 0
    zero_bleu_rows = df[df['BLEU_Score'] == 0]

    if zero_bleu_rows.empty:
        print("No rows with a BLEU score of 0 found.")
        return

    for index, row in zero_bleu_rows.iterrows():
        english_sentence = row['English']
        actual_translation = row['Actual_Translation']

        new_bleu_score = 0
        gpt_translation = ""
        attempts = 0
        # Each retry feeds the previous attempt back so the model is asked to improve it
        previous_translation = None

        # Regenerate until the BLEU score is above zero or the attempt budget is spent
        while new_bleu_score == 0 and attempts < max_attempts:
            attempts += 1
            gpt_translation = regenerate_translation(english_sentence, target_language, previous_translation)

            if gpt_translation:
                new_bleu_score = calculate_bleu_score(actual_translation, gpt_translation)

                print(f"Attempt {attempts}: {english_sentence} -> {gpt_translation} | New BLEU Score: {new_bleu_score}")

                previous_translation = gpt_translation
            else:
                print(f"Failed to regenerate translation for: {english_sentence}")
                break  # Exit if regeneration fails

        # Only overwrite the stored row when a strictly better score was obtained
        if new_bleu_score > 0:
            df.at[index, 'GPT_Translation'] = gpt_translation
            df.at[index, 'BLEU_Score'] = new_bleu_score

            print(f"Updated: {english_sentence} -> {gpt_translation} | Final BLEU Score: {new_bleu_score}")
        else:
            print(f"Max attempts reached for: {english_sentence}. Keeping previous translation.")

    # Save the updated DataFrame back to an Excel file
    df.to_excel(output_file, index=False)
    print(f"Updated data saved to: {output_file}")

# Run the update for the Bengali evaluation file only.
# The input is the "_updated" workbook written by the earlier one-shot pass; the
# result goes to a separate "_updated(1)" file, so the input is not overwritten.
input_file = r'D:\Machine translation model\Machine Translation Model\OUTPUT MT\evaluation_eng_ben_gpt4ominiw_updated.xlsx'
output_file = r'D:\Machine translation model\Machine Translation Model\OUTPUT MT\evaluation_eng_ben_gpt4ominiw_updated(1).xlsx'

update_translations_with_zero_bleu(input_file, output_file)

Columns in the Excel file: Index(['English', 'Actual_Translation', 'GPT_Translation', 'BLEU_Score'], dtype='object')
Attempt 1: During his trip, Iwasaki ran into trouble on many occasions. -> তার সফরের সময়, ইওয়াসাকি অনেকবার সমস্যায় পড়েছিল। | New BLEU Score: 0
Attempt 2: During his trip, Iwasaki ran into trouble on many occasions. -> তার সফরের সময়, ইওয়াসাকি অনেকবার সমস্যায় পড়েছিল। 

Improved Translation in Bengali: তার সফরের সময়, ইওয়াসাকির অনেকবার সমস্যার সম্মুখীন হতে হয়েছে। | New BLEU Score: 0.013038525962482981
Updated: During his trip, Iwasaki ran into trouble on many occasions. -> তার সফরের সময়, ইওয়াসাকি অনেকবার সমস্যায় পড়েছিল। 

Improved Translation in Bengali: তার সফরের সময়, ইওয়াসাকির অনেকবার সমস্যার সম্মুখীন হতে হয়েছে। | Final BLEU Score: 0.013038525962482981
Updated data saved to: D:\Machine translation model\Machine Translation Model\OUTPUT MT\evaluation_eng_ben_gpt4ominiw_updated(1).xlsx
