In [50]:
import os
import pandas as pd

def load_devtest_files(devtest_directory, specific_files):
    """Read the requested text files from a directory, one sentence per line.

    Args:
        devtest_directory: Directory that contains the files.
        specific_files: Iterable of file names to read from that directory.

    Returns:
        dict mapping each successfully read file name to a list of its lines,
        whitespace-stripped, with lines that are empty after stripping dropped.
        Files that do not exist or fail to load are reported on stdout and
        left out of the result.
    """
    loaded = {}

    for name in specific_files:
        full_path = os.path.join(devtest_directory, name)

        # Guard clause: report the missing file and move on to the next one.
        if not os.path.exists(full_path):
            print(f"File not found: {name}")
            continue

        try:
            with open(full_path, 'r', encoding='utf-8') as handle:
                raw_lines = handle.readlines()

            # Strip each line once, then keep only those with remaining content.
            stripped_lines = (line.strip() for line in raw_lines)
            kept = [text for text in stripped_lines if text]

            loaded[name] = kept
            print(f"Loaded: {name} with {len(kept)} sentences")
        except Exception as err:
            # Best-effort loading: one unreadable file must not abort the rest.
            print(f"Error loading {name}: {err}")

    return loaded


# Absolute path to the FLORES-200 devtest directory; edit this to match the local checkout.
devtest_directory = r'D:\Machine translation model\flores200_dataset\flores200_dataset\devtest'
# File names to load from that directory: the English file plus six other languages.
specific_files = [
    'eng_Latn.devtest',
    'hin_Deva.devtest',
    'mar_Deva.devtest',
    'guj_Gujr.devtest',
    'ben_Beng.devtest',
    'tel_Telu.devtest',
    'tam_Taml.devtest',
]


# Maps each file name that loaded successfully to its list of non-empty, stripped lines.
loaded_data = load_devtest_files(devtest_directory, specific_files)

Loaded: eng_Latn.devtest with 1012 sentences
Loaded: hin_Deva.devtest with 1012 sentences
Loaded: mar_Deva.devtest with 1012 sentences
Loaded: guj_Gujr.devtest with 1012 sentences
Loaded: ben_Beng.devtest with 1012 sentences
Loaded: tel_Telu.devtest with 1012 sentences
Loaded: tam_Taml.devtest with 1012 sentences


In [53]:
def save_random_samples_to_excel(
    loaded_data,
    sample_size=150,
    output_directory=r'D:\Machine translation model\Machine Translation Model\OUTPUT MT',
    random_state=42,
):
    """Write one Excel file of randomly sampled English/translation pairs per language.

    The English and translated sentences are paired by position first and the
    rows are then sampled together, so every output row holds a sentence and
    the translation found at the same line position. Sampling the two lists
    independently only yields matching positions when both lists have exactly
    the same length.

    Args:
        loaded_data: dict mapping file name to a list of sentences; must
            contain the key 'eng_Latn.devtest'. Every other key is treated
            as a translation of the English sentences, line for line.
        sample_size: Number of pairs to sample per language; capped at the
            number of English sentences.
        output_directory: Directory that receives the .xlsx files. It is
            created if it does not exist.
        random_state: Seed passed to pandas so the sample is reproducible.

    Returns:
        None. Files named parallel_dataset_eng_<code>.xlsx are written, where
        <code> is the part of the file name before the first underscore.

    Raises:
        KeyError: if 'eng_Latn.devtest' is missing from loaded_data.
    """
    english_sentences = loaded_data['eng_Latn.devtest']

    # to_excel does not create missing directories, so make sure the target exists.
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    sample_size = min(sample_size, len(english_sentences))

    for lang_file, lang_sentences in loaded_data.items():
        if lang_file == 'eng_Latn.devtest':
            continue

        min_len = min(len(english_sentences), len(lang_sentences))
        if min_len < sample_size:
            print(f"Not enough sentences to sample for {lang_file}. Available: {min_len}")
            continue

        if len(english_sentences) != len(lang_sentences):
            # Unequal lengths mean the files may not be aligned line by line.
            print(
                f"Warning: {lang_file} has {len(lang_sentences)} sentences but English has "
                f"{len(english_sentences)}; pairing the first {min_len} by position."
            )

        # Pair by position first, then sample whole rows so pairs cannot drift apart.
        aligned_df = pd.DataFrame({
            'English': list(english_sentences[:min_len]),
            'Translation': list(lang_sentences[:min_len]),
        })
        parallel_df = aligned_df.sample(n=sample_size, random_state=random_state).reset_index(drop=True)

        lang_code = lang_file.split('_')[0]
        excel_filename = os.path.join(output_directory, f'parallel_dataset_eng_{lang_code}.xlsx')
        parallel_df.to_excel(excel_filename, index=False)
        print(f'Saved: {excel_filename} with {len(parallel_df)} entries.')
# Write the per-language sample files using the function's default sample_size of 150.
save_random_samples_to_excel(loaded_data)


Saved: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_hin.xlsx with 150 entries.
Saved: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_mar.xlsx with 150 entries.
Saved: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_guj.xlsx with 150 entries.
Saved: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_ben.xlsx with 150 entries.
Saved: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_tel.xlsx with 150 entries.
Saved: D:\Machine translation model\Machine Translation Model\OUTPUT MT\parallel_dataset_eng_tam.xlsx with 150 entries.


In [None]:
import pandas as pd

def create_parallel_datasets(languages, combined_df, sample_size=150, random_state=42):
    """Build a sampled English/translation DataFrame for each requested language.

    Args:
        languages: Iterable of 'source_file' values, one per target language.
        combined_df: DataFrame with a 'source_file' column and a column
            labelled 0. Column 0 is used as the sentence text (assumed to be
            one sentence per row, in the same order for every source file;
            confirm against the cell that builds combined_df).
        sample_size: Maximum number of pairs to keep per language.
        random_state: Seed passed to pandas so the sample is reproducible.

    Returns:
        dict mapping each entry of languages to a DataFrame with columns
        'English' and 'Translation'. Rows are paired by position within each
        source file, rows with a missing value are dropped, and at most
        sample_size rows are sampled. The sampled frames keep their
        pre-sampling index.
    """
    # The English rows are identical for every language, so select them once.
    eng_all = combined_df[combined_df['source_file'] == 'eng_Latn.devtest'].reset_index(drop=True)

    parallel_datasets = {}
    for lang in languages:
        lang_all = combined_df[combined_df['source_file'] == lang].reset_index(drop=True)

        # Both frames have a fresh 0..n-1 index, so truncating to the common
        # length pairs the rows by position.
        min_len = min(len(eng_all), len(lang_all))
        parallel_df = pd.DataFrame({
            'English': eng_all.iloc[:min_len][0],
            'Translation': lang_all.iloc[:min_len][0],
        }).dropna()

        parallel_df = parallel_df.sample(
            n=min(sample_size, len(parallel_df)),
            random_state=random_state,
        )

        parallel_datasets[lang] = parallel_df

    return parallel_datasets

# 'source_file' values of the target languages to pair with English.
languages = [
    'hin_Deva.devtest',
    'mar_Deva.devtest',
    'guj_Gujr.devtest',
    'ben_Beng.devtest',
    'tel_Telu.devtest',
    'tam_Taml.devtest'
]

# NOTE(review): combined_df is not defined in any cell shown here; confirm where it is
# created, otherwise this cell fails with NameError on a fresh kernel.
parallel_datasets = create_parallel_datasets(languages, combined_df)

# Write one workbook per language; the file name is relative, so it lands in the
# current working directory.
for lang, df in parallel_datasets.items():
    lang_code = lang.split('_')[0]  # text before the first underscore, e.g. 'hin'
    excel_filename = f'randomMT_eng_{lang_code}.xlsx'
    df.to_excel(excel_filename, index=False)
    print(f'Saved: {excel_filename}')

In [59]:
import openai

# Read the key from the environment so no credential is ever stored in the notebook.
# Set OPENAI_API_KEY before starting the kernel; a missing variable raises KeyError here.
openai.api_key = os.environ['OPENAI_API_KEY']

def evaluate_translation(english_sentence, target_language, model="gpt-4o-mini", max_tokens=256):
    """Ask the chat model to translate one English sentence.

    Args:
        english_sentence: Sentence to translate.
        target_language: Target language as it should appear in the prompt.
        model: Chat model name passed to the API.
        max_tokens: Upper bound on generated tokens. The default is sized so
            that a full-sentence translation is not cut off; the previous
            fixed limit of 60 can truncate longer outputs, which would corrupt
            any comparison against the reference translation.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.
    """
    prompt = f"Translate the following sentence to {target_language}:\n\n{english_sentence}\n\nTranslation:"

    # NOTE(review): openai.ChatCompletion is the pre-1.0 SDK interface; confirm
    # the installed openai version still provides it.
    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
    )

    return response['choices'][0]['message']['content'].strip()

In [None]:
# Language names for the three-letter prefixes of the devtest file names. The
# bare prefix is an ambiguous prompt target ('tel', 'mar' and 'ben' also read
# as ordinary words), so the full name is sent to the model instead.
_LANGUAGE_NAMES = {
    'hin': 'Hindi',
    'mar': 'Marathi',
    'guj': 'Gujarati',
    'ben': 'Bengali',
    'tel': 'Telugu',
    'tam': 'Tamil',
}


def evaluate_parallel_datasets(loaded_data):
    """Translate every English sentence into each other loaded language.

    Args:
        loaded_data: dict mapping file name to a list of sentences; must
            contain 'eng_Latn.devtest' whenever any other file is present.
            Sentences are paired with the English list by position, and
            pairing stops at the shorter of the two lists.

    Returns:
        dict keyed by the part of each file name before the first underscore
        (for example 'hin'). Each value is a list of dicts with the keys
        'English', 'Translation' (the loaded reference sentence) and
        'GPT_Translation' (the result of evaluate_translation).
    """
    results = {}

    for lang_file, lang_sentences in loaded_data.items():
        if lang_file == 'eng_Latn.devtest':
            continue

        lang_code = lang_file.split('_')[0]
        # Unknown prefixes fall back to the code itself.
        target_language = _LANGUAGE_NAMES.get(lang_code, lang_code)

        # One evaluate_translation call per sentence pair.
        results[lang_code] = [
            {
                'English': english_sentence,
                'Translation': lang_translation,
                'GPT_Translation': evaluate_translation(english_sentence, target_language),
            }
            for english_sentence, lang_translation in zip(loaded_data['eng_Latn.devtest'], lang_sentences)
        ]

        print(f"Evaluated translations for {lang_code}: {len(results[lang_code])} sentences.")

    return results
# Runs evaluate_translation once per sentence pair for every non-English file in loaded_data.
evaluation_results = evaluate_parallel_datasets(loaded_data)


def save_evaluation_results(evaluation_results):
    """Write each language's evaluation records to its own Excel workbook.

    Args:
        evaluation_results: dict mapping a language code to a list of record
            dicts; each list becomes the rows of one workbook.

    Returns:
        None. A file named evaluation_results_<lang_code>.xlsx is written for
        every key, at a path relative to the current working directory.
    """
    for code in evaluation_results:
        target_path = f'evaluation_results_{code}.xlsx'
        pd.DataFrame(evaluation_results[code]).to_excel(target_path, index=False)
        print(f'Saved evaluation results to: {target_path}')

# Save the evaluation results: one evaluation_results_<lang_code>.xlsx per language,
# written at a path relative to the current working directory.
save_evaluation_results(evaluation_results)