In [18]:
generateCSVwithSamples = False
merge_csv_files = False
llm_vs_manual = True
llm_to_compare = "prometheus" # "prometheus" or "gemini"

# Read config json file
import json
import os
import sys
import pandas as pd
config_file = "config.json"
if not os.path.exists(config_file):
    print(f"Config file not found: {config_file}")
    sys.exit(1)
else:
    with open(config_file, 'r') as file:
        config = json.load(file)

models = config.get('models', [])
prompts = config.get('prompts', [])


In [19]:
import os
import pandas as pd


folder_federico_path = "manual_evaluate_translations_federico/"
folder_gianmarco_path = "manual_evaluate_translations_gianmarco/"

if generateCSVwithSamples:
    input_folder_path = "translations/"

    os.makedirs(folder_federico_path, exist_ok=True)
    os.makedirs(folder_gianmarco_path, exist_ok=True)

    for filename in os.listdir(input_folder_path):
        if filename.endswith('.csv'):
            input_path = os.path.join(input_folder_path, filename)
            df = pd.read_csv(input_path)
            
            # Extract first 10 rows
            first_10 = df.head(10)
            output_federico = os.path.join(folder_federico_path, filename)
            first_10.to_csv(output_federico, index=False)
            
            # Extract last 10 rows
            last_10 = df.tail(10)
            output_gianmarco = os.path.join(folder_gianmarco_path, filename)
            last_10.to_csv(output_gianmarco, index=False)


In [20]:
# Merge the csv files from both folders into a single CSV file
def merge_csv_files(folder_path1, folder_path2):
    merged_df = pd.DataFrame()
    
    for filename in os.listdir(folder_path1):
        if filename.endswith('.csv'):
            if filename in os.listdir(folder_path2):
                path1 = os.path.join(folder_path1, filename)
                path2 = os.path.join(folder_path2, filename)
                
                df1 = pd.read_csv(path1)
                df2 = pd.read_csv(path2)
                
                # Merge the two dataframes
                merged_df = pd.concat([merged_df, df1, df2], ignore_index=True)
                # Save as csv
                #new file name
                filename = filename.replace('translation', 'scored')
                output_path = os.path.join("scores_manual/", filename)
                os.makedirs(os.path.dirname(output_path), exist_ok=True)
                merged_df.to_csv(output_path, index=False)
                print(f"Merged {filename} from both folders and saved to {output_path}")
                # Clear the merged_df for the next file
                merged_df = pd.DataFrame()

if merge_csv_files:
    # Merge the CSV files from both folders
    merged_df = merge_csv_files(folder_federico_path, folder_gianmarco_path)


Merged scored_cerbero_base.csv from both folders and saved to scores_manual/scored_cerbero_base.csv
Merged scored_cerbero_detailed.csv from both folders and saved to scores_manual/scored_cerbero_detailed.csv
Merged scored_cerbero_few_shot.csv from both folders and saved to scores_manual/scored_cerbero_few_shot.csv
Merged scored_cerbero_role-based.csv from both folders and saved to scores_manual/scored_cerbero_role-based.csv
Merged scored_cerbero_teacher_student.csv from both folders and saved to scores_manual/scored_cerbero_teacher_student.csv
Merged scored_gemma_base.csv from both folders and saved to scores_manual/scored_gemma_base.csv
Merged scored_gemma_detailed.csv from both folders and saved to scores_manual/scored_gemma_detailed.csv
Merged scored_gemma_few_shot.csv from both folders and saved to scores_manual/scored_gemma_few_shot.csv
Merged scored_gemma_role-based.csv from both folders and saved to scores_manual/scored_gemma_role-based.csv
Merged scored_gemma_teacher_student.cs

In [21]:
import numpy as np

manual_eval_path = "scores_manual/"
llm_eval_path ="scores_prometheus/"

def compare_scores(manual_eval_path, llm_eval_path):
    # Check if the manual evaluation path exists
    if not os.path.exists(manual_eval_path):
        print(f"Manual evaluation path does not exist: {manual_eval_path}")
        return
    
    # Check if the LLM evaluation path exists
    if not os.path.exists(llm_eval_path):
        print(f"LLM evaluation path does not exist: {llm_eval_path}")
        return

    # dictiornary for each model and prompt in config there is a value counter, initialize that as 0
    counters = {model: {prompt: 0 for prompt in prompts} for model in models}
    # Variance for each model and prompt
    scores_variance = {model: {prompt: [] for prompt in prompts} for model in models}
    print("Counters initialized:", counters)

    if llm_vs_manual:
        # Load the manual evaluation scores
        manual_files = [f for f in os.listdir(manual_eval_path) if f.endswith('.csv')]
        llm_files = [f for f in os.listdir(llm_eval_path) if f.endswith('.csv')]

        for manual_file in manual_files:
            if manual_file in llm_files:
                manual_df = pd.read_csv(os.path.join(manual_eval_path, manual_file))
                llm_df = pd.read_csv(os.path.join(llm_eval_path, manual_file))
                # Extract model and prompt from the filename (file format is scored_model_prompt.csv)
                file = manual_file.replace('scored_', '').replace('.csv', '')
                model, prompt = file.split('_', 1)
                # Compare the item in manual_df with llm_df
                curr_variance = []
                for index, row in manual_df.iterrows():
                    
                    score_manual = row['Score']
                    
                    # Find the corresponding row in llm_df using the Sentence coulmn
                    llm_row = llm_df[llm_df['Sentence'] == row['Sentence']]
                    
                    if not llm_row.empty:
                        score_llm = llm_row['Score'].values[0]
                        # Compare scores
                        if score_manual == score_llm:
                            counters[model][prompt] += 1
                            # print(f"Manual score {score_manual} is equal to LLM score {score_llm} for model {model} and prompt {prompt}")
                        variance = (score_manual - score_llm) ** 2/ 2  # Variance calculation
                        curr_variance.append(variance) 
                        #print variance
                        # print(f"Manual score {score_manual}, LLM score {score_llm}, Variance: {variance:.4f} for model {model} and prompt {prompt}")
                scores_variance[model][prompt].append(np.mean(curr_variance))
    
    return counters, scores_variance

# # Print the final counters
# print("Final counters:")
# for model, prompts in counters.items():
#     for prompt, count in prompts.items():
#         print(f"Model: {model}, Prompt: {prompt}, Count: {count}")


counters, scores_variance = compare_scores(manual_eval_path, llm_eval_path)


#print counters readable format
print("\nCounters in a readable format:")
for model, prompts in counters.items():
    print(f"\nModel: {model}")
    for prompt, count in prompts.items():
        print(f"  Prompt: {prompt}, Count: {count}")

# Print the variance scores
print("\nVariance scores:")
for model, prompts in scores_variance.items():
    print(f"\nModel: {model}")
    for prompt, variances in prompts.items():
        if variances:
            print(f"  Prompt: {prompt}, Variance: {np.mean(variances):.4f}")
        else:
            print(f"  Prompt: {prompt}, Variance: No data")


# Insert all the data (variance and counters) in a csv Report.csv 
report_data = []


for model, prompts in counters.items():
    for prompt, count in prompts.items():
        #Float the variance for each model and prompt
        variance = np.mean(scores_variance[model][prompt])
        report_data.append({
            'Model': model,
            'Prompt': prompt,
            'Judge' : llm_eval_path.split('_')[1].replace('/', ''),  # Extract judge from path
            'Same_Scores': count,
            'Variance': variance
        })

llm_eval_path = "scores_gemini/"

counters, scores_variance = compare_scores(manual_eval_path, llm_eval_path)

for model, prompts in counters.items():
    for prompt, count in prompts.items():
        #Float the variance for each model and prompt
        variance = np.mean(scores_variance[model][prompt])
        report_data.append({
            'Model': model,
            'Prompt': prompt,
            'Judge' : llm_eval_path.split('_')[1].replace('/', ''),  # Extract judge from path
            'Same_Scores': count,
            'Variance': variance
        })

        

#print counters readable format
print("\nCounters in a readable format:")
for model, prompts in counters.items():
    print(f"\nModel: {model}")
    for prompt, count in prompts.items():
        print(f"  Prompt: {prompt}, Count: {count}")

# Print the variance scores
print("\nVariance scores:")
for model, prompts in scores_variance.items():
    print(f"\nModel: {model}")
    for prompt, variances in prompts.items():
        if variances:
            print(f"  Prompt: {prompt}, Variance: {np.mean(variances):.4f}")
        else:
            print(f"  Prompt: {prompt}, Variance: No data")



report_df = pd.DataFrame(report_data)
report_df.to_csv('report.csv', index=False)



Counters initialized: {'cerbero': {'base': 0, 'detailed': 0, 'few_shot': 0, 'role-based': 0, 'teacher_student': 0}, 'gemma': {'base': 0, 'detailed': 0, 'few_shot': 0, 'role-based': 0, 'teacher_student': 0}, 'llama3': {'base': 0, 'detailed': 0, 'few_shot': 0, 'role-based': 0, 'teacher_student': 0}}

Counters in a readable format:

Model: cerbero
  Prompt: base, Count: 6
  Prompt: detailed, Count: 4
  Prompt: few_shot, Count: 4
  Prompt: role-based, Count: 2
  Prompt: teacher_student, Count: 8

Model: gemma
  Prompt: base, Count: 5
  Prompt: detailed, Count: 3
  Prompt: few_shot, Count: 6
  Prompt: role-based, Count: 4
  Prompt: teacher_student, Count: 7

Model: llama3
  Prompt: base, Count: 5
  Prompt: detailed, Count: 7
  Prompt: few_shot, Count: 8
  Prompt: role-based, Count: 8
  Prompt: teacher_student, Count: 8

Variance scores:

Model: cerbero
  Prompt: base, Variance: 1.1500
  Prompt: detailed, Variance: 1.6500
  Prompt: few_shot, Variance: 1.3000
  Prompt: role-based, Variance: 1

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
