# Prepare Manual Evaluation and Compare

## Config

In [None]:
# Stage toggles read by the cells below.
generateCSVwithSamples = False  # when True, the head/tail samples for the two annotators are written
merge_csv_files = False  # meant to gate the merge step; NOTE(review): a later `def merge_csv_files` rebinds this name
llm_vs_manual = True  # when False, compare_scores returns zero counters and empty variance lists
llm_to_compare = "gemini" # "prometheus" or "gemini"; NOTE(review): not referenced in the visible code

# Read config json file
import json
import os
import sys
import pandas as pd
config_file = "config.json"

# Stop right away when the configuration is missing: every later cell relies
# on the model and prompt lists loaded here.
if not os.path.exists(config_file):
    print(f"Config file not found: {config_file}")
    sys.exit(1)

# JSON files are UTF-8 encoded; naming the encoding avoids depending on the
# platform default (which is not UTF-8 on every system).
with open(config_file, 'r', encoding='utf-8') as file:
    config = json.load(file)

# Missing keys fall back to empty lists, so the cells below simply have
# nothing to iterate over.
models = config.get('models', [])
prompts = config.get('prompts', [])


## Prepare Manual Evaluation

In [None]:
import os
import pandas as pd


# Destination folders, one per human annotator.
folder_federico_path = "csvFiles/manual_evaluate_translations_federico/"
folder_gianmarco_path = "csvFiles/manual_evaluate_translations_gianmarco/"

if generateCSVwithSamples:
    input_folder_path = "csvFiles/translations/"

    # Make sure both annotator folders exist before anything is written.
    for annotator_folder in (folder_federico_path, folder_gianmarco_path):
        os.makedirs(annotator_folder, exist_ok=True)

    for filename in os.listdir(input_folder_path):
        # Only translation CSVs are sampled; anything else is ignored.
        if not filename.endswith('.csv'):
            continue

        input_path = os.path.join(input_folder_path, filename)
        df = pd.read_csv(input_path)

        # Each annotator receives a file with the same name as the source.
        output_federico = os.path.join(folder_federico_path, filename)
        output_gianmarco = os.path.join(folder_gianmarco_path, filename)

        # The first annotator gets the first 10 rows, the second the last 10.
        first_10 = df.head(10)
        last_10 = df.tail(10)

        first_10.to_csv(output_federico, index=False)
        last_10.to_csv(output_gianmarco, index=False)


In [None]:
# Merge the csv files from both folders into a single CSV file
def merge_csv_files(folder_path1, folder_path2, output_folder="csvFiles/scores_manual/"):
    """Concatenate same-named CSV files from two folders and save the result.

    For every ``.csv`` file of ``folder_path1`` that also exists in
    ``folder_path2``, the rows of the first file are followed by the rows of
    the second one and written to ``output_folder``. The output file name is
    the input name with ``'translation'`` replaced by ``'scored'``.

    NOTE(review): this definition reuses the name of the boolean
    ``merge_csv_files`` flag set in the config cell, so the flag is no longer
    reachable once this cell has run.

    Args:
        folder_path1: Folder whose CSV files drive the merge.
        folder_path2: Folder searched for files with the same names.
        output_folder: Destination folder, created on demand. Defaults to
            the manual-scores folder used by the comparison cells.

    Returns:
        None. The merged files are the only result.
    """
    # List the second folder once instead of once per candidate file.
    names_in_second = set(os.listdir(folder_path2))

    # Sorted so files are processed (and logged) in a reproducible order.
    for filename in sorted(os.listdir(folder_path1)):
        if not filename.endswith('.csv') or filename not in names_in_second:
            continue

        df1 = pd.read_csv(os.path.join(folder_path1, filename))
        df2 = pd.read_csv(os.path.join(folder_path2, filename))

        # Concatenate only the two real frames: seeding the concat with an
        # empty DataFrame is deprecated in pandas and can affect dtypes.
        merged_df = pd.concat([df1, df2], ignore_index=True)

        scored_name = filename.replace('translation', 'scored')
        output_path = os.path.join(output_folder, scored_name)
        os.makedirs(output_folder, exist_ok=True)
        merged_df.to_csv(output_path, index=False)
        print(f"Merged {scored_name} from both folders and saved to {output_path}")

# NOTE(review): at this point `merge_csv_files` refers to the function defined
# above, not to the boolean flag from the config cell, so this condition is
# always true and the merge runs regardless of the flag. Renaming the flag
# (or the function) would make the toggle effective again.
if merge_csv_files:
    # Merge the CSV files from both folders
    merged_df = merge_csv_files(folder_federico_path, folder_gianmarco_path)


## Compare Manual vs LLM-Judge scores


In [None]:
import numpy as np

# Folder with the merged manual scores (the merge step writes into it).
manual_eval_path = "csvFiles/scores_manual/"
# Folder with the first judge's scores; the name is reassigned to the Gemini folder further down.
llm_eval_path ="csvFiles/scores_prometheus/"

def compare_scores(manual_eval_path, llm_eval_path):
    """Compare manual scores with LLM-judge scores, file by file.

    Files are paired by name; a name is expected to look like
    ``scored_<model>_<prompt>.csv``. Rows are paired through the ``Sentence``
    column (first match in the LLM file) and their ``Score`` values compared.

    Reads the module-level ``models`` and ``prompts`` lists and the
    ``llm_vs_manual`` flag.

    Args:
        manual_eval_path: Folder with the manually scored CSV files.
        llm_eval_path: Folder with the CSV files scored by the LLM judge.

    Returns:
        A ``(counters, scores_variance)`` tuple of dictionaries indexed as
        ``[model][prompt]``. ``counters`` holds the number of sentences with
        identical scores; ``scores_variance`` holds a list with the mean
        per-sentence variance of each compared file. When a folder is
        missing or ``llm_vs_manual`` is false, the zero-initialised
        dictionaries are returned so callers can always unpack the result.
    """
    # One agreement counter and one list of variances per (model, prompt)
    # pair from the config.
    counters = {model: {prompt: 0 for prompt in prompts} for model in models}
    scores_variance = {model: {prompt: [] for prompt in prompts} for model in models}

    # Check if the manual evaluation path exists
    if not os.path.exists(manual_eval_path):
        print(f"Manual evaluation path does not exist: {manual_eval_path}")
        return counters, scores_variance

    # Check if the LLM evaluation path exists
    if not os.path.exists(llm_eval_path):
        print(f"LLM evaluation path does not exist: {llm_eval_path}")
        return counters, scores_variance

    print("Counters initialized:", counters)

    if not llm_vs_manual:
        return counters, scores_variance

    llm_files = {f for f in os.listdir(llm_eval_path) if f.endswith('.csv')}

    # Sorted so the files are processed in a reproducible order.
    for manual_file in sorted(os.listdir(manual_eval_path)):
        if not manual_file.endswith('.csv') or manual_file not in llm_files:
            continue

        # Extract model and prompt from the file name; the prompt part may
        # itself contain underscores, so only the first one is a separator.
        stem = manual_file.replace('scored_', '').replace('.csv', '')
        model, _, prompt = stem.partition('_')
        if model not in counters or prompt not in counters[model]:
            # Files for pairs that are not in the config are ignored
            # instead of aborting the whole comparison.
            print(f"Skipping {manual_file}: model/prompt not listed in the config")
            continue

        manual_df = pd.read_csv(os.path.join(manual_eval_path, manual_file))
        llm_df = pd.read_csv(os.path.join(llm_eval_path, manual_file))

        pair_variances = []
        for _, row in manual_df.iterrows():
            # Find the corresponding row in llm_df using the Sentence column
            llm_rows = llm_df[llm_df['Sentence'] == row['Sentence']]
            if llm_rows.empty:
                continue

            score_manual = row['Score']
            score_llm = llm_rows['Score'].values[0]
            if score_manual == score_llm:
                counters[model][prompt] += 1
            # Sample variance of the two scores: (a - b)^2 / 2.
            pair_variances.append((score_manual - score_llm) ** 2 / 2)

        # Without any matched sentence there is nothing to average; leaving
        # the list untouched avoids storing NaN for this file.
        if pair_variances:
            scores_variance[model][prompt].append(float(np.mean(pair_variances)))

    return counters, scores_variance

def _judge_name(eval_path):
    """Return the judge label encoded in a ``.../scores_<judge>/`` folder path."""
    folder = os.path.basename(os.path.normpath(eval_path))
    return folder.split('_', 1)[-1]


def _mean_or_nan(values):
    """Return the mean of ``values`` as a float, or NaN when there are none."""
    return float(np.mean(values)) if len(values) else float('nan')


def _print_comparison(counters, scores_variance):
    """Print the agreement counters and the variances in a readable format."""
    print("\nCounters in a readable format:")
    for model, prompt_counts in counters.items():
        print(f"\nModel: {model}")
        for prompt, count in prompt_counts.items():
            print(f"  Prompt: {prompt}, Count: {count}")

    print("\nVariance scores:")
    for model, prompt_variances in scores_variance.items():
        print(f"\nModel: {model}")
        for prompt, variances in prompt_variances.items():
            if variances:
                print(f"  Prompt: {prompt}, Variance: {np.mean(variances):.4f}")
            else:
                print(f"  Prompt: {prompt}, Variance: No data")


def _report_rows(judge, counters, scores_variance):
    """Build one report row per (model, prompt) pair for the given judge."""
    return [
        {
            'Model': model,
            'Prompt': prompt,
            'Judge': judge,
            'Same_Scores': count,
            'Variance': _mean_or_nan(scores_variance[model][prompt]),
        }
        for model, prompt_counts in counters.items()
        for prompt, count in prompt_counts.items()
    ]


# Insert all the data (variance and counters) in csvFiles/report.csv
report_data = []

# The first judge is whatever llm_eval_path currently points to, the second is
# Gemini. Both are handled by the same code path, and the loop variables no
# longer overwrite the module-level `prompts` list that compare_scores reads.
for llm_eval_path in (llm_eval_path, "csvFiles/scores_gemini/"):
    comparison = compare_scores(manual_eval_path, llm_eval_path)
    if comparison is None:
        # compare_scores has already reported which folder is missing.
        continue
    counters, scores_variance = comparison
    _print_comparison(counters, scores_variance)
    report_data.extend(_report_rows(_judge_name(llm_eval_path), counters, scores_variance))

report_df = pd.DataFrame(report_data)
report_df.to_csv('csvFiles/report.csv', index=False)



## Kappa coefficient

In [None]:
import os
import pandas as pd
from sklearn.metrics import cohen_kappa_score
import numpy as np

# Folders holding the per-file scores of the manual evaluation and of each LLM judge
manual_path = "csvFiles/scores_manual"
gemini_path = "csvFiles/scores_gemini"
prometheus_path = "csvFiles/scores_prometheus"

# NOTE(review): these hard-coded lists rebind `models` and `prompts`, replacing
# the values loaded from config.json in the config cell — confirm this is intended.
models = ["cerbero", "gemma", "llama3"]
prompts = ["base", "detailed", "few_shot", "role-based", "teacher_student"]

def compute_kappa_scores(manual_eval_path, llm_eval_path):
    """Compute Cohen's kappa between manual and LLM scores for each file.

    Files are paired by name; a name is expected to look like
    ``scored_<model>_<prompt>.csv``. Rows are paired through the ``Sentence``
    column (first match in the LLM file) and kappa is computed on the paired
    ``Score`` values.

    Args:
        manual_eval_path: Folder with the manually scored CSV files.
        llm_eval_path: Folder with the CSV files scored by the LLM judge.

    Returns:
        A dictionary ``{model: {prompt: kappa}}`` with kappa rounded to four
        decimals. Every model of the module-level ``models`` list has an
        entry (possibly empty); models found only in file names are added.
        Files with fewer than two paired sentences are left out.
    """
    kappa_scores = {model: {} for model in models}

    manual_files = [f for f in os.listdir(manual_eval_path) if f.endswith(".csv")]
    # A set makes the per-file membership test independent of the folder size.
    llm_files = {f for f in os.listdir(llm_eval_path) if f.endswith(".csv")}

    # Sorted so the result (and anything written from it) has a stable order.
    for file in sorted(manual_files):
        if file not in llm_files:
            continue

        # The prompt part may contain underscores, so only the first
        # underscore separates model from prompt.
        clean_name = file.replace("scored_", "").replace(".csv", "")
        model, separator, prompt = clean_name.partition("_")
        if not separator:
            print(f"Skipping {file}: expected a name like scored_<model>_<prompt>.csv")
            continue

        manual_df = pd.read_csv(os.path.join(manual_eval_path, file))
        llm_df = pd.read_csv(os.path.join(llm_eval_path, file))

        # Paired scores, kept in the order of the manual file.
        manual_scores = []
        llm_scores = []

        for _, row in manual_df.iterrows():
            # Search the sentence in the LLM evaluation.
            match = llm_df[llm_df["Sentence"] == row["Sentence"]]
            if match.empty:
                continue
            manual_scores.append(row["Score"])
            llm_scores.append(match["Score"].values[0])

        if len(manual_scores) >= 2:
            kappa = cohen_kappa_score(manual_scores, llm_scores)
            # setdefault: a file for a model missing from `models` must not
            # abort the whole computation with a KeyError.
            kappa_scores.setdefault(model, {})[prompt] = round(kappa, 4)

    return kappa_scores

# Compute the kappa coefficient of the manual scores against both judges
kappa_gemini = compute_kappa_scores(manual_path, gemini_path)
kappa_prometheus = compute_kappa_scores(manual_path, prometheus_path)


# Save the kappa scores as JSON Lines
def save_kappa_jsonl(kappa_dict, output_path):
    """Write a ``{model: {prompt: kappa}}`` mapping to a JSON Lines file.

    Each line is a JSON object with the keys ``model``, ``prompt`` and
    ``kappa``. An existing file at ``output_path`` is overwritten.
    """
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps({"model": model, "prompt": prompt, "kappa": kappa}) + "\n"
            for model, per_prompt in kappa_dict.items()
            for prompt, kappa in per_prompt.items()
        )

# One JSON Lines file per judge; the relative names resolve against the current working directory
save_kappa_jsonl(kappa_gemini, "kappa_vs_gemini.jsonl")
save_kappa_jsonl(kappa_prometheus, "kappa_vs_prometheus.jsonl")



## Variance

In [None]:
import pandas as pd
import numpy as np
import json

# CSV files with the aggregated metrics of each judge.
# NOTE(review): `gemini_path` and `prometheus_path` pointed to score folders in the kappa cell; here they are rebound to files.
gemini_path = "csvFiles/final_scores_metrics_gemini.csv"
prometheus_path = "csvFiles/final_scores_metrics_prometheus.csv"
# Output file (JSON Lines, one record per judge)
output_variance_path = "output_folder/variance_by_group.jsonl"


def compute_group_variance(csv_path, source_label):
    """Measure how much the average score varies across models and prompts.

    The CSV at ``csv_path`` is grouped once by ``model`` and once by
    ``prompt``; in both cases the ``mean_score`` column is averaged per group
    and the variance of those group averages is taken with ``np.var``.

    Returns:
        A dictionary with ``source`` (the given label) plus ``var_model`` and
        ``var_prompt``, both rounded to six decimals.
    """
    scores = pd.read_csv(csv_path)

    def _variance_of_group_means(column):
        # Average mean_score inside each group, then measure the spread of
        # those averages.
        group_means = scores.groupby(column)["mean_score"].mean()
        return round(float(np.var(group_means)), 6)

    var_model = _variance_of_group_means("model")
    var_prompt = _variance_of_group_means("prompt")

    return {
        "source": source_label,
        "var_model": var_model,
        "var_prompt": var_prompt,
    }

import os

# Compute the group variances for both evaluators
gemini_result = compute_group_variance(gemini_path, "Gemini")
prometheus_result = compute_group_variance(prometheus_path, "Prometheus")

# open(..., "w") does not create missing folders, so make sure the
# destination exists before writing.
output_dir = os.path.dirname(output_variance_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write one JSON record per evaluator (JSON Lines)
with open(output_variance_path, "w", encoding="utf-8") as f:
    for result in (gemini_result, prometheus_result):
        f.write(json.dumps(result) + "\n")

print(f"Results saved at: {output_variance_path}")
