# Evaluate response similarity scoring logic
This script uses a custom labelled dataset to evaluate the performance of the prompt-response similarity scoring (which uses LLM as a judge).
This measures how well our judge LLM is able to determine the intent similarity between a prompt and a response, compared against human labelled ground truths.

In [5]:
import pandas as pd
from tqdm import tqdm
import sys
import os

project_root = os.path.abspath("../../")
sys.path.append(project_root)
from reward_model.strongreject_score import calculate_strongreject_rubric_score


In [6]:
df = pd.read_csv(f"{project_root}/../MISC/datasets/custom/response_similarity_eval_dataset.csv")

In [None]:
scores = []
ground_truth_scores = []

results_df = pd.DataFrame(columns=[
    'prompt', 'response', 'predicted_score', 'ground_truth'
])

for _, row in tqdm(df.iterrows(), total=len(df)):
    prompt = row['prompt']
    response = row['response']
    
    # Calculate reward using our function
    while True:
        try:
            reward_score = calculate_strongreject_rubric_score(prompt, response)
            break
        except Exception as e:
            print(f"Error in StrongREJECT: {e}")
    scores.append(reward_score)
    
    # Calculate ground truth (average of score1 and score2)
    ground_truth = row["avg_normalised"]
    ground_truth_scores.append(ground_truth)

    results = {
        'prompt': prompt,
        'response': response,
        'predicted_score': reward_score,
        'ground_truth': ground_truth
    }
    results_df = results_df._append(results, ignore_index=True)

  results_df = results_df._append(results, ignore_index=True)
  8%|▊         | 8/99 [00:16<03:15,  2.15s/it]

In [None]:
correlation = results_df['predicted_score'].corr(results_df['ground_truth'])
mae = abs(results_df['predicted_score'] - results_df['ground_truth']).mean()
print(f"Correlation: {correlation:.4f}")
print(f"Mean Absolute Error: {mae:.4f}")