# Evaluate System Response and LLM (Baseline) Response

In [1]:
# Open the CSV from participant_scores/Human Evaluation Scores.csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import argparse
from statsmodels.stats.inter_rater import fleiss_kappa
from krippendorff import alpha as krippendorff_alpha
from cerebras.cloud.sdk import Cerebras
from logger import logger

# Route to the directory that holds the participant score sheets.
# The location can be overridden with the PARTICIPANT_SCORES_DIR environment
# variable so the notebook is not tied to a single machine; the original
# absolute path remains the default.
# NOTE: later cells read and write CSV files by relative name, so they rely on
# this chdir having been executed first.
scores_dir = os.environ.get(
    'PARTICIPANT_SCORES_DIR',
    '/home/tahlilmahfuz/RAG_Implementation/Evaluation/participant_scores',
)
os.chdir(scores_dir)
df = pd.read_csv('Human Evaluation Scores.csv')
df

Unnamed: 0,reviewer_id,review,system_response,system_response_accuracy,system_response_grammatical_correctness,system_response_Relevancy,system_response_specificity,llm_baseline_response,llm_baseline_response_accuracy,llm_baseline_response_grammatical_correctness,llm_baseline_response_relevancy,llm_baseline_response_specificity
0,200042150,Great app. Easy to navigate and understand.,Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,4.0,5.0,5.0,4.0,Thank you so much for your kind words! We're t...,5.0,5.0,5.0,5.0
1,200042150,"Wow it very good, very helpful I like this so ...",Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,5.0,4.0,4.0,5.0,Thank you so much for your kind words! 🌟 We’re...,4.0,4.0,4.0,4.0
2,200042150,Very useful to me. Very greatfull all admins a...,Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,5.0,5.0,5.0,5.0,**السلام عليكم ورحمة الله وبركاته** \n\nThank...,3.0,3.5,4.0,4.5
3,200042150,It's a very wonderful islamic app which has bo...,Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,4.5,5.0,4.5,5.0,Thank you so much for your wonderful feedback!...,5.0,5.0,4.0,4.0
4,200042150,Masha Allah Tabarakallah to whoever developed ...,Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,4.5,5.0,5.0,4.5,**Assalamu Alaykum wa Rahmatullahi wa Barakatu...,5.0,4.5,5.0,4.5
5,200042150,Very good for understanding Quran,Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,5.0,4.5,4.0,4.0,Thank you for your kind words! We're delighted...,4.5,4.5,5.0,5.0
6,200042150,"Best aap of Al Quran, it has word by word tran...",Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,4.0,4.0,4.0,4.0,**Assalamu Alaikum wa Rahmatullahi wa Barakatu...,5.0,5.0,5.0,5.0
7,200042150,Would like to have options like audio translat...,Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,4.5,5.0,5.0,5.0,Thank you for sharing your suggestion! We appr...,5.0,4.5,4.0,3.5
8,200042150,Easy to understand with word by word meaning i...,Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,4.0,4.0,4.5,4.5,Thank you for your kind words! 🌙 We're delight...,5.0,4.0,4.5,2.5
9,200042150,Wonderful app for anyone interested in learnin...,Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,5.0,5.0,5.0,5.0,Thank you so much for your heartfelt review! ...,5.0,4.5,5.0,5.0


In [3]:
# Rating columns for the system response and for the LLM baseline response.
# Both lists follow the same criterion order as `categories`.
system_cols = [
    'system_response_accuracy',
    'system_response_grammatical_correctness',
    'system_response_Relevancy',
    'system_response_specificity',
]
llm_cols = [
    'llm_baseline_response_accuracy',
    'llm_baseline_response_grammatical_correctness',
    'llm_baseline_response_relevancy',
    'llm_baseline_response_specificity',
]
categories = ['Accuracy', 'Grammatical Correctness', 'Relevancy', 'Specificity']

# One record per criterion: (label, system mean, system std, LLM mean, LLM std).
results = [
    (
        label,
        df[system_col].mean(),
        df[system_col].std(),
        df[baseline_col].mean(),
        df[baseline_col].std(),
    )
    for system_col, baseline_col, label in zip(system_cols, llm_cols, categories)
]

results_df = pd.DataFrame(
    data=results,
    columns=['Category', 'System Mean', 'System Std', 'LLM Mean', 'LLM Std'],
)

# Show the summary table as the cell output.
results_df

Unnamed: 0,Category,System Mean,System Std,LLM Mean,LLM Std
0,Accuracy,4.6,0.443342,4.655172,0.613879
1,Grammatical Correctness,4.633333,0.472217,4.517241,0.452824
2,Relevancy,4.55,0.530939,4.534483,0.461578
3,Specificity,4.55,0.479763,4.431034,0.637015


In [4]:
# Head-to-head comparison: across every criterion and every review, count how
# often the system response is rated above, below, or equal to the LLM baseline.

# Score matrices (rows = reviews, columns = criteria); the two column lists are
# in the same criterion order, so positions line up pairwise.
sys_scores = df[system_cols].to_numpy(dtype=float)
llm_scores = df[llm_cols].to_numpy(dtype=float)

# Only pairs where both ratings are present take part in the comparison.
both_rated = ~(pd.isna(sys_scores) | pd.isna(llm_scores))

total = int(both_rated.sum())
system_better = int((both_rated & (sys_scores > llm_scores)).sum())
llm_better = int((both_rated & (llm_scores > sys_scores)).sum())
equal = total - system_better - llm_better

if total > 0:
    system_pct = 100 * system_better / total
    llm_pct = 100 * llm_better / total
    equal_pct = 100 * equal / total
else:
    system_pct = llm_pct = equal_pct = 0

system_line = f"System response is better in {system_pct:.1f}% of cases."
llm_line = f"LLM response is better in {llm_pct:.1f}% of cases."

if total == 0:
    # Nothing was compared, so reporting 0% win rates would be misleading.
    print("No rating pairs with both a system score and an LLM score were found.")
else:
    # The side with more wins is listed first. Both win rates are always shown,
    # including when the win counts are tied.
    if llm_better > system_better:
        print(llm_line)
        print(system_line)
    else:
        print(system_line)
        print(llm_line)
    if system_better == llm_better:
        print("Neither response is rated higher more often than the other.")
    print(f"Equal ratings: {equal_pct:.1f}%")

System response is better in 37.9% of cases.
LLM response is better in 29.3% of cases.
Equal ratings: 32.8%


# LLM As A Judge

In [25]:
# Open the CSV from participant_scores/Only Human Evaluation Scores.csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import argparse
from statsmodels.stats.inter_rater import fleiss_kappa
from krippendorff import alpha as krippendorff_alpha
from cerebras.cloud.sdk import Cerebras
from logger import logger

# Read the human-only score sheet (relative path, resolved against the current
# working directory) and preview its first rows.
df = pd.read_csv(filepath_or_buffer='Only Human Evaluation Scores.csv')
df.head(n=5)

Unnamed: 0,reviewer_id,review,response_1,accuracy,grammatical_correctness,relevancy,specificity
0,200042150,Great app. Easy to navigate and understand.,Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,4.0,5.0,5.0,4.0
1,200042150,"Wow it very good, very helpful I like this so ...",Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,5.0,4.0,4.0,5.0
2,200042150,Very useful to me. Very greatfull all admins a...,Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,5.0,5.0,5.0,5.0
3,200042150,It's a very wonderful islamic app which has bo...,Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,4.5,5.0,4.5,5.0
4,200042150,Masha Allah Tabarakallah to whoever developed ...,Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,4.5,5.0,5.0,4.5


In [26]:
# Collect Cerebras API keys from the environment variables
# CEREBRAS_API_KEY_0 .. CEREBRAS_API_KEY_6, skipping any that are unset or empty.
api_keys = [
    key
    for key in (os.getenv(f"CEREBRAS_API_KEY_{i}") for i in range(7))
    if key
]

# Report only how many keys were found. Printing the keys themselves would
# store the secrets in the notebook's saved output.
print(f"Found {len(api_keys)} Cerebras API keys.")

if not api_keys:
    raise ValueError("No Cerebras API keys found in environment variables")

# Start with the first key; cerebras_invoke switches to another key when a
# quota error is reported.
current_api_key_index = 0
client = Cerebras(api_key=api_keys[current_api_key_index])
model = "gpt-oss-120b"

Found 6 Cerebras API keys.
[08/Sep/2025 09:52:17] INFO - HTTP Request: GET https://api.cerebras.ai/v1/tcp_warming "HTTP/1.1 200 OK"
[REDACTED: six Cerebras API keys were printed here. Treat them as compromised, rotate them, and clear this cell's output before sharing the notebook.]


In [27]:
def cerebras_invoke(prompt: str, model="gpt-oss-120b", temperature=0.4) -> str:
    """Send `prompt` as a single user message and return the stripped reply text.

    Uses the module-level `client`. When the API reports a quota error the
    function switches to another key from the module-level `api_keys` list and
    retries the same prompt:

    * ``request_quota_exceeded`` - move on to the next key (wrapping around).
    * ``token_quota_exceeded``   - remove the current key from `api_keys`.

    Args:
        prompt: Text sent as the content of the user message.
        model: Model name passed to the chat completions endpoint.
        temperature: Sampling temperature passed to the endpoint.

    Returns:
        The content of the first choice, with surrounding whitespace stripped.

    Raises:
        ValueError: If no API keys are left after removing exhausted ones.
        Exception: Any other error from the client is logged and re-raised.

    NOTE: there is no retry limit or back-off. If every key keeps returning
    ``request_quota_exceeded`` the loop keeps cycling through the keys.
    """
    global client, current_api_key_index, api_keys
    while True:
        try:
            response = client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model=model,
                temperature=temperature,
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            error_msg = str(e)
            if "request_quota_exceeded" in error_msg:
                logger.error(f"Request quota exceeded for key serial {current_api_key_index}.")
                current_api_key_index += 1
            elif "token_quota_exceeded" in error_msg:
                logger.error(f"Token quota exceeded for key serial {current_api_key_index}. Please check your API key limits.")
                # Guard the pop so a call made after the list was emptied ends
                # in the ValueError below rather than an IndexError.
                if current_api_key_index < len(api_keys):
                    api_keys.pop(current_api_key_index)
            else:
                # Not a quota problem: no key switch happens, so only the error
                # itself is logged before it propagates.
                logger.error(f"An error occurred: {error_msg}")
                raise

            if len(api_keys) == 0:
                raise ValueError("All Cerebras API keys have exceeded their quotas or are invalid.")
            # Wrap around before logging so the reported serial is always valid.
            if current_api_key_index >= len(api_keys):
                current_api_key_index = 0
            logger.info(f"Switching to API key serial {current_api_key_index}.")
            client = Cerebras(api_key=api_keys[current_api_key_index])

In [28]:
import pandas as pd

# Define the LLM judge prompt template
def build_llm_judge_prompt(review, response):
    """Build the LLM-judge prompt for one review/response pair.

    The prompt asks for a 1-5 rating on four criteria (Accuracy, Grammatical
    Correctness, Relevancy, Application Specificity), embeds the user review
    and the response to be judged, and ends with the expected answer format.
    The returned text starts and ends with a newline.
    """
    prompt_lines = [
        "",
        "You are an expert evaluator for Islamic app review responses. Please rate the following response to a user review on a scale of 1 (poor) to 5 (excellent) for each criterion below. Only provide the number for each criterion, nothing else.",
        "",
        "Criteria:",
        "1. Accuracy: Are the facts, etiquette, and cultural references correct??",
        "2. Grammatical Correctness: Is the writing fluent, grammatical, and well-structured?",
        "3. Relevancy: Does the response actually address the review content?",
        "4. Application Specificity: Does it stay relevant to the Islamic app (not a generic answer)?",
        "",
        "User Review:",
        f"{review}",
        "",
        "Response:",
        f"{response}",
        "",
        "Format your answer as:",
        "Accuracy: <1-5>",
        "Grammatical Correctness: <1-5>",
        "Relevancy: <1-5>",
        "Application Specificity: <1-5>",
        "",
    ]
    return "\n".join(prompt_lines)

# Prepare columns for LLM ratings
df['llm_accuracy'] = None
df['llm_grammatical_correctness'] = None
df['llm_relevancy'] = None
df['llm_specificity'] = None

# Criterion label (as requested in the prompt's answer format) -> target column.
label_to_column = {
    'accuracy': 'llm_accuracy',
    'grammatical correctness': 'llm_grammatical_correctness',
    'relevancy': 'llm_relevancy',
    'application specificity': 'llm_specificity',
}

# Matches lines such as "Accuracy: 5", "**Accuracy:** 5" or "Accuracy: 5/5".
# A bare int() on the text after the colon raises on the last two forms, which
# would abort the whole loop and discard the API calls made so far.
# Non-integer answers like "4.5" are deliberately not matched, so they are
# reported as unparsed rather than silently truncated.
score_pattern = re.compile(
    r"\b(accuracy|grammatical correctness|relevancy|application specificity)"
    r"[\s*_`]*:[\s*_`]*([1-5])(?![.\d])",
    re.IGNORECASE,
)

for idx, row in df.iterrows():
    prompt = build_llm_judge_prompt(row['review'], row['response_1'])
    llm_output = cerebras_invoke(prompt)

    # Parse the LLM output line by line; if a criterion appears more than once
    # the last occurrence wins.
    parsed = {}
    for line in llm_output.splitlines():
        match = score_pattern.search(line)
        if match:
            parsed[label_to_column[match.group(1).lower()]] = int(match.group(2))

    for column, score in parsed.items():
        df.at[idx, column] = score

    # Surface incomplete answers so they are not mistaken for real ratings.
    missing = sorted(set(label_to_column.values()) - set(parsed))
    if missing:
        logger.error(f"Row {idx}: could not parse {missing} from LLM output: {llm_output!r}")

[08/Sep/2025 09:53:18] INFO - HTTP Request: POST https://api.cerebras.ai/v1/chat/completions "HTTP/1.1 200 OK"
[08/Sep/2025 09:53:18] INFO - HTTP Request: POST https://api.cerebras.ai/v1/chat/completions "HTTP/1.1 200 OK"
[08/Sep/2025 09:53:19] INFO - HTTP Request: POST https://api.cerebras.ai/v1/chat/completions "HTTP/1.1 200 OK"
[08/Sep/2025 09:53:20] INFO - HTTP Request: POST https://api.cerebras.ai/v1/chat/completions "HTTP/1.1 200 OK"
[08/Sep/2025 09:53:20] INFO - HTTP Request: POST https://api.cerebras.ai/v1/chat/completions "HTTP/1.1 200 OK"
[08/Sep/2025 09:53:21] INFO - HTTP Request: POST https://api.cerebras.ai/v1/chat/completions "HTTP/1.1 200 OK"
[08/Sep/2025 09:53:22] INFO - HTTP Request: POST https://api.cerebras.ai/v1/chat/completions "HTTP/1.1 200 OK"
[08/Sep/2025 09:53:23] INFO - HTTP Request: POST https://api.cerebras.ai/v1/chat/completions "HTTP/1.1 200 OK"
[08/Sep/2025 09:53:24] INFO - HTTP Request: POST https://api.cerebras.ai/v1/chat/completions "HTTP/1.1 200 OK"
[

In [30]:
# Write the frame, including the LLM rating columns, to CSV without the index column
df.to_csv(path_or_buf='Human Evaluation Scores with LLM.csv', index=False)

In [12]:
# Read the saved sheet straight from its path with pandas (no explicit open()
# call is needed) and display it.
df = pd.read_csv(filepath_or_buffer='./Human Evaluation Scores with LLM.csv')
df

Unnamed: 0,reviewer_id,review,response_1,accuracy,grammatical_correctness,relevancy,specificity,llm_accuracy,llm_grammatical_correctness,llm_relevancy,llm_specificity
0,200042150,Great app. Easy to navigate and understand.,Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,4.0,5.0,5.0,4.0,5,4,5,3
1,200042150,"Wow it very good, very helpful I like this so ...",Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,5.0,4.0,4.0,5.0,5,5,5,5
2,200042150,Very useful to me. Very greatfull all admins a...,Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,5.0,5.0,5.0,5.0,5,5,5,5
3,200042150,It's a very wonderful islamic app which has bo...,Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,4.5,5.0,4.5,5.0,4,4,4,5
4,200042150,Masha Allah Tabarakallah to whoever developed ...,Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,4.5,5.0,5.0,4.5,5,4,5,5
5,200042150,Very good for understanding Quran,Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,5.0,4.5,4.0,4.0,5,5,5,5
6,200042150,"Best aap of Al Quran, it has word by word tran...",Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,4.0,4.0,4.0,4.0,5,5,5,5
7,200042150,Would like to have options like audio translat...,Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,4.5,5.0,5.0,5.0,4,4,5,5
8,200042150,Easy to understand with word by word meaning i...,Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,4.0,4.0,4.5,4.5,5,4,5,5
9,200042150,Wonderful app for anyone interested in learnin...,Assalamu Alaikum wa Rahmatullahi wa Barakatuh....,5.0,5.0,5.0,5.0,5,4,5,5


In [31]:
import pandas as pd
import numpy as np
from krippendorff import alpha as krippendorff_alpha
from sklearn.metrics import cohen_kappa_score

# Load the CSV with both human and LLM scores
df = pd.read_csv('Human Evaluation Scores with LLM.csv')

# Human rating columns and the LLM-judge columns, in matching order.
categories = ['accuracy', 'grammatical_correctness', 'relevancy', 'specificity']
llm_categories = ['llm_accuracy', 'llm_grammatical_correctness', 'llm_relevancy', 'llm_specificity']

results = []

for cat, llm_cat in zip(categories, llm_categories):
    # Build a matrix: rows=items, columns=raters (human, LLM)
    ratings = pd.DataFrame({
        'human': df[cat].values,
        'llm': df[llm_cat].values
    })

    # Krippendorff's alpha (ordinal); the library expects shape (raters, items).
    # Computed once per category.
    ka = krippendorff_alpha(ratings.T.values, level_of_measurement='ordinal')

    # Cohen's kappa needs discrete labels and complete pairs: drop items where
    # either score is missing (astype(int) raises on NaN), then round the
    # half-point scores to integers.
    # NOTE(review): round() presumably rounds exact halves to the nearest even
    # integer (4.5 -> 4, 3.5 -> 4) - confirm this is the intended treatment.
    paired = ratings.dropna()
    ratings_int = paired.round().astype(int)
    cohen_kappa = cohen_kappa_score(ratings_int['human'], ratings_int['llm'])

    # np.std is called without ddof, i.e. with NumPy's default.
    human_mean, human_std = np.mean(ratings['human']), np.std(ratings['human'])
    llm_mean, llm_std = np.mean(ratings['llm']), np.std(ratings['llm'])
    results.append((cat.title().replace('_', ' '), cohen_kappa, ka, human_mean, human_std, llm_mean, llm_std))

# Create a DataFrame with the results
kappa_col = "Cohen's Kappa"
results_df = pd.DataFrame(
    results,
    columns=[
        'Category', kappa_col, 'Krippendorff Alpha', 'Human Mean', 'Human Std', 'LLM Mean', 'LLM Std'
    ]
)

print("|Category|Cohen's Kappa|Krippendorff Alpha|Human Mean ± Std|LLM Mean ± Std|")
print("|---|---|---|---|---|")
for _, row in results_df.iterrows():
    # The column name is kept in a variable because a backslash-escaped quote
    # inside an f-string expression is a SyntaxError before Python 3.12.
    print(
        f"| {row['Category']} | {row[kappa_col]:.2f} | {row['Krippendorff Alpha']:.2f} "
        f"| {row['Human Mean']:.2f} ± {row['Human Std']:.2f} "
        f"| {row['LLM Mean']:.2f} ± {row['LLM Std']:.2f} |"
    )

results_df

|Category|Cohen's Kappa|Krippendorff Alpha|Human Mean ± Std|LLM Mean ± Std|
|---|---|---|---|---|
| Accuracy | 0.06 | 0.05 | 4.72 ± 0.42 | 4.82 ± 0.50 |
| Grammatical Correctness | -0.22 | -0.30 | 4.52 ± 0.52 | 4.32 ± 0.47 |
| Relevancy | -0.04 | -0.16 | 4.54 ± 0.60 | 4.92 ± 0.28 |
| Specificity | 0.00 | -0.28 | 4.53 ± 0.58 | 5.00 ± 0.00 |


Unnamed: 0,Category,Cohen's Kappa,Krippendorff Alpha,Human Mean,Human Std,LLM Mean,LLM Std
0,Accuracy,0.057143,0.053232,4.716667,0.421966,4.816667,0.499722
1,Grammatical Correctness,-0.223833,-0.30279,4.516667,0.52414,4.316667,0.465176
2,Relevancy,-0.037037,-0.159193,4.541667,0.600636,4.916667,0.276385
3,Specificity,0.0,-0.276305,4.525,0.580409,5.0,0.0


# Fleiss' Kappa and Krippendorff's Alpha (Mukit, Sadaf, Shoyeb, Tanzim)