### run evaluations on checklists of week 4 and week 5 versions

In [1]:
from analyze import *
from modules.llm_eval.consistency_eval import *

In [2]:
report_output_path_html = '../../report/evaluation_report.html'
report_output_path_pdf = '../../report/evaluation_report.pdf'

In [3]:
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
repo = Repository('../../data/raw/openja/lightfm')
prompt_format = EvaluationPromptFormat()

checklist_v1 = Checklist('archive/checklist_20240527.csv', checklist_format=ChecklistFormat.CSV)
checklist_v2 = Checklist('../../checklist/checklist.csv', checklist_format=ChecklistFormat.CSV)

evaluator_v1 = PerFileTestEvaluator(llm, prompt_format=prompt_format, repository=repo, checklist=checklist_v1)
evaluator_v2 = PerFileTestEvaluator(llm, prompt_format=prompt_format, repository=repo, checklist=checklist_v2)

In [4]:
eval_evaluator = ConsistencyEvaluator()

RUNS = 20 # TODO: recommended 20 ~ 30

eval_evaluator.evaluate(models=[
    {'name': '1. original', 'model': evaluator_v1},
    {'name': '2. after rewording Requirement', 'model': evaluator_v2},
], num_test_runs=RUNS) 

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [01:38<00:00, 14.11s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [01:35<00:00, 13.61s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [02:29<00:00, 21.39s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [04:05<00:00, 35.03s/it]
100%|███████████████████████████████████████

In [None]:
eval_evaluator.get_completeness_score_dist()

### compare consistency 

In [6]:
completeness_score_var = pd.DataFrame(eval_evaluator.get_completeness_score_dist().var(axis=1), columns=['sample variance'])
completeness_score_var

Unnamed: 0_level_0,sample variance
model_name,Unnamed: 1_level_1
1. original,0.014232
2. after rewording Requirement,0.009278


In [11]:
F = completeness_score_var['sample variance'].iloc[0] / completeness_score_var['sample variance'].iloc[1]
F

1.5340086830680175

In [12]:
import scipy
tail = 2
alpha = 0.05 #Or whatever you want your alpha to be.
p_value = 1 - scipy.stats.f.cdf(F, RUNS-1, RUNS-1) 

print(f"p-value: {p_value}")
print()

print(f"{tail}-tail test:")
if p_value < alpha / 2:
    print("  Reject the null hypothesis: Var(Completeness_Score(Current Version)) == Var(Completeness_Score(Last Week Version))")
else:
    print("  Failed to reject the null hypothesis: Var(Completeness_Score(Current Version)) == Var(Completeness_Score(Last Week Version))")

p-value: 0.17958707800932472

2-tail test:
  Failed to reject the null hypothesis: Var(Completeness_Score(Current Version)) == Var(Completeness_Score(Last Week Version))


### compare accuracy

In [9]:
# Human evaluated answer
human_answer = pd.DataFrame([
    {'ID': '2.1', "human_answer": 1.0}, 
    {'ID': '3.2', "human_answer": 0.5},
    {'ID': '3.5', "human_answer": 0.0},
    {'ID': '4.2', "human_answer": 1.0},
    {'ID': '5.3', "human_answer": 0.5},
    {'ID': '6.1', "human_answer": 1.0},
    {'ID': '6.2', "human_answer": 1.0},
])
# reference: https://github.com/UBC-MDS/test-creation/blob/main/report/repo_human_evaluation/human_evaluation_report-lightfm.md

In [10]:
df = eval_evaluator.get_consistency_dist().reset_index().drop(columns=['consistency'])
df = df.merge(human_answer, how='left', on='ID')

checklist_item_title = pd.DataFrame(checklist_v2.get_all_tests())[['ID', 'Title']]
df = df.merge(checklist_item_title, how='left', on='ID')
df

Unnamed: 0,model_name,ID,1,2,3,4,5,6,7,8,...,13,14,15,16,17,18,19,20,human_answer,Title
0,1. original,2.1,0.0,0.0,0.5,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.5,1.0,1.0,0.0,0.0,1.0,Ensure Data File Loads as Expected
1,1. original,3.2,0.5,0.5,0.5,1.0,1.0,0.5,0.5,1.0,...,1.0,0.5,1.0,0.5,1.0,1.0,0.5,0.5,0.5,Data in the Expected Format
2,1. original,3.5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Check for Duplicate Records in Data
3,1. original,4.2,0.5,1.0,1.0,0.5,0.5,1.0,1.0,0.5,...,0.5,0.5,0.5,1.0,0.5,0.5,0.5,1.0,1.0,Verify Data Split Proportion
4,1. original,5.3,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,Ensure Model Output Shape Aligns with Expectation
5,1. original,6.1,0.5,0.5,0.5,1.0,1.0,0.5,1.0,1.0,...,1.0,1.0,1.0,0.5,1.0,1.0,0.5,0.5,1.0,Verify Evaluation Metrics Implementation
6,1. original,6.2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.5,1.0,1.0,Evaluate Model's Performance Against Thresholds
7,2. after rewording Requirement,2.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,Ensure Data File Loads as Expected
8,2. after rewording Requirement,3.2,0.5,0.5,0.0,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,Data in the Expected Format
9,2. after rewording Requirement,3.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Check for Duplicate Records in Data


In [14]:
df[['model_name', 'ID', 'Title', 1, 2, 3, 4, 5, 'human_answer']]

Unnamed: 0,model_name,ID,Title,1,2,3,4,5,human_answer
0,1. original,2.1,Ensure Data File Loads as Expected,0.0,0.0,0.5,1.0,1.0,1.0
1,1. original,3.2,Data in the Expected Format,0.5,0.5,0.5,1.0,1.0,0.5
2,1. original,3.5,Check for Duplicate Records in Data,0.0,0.0,0.0,0.0,1.0,0.0
3,1. original,4.2,Verify Data Split Proportion,0.5,1.0,1.0,0.5,0.5,1.0
4,1. original,5.3,Ensure Model Output Shape Aligns with Expectation,0.0,0.0,0.0,0.0,0.0,0.5
5,1. original,6.1,Verify Evaluation Metrics Implementation,0.5,0.5,0.5,1.0,1.0,1.0
6,1. original,6.2,Evaluate Model's Performance Against Thresholds,1.0,1.0,1.0,1.0,1.0,1.0
7,2. after rewording Requirement,2.1,Ensure Data File Loads as Expected,1.0,1.0,1.0,1.0,1.0,1.0
8,2. after rewording Requirement,3.2,Data in the Expected Format,0.5,0.5,0.0,0.5,0.5,0.5
9,2. after rewording Requirement,3.5,Check for Duplicate Records in Data,1.0,0.0,0.0,0.0,0.0,0.0


### reference
https://www.itl.nist.gov/div898/handbook/eda/section3/eda359.html  
https://www.statisticshowto.com/probability-and-statistics/hypothesis-testing/f-test/#:~:text=F%20Test%20to%20Compare%20Two%20Variances,-A%20Statistical%20F&text=If%20the%20variances%20are%20equal,when%20running%20an%20F%20Test  
https://stackoverflow.com/questions/21494141/how-do-i-do-a-f-test-in-python