diff --git a/environment.yml b/environment.yml index 9eb6e96..cf6ab6f 100644 --- a/environment.yml +++ b/environment.yml @@ -16,6 +16,7 @@ dependencies: - ruamel.yaml=0.18.6 - tectonic=0.15.0 - tqdm=4.66.4 + - scipy=1.13 - pip: - distro==1.9.0 - h11==0.14.0 diff --git a/src/test_creation/archive/README.md b/src/test_creation/archive/README.md new file mode 100644 index 0000000..1f5502c --- /dev/null +++ b/src/test_creation/archive/README.md @@ -0,0 +1,7 @@ +## NOTE + +This `archive/` is for the F-score comparison between the code in week 3 (before refactoring, i.e. old code base, by 2024-05-17) vs. week 4 (after refactoring by 2024-05-24). We have to keep the old code base (archive/analyze.py) and adjust the ConsistencyEvaluator (archive/llm_eval/consistency_eval.py) so that it also works for the old code. + +We want to keep a record of the above comparison in case someone might ask for it. + +We may delete this folder in the future when we are having a comparison between newer versions. For now, we put those related to the demo and the old code base under `archive/` in order not to disturb the latest code base. diff --git a/src/test_creation/archive/analyze.py b/src/test_creation/archive/analyze.py new file mode 100644 index 0000000..2bebd49 --- /dev/null +++ b/src/test_creation/archive/analyze.py @@ -0,0 +1,273 @@ +import json +import pprint +from collections import defaultdict + +import fire +import pandas as pd +from dotenv import load_dotenv +from tqdm import tqdm +from langchain_community.document_loaders import DirectoryLoader, PythonLoader +from langchain_text_splitters import RecursiveCharacterTextSplitter +from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder +from langchain_openai import ChatOpenAI +from langchain.chains.combine_documents import create_stuff_documents_chain +from langchain.memory import ChatMessageHistory +from langchain_core.messages import AIMessage, HumanMessage + +from modules.checklist.checklist import Checklist, ChecklistFormat +from modules.code_analyzer.repo import Repository + +load_dotenv() + + +class TestEvaluator: + def __init__(self, repo_path=None): + self.repo = None + self.test_fps = [] # test file paths + self.test_dir_path = '' # test dir path # FIXME: required by `load_test_dir` + self.py_splits = [] + + # FIXME: Tony's "Checklist - After Engineering" version + self.checklist = """ + Each test function should have a clear, descriptive name that accurately reflects the test's purpose and the specific functionality or scenario it examines. + Each test should focus on a single scenario, using only one set of mock data and testing one specific behavior or outcome to ensure clarity and isolate issues. + Assertions within tests should be focused and narrow. Ensure you are only testing relevant behaviors of complex objects and not including unrelated assertions. + Keep any modifications to objects and the corresponding assertions close together in your tests to maintain readability and clearly show the cause-and-effect relationship. + Ensure that data-loading functions correctly load files when they exist and match the expected format, handle non-existent files appropriately, and return the expected results. + Verify that functions for saving data and figures perform write operations correctly, checking that the operation succeeds and the content matches the expected format. + Ensure all data files are non-empty and contain the necessary data required for further analysis or processing tasks. 
+ Verify that the data to be ingested matches the format expected by processing algorithms (like pd.DataFrame for CSVs or np.array for images) and adheres to the expected schema. + Check that data files are free from unexpected null values and identify any outliers that could affect the analysis. Tests should explicitly state if null values are part of expected data. + Test that a fixed input to a function or model produces the expected output, focusing on one verification per test to ensure predictable behavior. + Confirm that the model accepts inputs of the correct shapes and types and produces outputs that meet the expected shapes and types without any errors. + For parametric models, ensure that the model's weights update correctly per training iteration. For non-parametric models, verify that the data fits correctly into the model. + Ensure the shape of the model's output aligns with the expected structure based on the task, such as matching the number of labels in a classification task. + Verify that the model's output values are appropriate for its task, such as outputting probabilities that sum to 1 for classification tasks. + If using gradient descent for training, verify that a single gradient step on a batch of data results in a decrease in the model's training loss. + Confirm that there is no leakage of data between training, validation, and testing sets, or across cross-validation folds, to ensure the integrity of the splits. + """ + self.system_message = [] + self.model = 'gpt-3.5-turbo' + self.temperature = 0 + self.chain = None + + # self.evaluation_message = """ + # Your task is to answer each question in the checklist using only the provided test functions. + # If an answer to the question is provided, it must be annotated with a citation of the test function(s) in the Observation session. + # Then, decide the completion score in a fraction format based on your answers. The denominator should be the number of checklist items. + # Desired format: + # Checklist Evaluation: + # ID: + # Title: + # Requirement: + # Observation: + # Evaluation: Satisfied/Partially Satisfied/Not Satisfied + # Score: (1 for Satisfied / 0.5 for Partially Satisfied / 0 for Not Satisfied) + # Completion Score: Number of satisfied requirements/Number of requirements + # Number of satisfied requirements: + # Number of partially satisfied requirements: + # Number of not satisfied requirements: + # """ + self.evaluation_message = """ + Your task is to answer each question in the checklist using only the provided test functions. + If an answer to the question is provided, it must be annotated with a citation of the test function(s) in the Observation session. + Output a JSON format: + [{ + "ID": + "Title": + "Requirement": + "Observation": + "Functions": [ ... 
] + "Evaluation": Satisfied/Partially Satisfied/Not Satisfied + "Score": (1 for Satisfied / 0.5 for Partially Satisfied / 0 for Not Satisfied) + }] + """ + + self.evaluation_result = None + + if repo_path is not None: + self.load_repo(repo_path) + + def load_repo(self, repo_path): + self.repo = Repository(repo_path) + self.test_fps = self.repo.list_test_files()['Python'] + + def load_test_file(self, file_path, overwrite=True): + loader = PythonLoader(file_path) + py = loader.load() + py_splits = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0).split_documents(py) + + if overwrite: + self.py_splits = py_splits + + return py_splits + + # def load_all_test_files(self): + # self.py_splits = [] + # for fp in self.test_fps: + # self.py_splits += self.load_test_file(fp, overwrite=False) + + def load_test_dir(self, dir_path): + self.test_dir_path = dir_path + + loader = DirectoryLoader( + dir_path, + glob="**/*.py", + show_progress=True, + loader_cls=PythonLoader + ) + docs = loader.load() + + text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0) + self.py_splits = text_splitter.split_documents(docs) + + def load_checklist(self, checklist_path): + raw_checklist = Checklist(checklist_path, checklist_format=ChecklistFormat.CSV) + + checklist = [] + for item in raw_checklist.get_all_tests(): + checklist.append({ + 'ID': item['ID'], + 'Title': item['Title'], + 'Requirement': item['Requirement'] + }) + + self.checklist = json.dumps(checklist).replace('{', '[').replace('}', ']') + + def init_system_message(self): + if len(self.checklist) == 0: + # self.load_checklist() + raise ValueError("Checklist is empty, make sure you have configured the checklist loader right!") + + self.system_message = [ + ("system", + "You are a senior machine learning engineer who specializes in performing Machine Learning system testing. 
Extract and analyze the test functions from the codes:\n\n{context}"), + ("system", + f"Here is the Machine Learning system testing checklist delimited by triple quotes '''{self.checklist}'''") + ] + + def init_chain(self, system_message=None, model=None): + if system_message is None: + if len(self.system_message) == 0: + self.init_system_message() + system_message = self.system_message + else: + self.system_message = system_message + + if model is None: + model = self.model + else: + self.model = model + + prompt = ChatPromptTemplate.from_messages( + system_message + [ + MessagesPlaceholder(variable_name="messages") + ] + ) + chat = ChatOpenAI(model=model, temperature=self.temperature) + + chain = create_stuff_documents_chain(chat, prompt) + self.chain = chain + return chain + + def get_ai_response(self, message, context, history=None): + if self.chain is None: + self.init_chain() + + if history is None: + history = ChatMessageHistory() + + history.add_user_message(message) + + response = self.chain.invoke({ + "context": context, + "messages": history.messages + }) + history.add_ai_message(response) + + return response, history + + def get_evaluation_response(self, py_splits=None): + if py_splits is None: + py_splits = self.py_splits + + return self.get_ai_response( + message=self.evaluation_message, + context=py_splits + ) + + # FIXME: combine evaluation + # to be tested + def extract_json(self, response, start='[', end=']'): + start_idx = response.index(start) + end_idx = response[::-1].index(end) + if end_idx == 0: + string = response[start_idx:] + else: + string = response[start_idx:-end_idx] + return json.loads(string) + + def evaluate(self, on_file=True, verbose=False): + result = [] + if on_file: + for fp in tqdm(self.test_fps): + if verbose: + print(fp) + self.load_test_file(fp) + if verbose: + print(f"# splits: {len(self.test_fps)}") + response, history = self.get_evaluation_response() # FIXME: it sometimes tests only part of the checklist items + report = self.extract_json(response) + for item in report: + item['file'] = fp + result += [{ + 'file': fp, + 'report': report, + 'history': history + }] + else: + self.load_test_dir(self.test_dir_path) + response, history = self.get_evaluation_response() + report = self.extract_json(response) + for item in report: + item['file'] = self.test_dir_path + result += [{ + 'file': self.test_dir_path, + 'report': report, + 'history': history + }] + + self.evaluation_result = result + return + + def get_completeness_score(self, score_format='fraction', verbose=False): + report_df = pd.DataFrame(self.evaluation_result)['report'].explode('report').apply(pd.Series) + report_df = report_df.groupby(['ID', 'Title']).agg({ + 'Score': ['max', 'count'], + 'Functions': ['sum'] + }) + report_df.columns = ['is_Satisfied', 'n_files_tested', 'functions'] + self.evaluation_report = report_df + + if score_format == 'fraction': + score = f"{report_df['is_Satisfied'].sum()}/{report_df['is_Satisfied'].count()}" + elif score_format == 'number': + score = report_df['is_Satisfied'].sum()/report_df['is_Satisfied'].count() + + if verbose: + print("Report:") + print(report_df) + print() + print(f'Score: {score}') + print() + return score + + +if __name__ == '__main__': + def main(checklist_path, repo_path): + test = TestEvaluator(repo_path) + test.load_checklist(checklist_path) + test.evaluate() + test.get_completeness_score() + + fire.Fire(main) diff --git a/src/test_creation/archive/checklist_sys.csv/overview.csv 
b/src/test_creation/archive/checklist_sys.csv/overview.csv new file mode 100644 index 0000000..5ba2a5f --- /dev/null +++ b/src/test_creation/archive/checklist_sys.csv/overview.csv @@ -0,0 +1,2 @@ +Title,Description +Checklist for Tests in Machine Learning Projects,This is a comprehensive checklist for evaluating the data and ML pipeline based on identified testing strategies from experts in the field. diff --git a/src/test_creation/archive/checklist_sys.csv/tests.csv b/src/test_creation/archive/checklist_sys.csv/tests.csv new file mode 100644 index 0000000..419b3cf --- /dev/null +++ b/src/test_creation/archive/checklist_sys.csv/tests.csv @@ -0,0 +1,9 @@ +ID,Topic,Title,Requirement,Explanation,References +2.1,Data Presence,Test Data Fetching and File Reading,"Verify that the data fetching API or data file reading functionality works correctly. Ensure that proper error handling is in place for scenarios such as missing files, incorrect file formats, and network errors.","Ensure that the code responsible for fetching or reading data can handle errors. This means if the file is missing, the format is wrong, or there's a network issue, the system should not crash but should provide a clear error message indicating the problem.",(general knowledge) +3.1,Data Quality,Validate Data Shape and Values,"Check that the data has the expected shape and that all values meet domain-specific constraints, such as non-negative distances.","Check that the data being used has the correct structure (like having the right number of columns) and that the values within the data make sense (e.g., distances should not be negative). This ensures that the data is valid and reliable for model training.","alexander2024Evaluating, ISO/IEC5259" +3.2,Data Quality,Check for Duplicate Records in Data,Check for duplicate records in the dataset and ensure that there are none.,"Ensure that the dataset does not contain duplicate entries, as these can skew the results and reduce the model's performance. The test should identify any repeated records so they can be removed or investigated.",ISO/IEC5259 +4.1,Data Ingestion,Verify Data Split Proportion,Check that the data is split into training and testing sets in the expected proportion.,"Confirm that the data is divided correctly into training and testing sets according to the intended ratio. This is crucial for ensuring that the model is trained and evaluated properly, with representative samples in each set.","openja2023studying, DBLP:conf/recsys/Kula15, singh2020mmf" +5.1,Model Fitting,Test Model Output Shape,Validate that the model's output has the expected shape.,"Ensure that the output from the model has the correct dimensions and structure. For example, in a classification task, if the model should output probabilities for each class, the test should verify that the output is an array with the correct dimensions. Ensuring the correct output shape helps prevent runtime errors and ensures consistency in how data is handled downstream.","openja2023studying, DBLP:conf/recsys/Kula15, singh2020mmf" +6.1,Model Evaluation,Verify Evaluation Metrics Implementation,Verify that the evaluation metrics are correctly implemented and appropriate for the model's task.,Confirm that the metrics used to evaluate the model are implemented correctly and are suitable for the specific task at hand. 
This helps in accurately assessing the model's performance and understanding its strengths and weaknesses.,"openja2023studying, DBLP:conf/recsys/Kula15, singh2020mmf" +6.2,Model Evaluation,Evaluate Model's Performance Against Thresholds,"Compute evaluation metrics for both the training and testing datasets and ensure that these metrics exceed predefined threshold values, indicating acceptable model performance.","This ensures that the model's performance meets or exceeds certain benchmarks. By setting thresholds for metrics like accuracy or precision, you can automatically flag models that underperform or overfit. This is crucial for maintaining a baseline quality of results and for ensuring that the model meets the requirements necessary for deployment.","openja2023studying, DBLP:conf/recsys/Kula15, singh2020mmf" +8.1,Data Quality (Optional),Validate Outliers Detection and Handling,Detect outliers in the dataset. Ensure that the outlier detection mechanism is sensitive enough to flag true outliers while ignoring minor anomalies.,The detection method should be precise enough to catch significant anomalies without being misled by minor variations. This is important for maintaining data quality and ensuring the model's reliability in certain projects.,ISO/IEC5259 diff --git a/src/test_creation/archive/checklist_sys.csv/topics.csv b/src/test_creation/archive/checklist_sys.csv/topics.csv new file mode 100644 index 0000000..c35d79a --- /dev/null +++ b/src/test_creation/archive/checklist_sys.csv/topics.csv @@ -0,0 +1,9 @@ +ID,Topic,Description +1,General,The following items describe best practices for all tests to be written. +2,Data Presence,"The following items describe tests that need to be done for testing the presence of data. This area of tests mainly concerns whether the reading and saving operations are behaving as expected, and ensures that any unexpected behavior is not passed silently." +3,Data Quality,"The following items describe tests that need to be done for testing the quality of data. This area of tests mainly concerns whether the data supplied is in the expected format and whether null values or outliers in the data are handled, to make sure that the data processing pipeline is robust." +4,Data Ingestion,The following items describe tests that need to be done for testing if the data is ingested properly. +5,Model Fitting,The following items describe tests that need to be done for testing the model fitting process. The unit tests written for this section usually mock model load and model predictions similarly to mocking file access. +6,Model Evaluation,The following items describe tests that need to be done for testing the model evaluation process. +7,Artifact Testing,"The following items involve explicit checks for behaviors that we expect the artifacts (e.g. models, plots, etc.) to follow." +8,Data Quality (Optional),"The following items describe tests that need to be done for testing the quality of data, but they may not be applicable to all projects."
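To make the checklist rows above concrete, here is a minimal, hypothetical pytest sketch of the kind of test item 4.1 (Verify Data Split Proportion) is asking for; the `split_data` helper and the 80/20 ratio are illustrative assumptions, not part of the archived code base:

```python
# Hypothetical illustration of checklist item 4.1; `split_data` and the 80/20
# ratio are assumed for this sketch and do not exist in the archived code base.
import numpy as np
import pandas as pd


def split_data(df: pd.DataFrame, test_size: float = 0.2):
    """Assumed helper: shuffle a DataFrame and split it into train/test sets."""
    n_test = int(len(df) * test_size)
    shuffled = df.sample(frac=1, random_state=0)
    return shuffled.iloc[n_test:], shuffled.iloc[:n_test]


def test_data_split_proportion():
    # The split should match the intended 80/20 ratio...
    df = pd.DataFrame({"x": np.arange(100), "y": np.arange(100) % 2})
    train, test = split_data(df, test_size=0.2)
    assert len(train) == 80
    assert len(test) == 20
    # ...and no rows should be lost or duplicated by the split.
    assert len(train) + len(test) == len(df)
```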
diff --git a/src/test_creation/archive/modules/llm_eval/consistency_eval.py b/src/test_creation/archive/modules/llm_eval/consistency_eval.py new file mode 100644 index 0000000..4a3b75e --- /dev/null +++ b/src/test_creation/archive/modules/llm_eval/consistency_eval.py @@ -0,0 +1,70 @@ +import pandas as pd +from ..workflow.parse import ResponseParser + + +class ConsistencyEvaluator: + def __init__(self): + self.evaluation_reports = None + + def evaluate(self, models, num_test_runs=2, verbose=False, version_before_refactoring=False): # FIXME: version_before_refactoring is for demo purpose, to be removed + """ + Input the initialized TestEvaluator models, test run `num_test_runs` times to obtain the result + models = [{'name': 'model_no1', 'model': {{model object}}}, ...] + """ + results = [] + for item in models: + if verbose: + print(f'Model: {item['name']}') + + for test_no in range(num_test_runs): + if verbose: + print(f'Test Run No.: {test_no+1}') + + result = dict() + model = item['model'] + if version_before_refactoring: + model.evaluate() + report_df = pd.DataFrame(model.evaluation_result)['report'].explode('report').apply(pd.Series) + report_df = report_df.groupby(['ID', 'Title']).agg({ + 'Score': ['max', 'count'], + 'Functions': ['sum'] + }) + report_df.columns = ['is_Satisfied', 'n_files_tested', 'functions'] + + result['report'] = report_df + result['score'] = model.get_completeness_score(score_format='number') + result['model_name'] = item['name'] + result['test_no'] = test_no + 1 + else: + response = model.evaluate() + parser = ResponseParser(response) + result['score'] = parser.get_completeness_score(score_format='number') + result['report'] = parser.evaluation_report + result['model_name'] = item['name'] + result['test_no'] = test_no+1 + results.append(result) + self.evaluation_reports = pd.DataFrame(results) + return + + def get_completeness_score_dist(self): + """ + Obtain the distribution of the Test Completeness scores + """ + completeness_score_df = self.evaluation_reports.drop(columns='report') + completeness_score_df = completeness_score_df.pivot(index='model_name', columns='test_no', values='score') + return completeness_score_df + + def get_consistency_dist(self): + """ + Obtain the distribution of the consistency per checklist item + """ + consistency_df = pd.DataFrame() + for idx in self.evaluation_reports.index: + result = self.evaluation_reports.iloc[idx]['report'].reset_index() + result['test_no'] = self.evaluation_reports.iloc[idx]['test_no'] + result['model_name'] = self.evaluation_reports.iloc[idx]['model_name'] + consistency_df = pd.concat([consistency_df, result], axis = 0, ignore_index=True) + consistency_df = consistency_df.pivot(index=['model_name', 'ID'], columns=['test_no'], values=['is_Satisfied']) + consistency_df.columns = consistency_df.columns.droplevel(level=0) + consistency_df['consistency'] = consistency_df.eq(consistency_df.iloc[:, 0], axis=0).all(1) + return consistency_df diff --git a/src/test_creation/archive/modules/workflow/__init__.py b/src/test_creation/archive/modules/workflow/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/test_creation/archive/modules/workflow/files.py b/src/test_creation/archive/modules/workflow/files.py new file mode 100644 index 0000000..7e1bb91 --- /dev/null +++ b/src/test_creation/archive/modules/workflow/files.py @@ -0,0 +1,19 @@ +from abc import ABC, abstractmethod +from ..code_analyzer.repo import Repository + + +class RepoFileExtractor(ABC): + def __init__(self, repo: Repository): + 
self._repo = repo + + @abstractmethod + def extract(self) -> list: + pass + + +class PythonTestFileExtractor(RepoFileExtractor): + def __init__(self, repository: Repository): + super().__init__(repository) + + def extract(self) -> list: + return self._repo.list_test_files()['Python'] diff --git a/src/test_creation/archive/modules/workflow/parse.py b/src/test_creation/archive/modules/workflow/parse.py new file mode 100644 index 0000000..9b38a96 --- /dev/null +++ b/src/test_creation/archive/modules/workflow/parse.py @@ -0,0 +1,29 @@ +import pandas as pd + + +class ResponseParser: + def __init__(self, response): + self.response = response + self.evaluation_report = None + + def get_completeness_score(self, score_format: str = 'fraction', verbose: bool = False) -> str: + report_df = pd.DataFrame(self.response)['report'].explode('report').apply(pd.Series) + report_df = report_df.groupby(['ID', 'Title']).agg({ + 'Score': ['max', 'count'], + 'Functions': ['sum'] + }) + report_df.columns = ['is_Satisfied', 'n_files_tested', 'functions'] + self.evaluation_report = report_df + + if score_format == 'fraction': + score = f"{report_df['is_Satisfied'].sum()}/{report_df['is_Satisfied'].count()}" + elif score_format == 'number': + score = report_df['is_Satisfied'].sum()/report_df['is_Satisfied'].count() + + if verbose: + print("Report:") + print(report_df) + print() + print(f'Score: {score}') + print() + return score diff --git a/src/test_creation/demo_llm-eval.ipynb b/src/test_creation/demo_llm-eval.ipynb new file mode 100644 index 0000000..afdb71b --- /dev/null +++ b/src/test_creation/demo_llm-eval.ipynb @@ -0,0 +1,904 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2b45390c-f495-42bc-b0c9-64d1189a81ce", + "metadata": {}, + "source": [ + "### current version" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d2c1ead7-9d5b-4414-80e2-07092ba180ca", + "metadata": {}, + "outputs": [], + "source": [ + "from analyze import *\n", + "from modules.llm_eval.consistency_eval import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1acc7761-b05e-44ba-9e75-a051ff5af2e4", + "metadata": {}, + "outputs": [], + "source": [ + "llm = ChatOpenAI(model=\"gpt-3.5-turbo\", temperature=0)\n", + "checklist = Checklist(\"archive/checklist_sys.csv/\", checklist_format=ChecklistFormat.CSV)\n", + "extractor = PythonTestFileExtractor(Repository(\"../../data/raw/openja/lightfm_demo\"))\n", + "\n", + "evaluator = TestEvaluator(llm, extractor, checklist)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5cef8b2c-3484-4452-94ea-cdd664afeff0", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:28<00:00, 9.46s/it]\n", + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:35<00:00, 11.99s/it]\n", + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:41<00:00, 13.91s/it]\n", + 
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:30<00:00, 10.13s/it]\n", + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:27<00:00, 9.01s/it]\n", + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:30<00:00, 10.16s/it]\n", + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:39<00:00, 13.29s/it]\n", + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:38<00:00, 12.96s/it]\n", + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:46<00:00, 15.47s/it]\n", + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:40<00:00, 13.38s/it]\n" + ] + } + ], + "source": [ + "eval_evaluator = ConsistencyEvaluator()\n", + "\n", + "eval_evaluator.evaluate(models=[{'name': 'model_no1', 'model': evaluator}], num_test_runs=10) # TODO: recommended 20 ~ 30" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0fae3f80-45cb-42ff-b807-7e09d525c945", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "test_no 1 2 3 4 5 6 7 8 9 \\\n", + "model_name \n", + "model_no1 0.1875 0.25 0.3125 0.25 0.25 0.25 0.3125 0.25 0.1875 \n", + "\n", + "test_no 10 \n", + "model_name \n", + "model_no1 0.3125 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eval_evaluator.get_completeness_score_dist()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ecf75ae9-bd88-42d2-a6b2-31710109b4c8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0021267361111111114" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eval_evaluator.get_completeness_score_dist().iloc[0].var()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "75795230-3c47-4961-885a-9bd774f49be0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "test_no 1 2 3 4 5 6 7 8 9 10 consistency\n", + "model_name ID \n", + "model_no1 2.1 0.0 0.0 0.5 0.0 0.0 0.0 0.5 0.0 0.5 0.5 False\n", + " 3.1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 True\n", + " 3.2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 True\n", + " 4.1 1.0 1.0 1.0 1.0 1.0 1.0 1.0 0.5 0.5 1.0 False\n", + " 5.1 0.0 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.0 0.5 False\n", + " 6.1 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 True\n", + " 6.2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.5 0.0 0.0 False\n", + " 8.1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 True" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eval_evaluator.get_consistency_dist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "930700eb-33ce-448a-8f33-dfe82030058a", + "metadata": {}, + "outputs": [], + "source": [ + "# import pandas as pd\n", + "# pd.DataFrame([r for r in response if r['file'] == '../../data/raw/openja/lightfm_demo/tests/test_data.py'][0]['report']).iloc[4]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "488ecd30-c7d7-4065-b07b-07c673e670b3", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "a5c024d8-bf90-48e5-8e68-e74c62f145ac", + "metadata": {}, + "source": [ + "### last week's version " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "bdb95d49-491d-4030-a49f-6b58c959824b", + "metadata": {}, + "outputs": [], + "source": [ + "from archive.analyze import TestEvaluator as TestEvaluatorPrev\n", + "from archive.modules.llm_eval.consistency_eval import ConsistencyEvaluator as ConsistencyEvaluatorPrev" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "0aabc4af-1da3-42f0-be47-966754962712", + "metadata": {}, + "outputs": [], + "source": [ + "evaluator_prev = TestEvaluatorPrev('../../data/raw/openja/lightfm_demo')\n", + "evaluator_prev.load_checklist('archive/checklist_sys.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ad0a59a9-185c-4f17-a0dd-fa2534958ecb", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:44<00:00, 14.80s/it]\n", + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:45<00:00, 15.25s/it]\n", + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:42<00:00, 14.06s/it]\n", + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:45<00:00, 15.25s/it]\n", + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:42<00:00, 14.15s/it]\n", + 
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:40<00:00, 13.58s/it]\n", + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:41<00:00, 13.89s/it]\n", + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:41<00:00, 13.80s/it]\n", + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:48<00:00, 16.33s/it]\n", + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:44<00:00, 14.97s/it]\n" + ] + } + ], + "source": [ + "eval_prev_evaluator = ConsistencyEvaluatorPrev()\n", + "\n", + "eval_prev_evaluator.evaluate(\n", + " models=[{'name': 'model_no1', 'model': evaluator_prev}], \n", + " num_test_runs=10, \n", + " version_before_refactoring=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d717ba5d-dc9d-477d-a9db-ccb993f48f09", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "test_no 1 2 3 4 5 6 7 8 9 \\\n", + "model_name \n", + "model_no1 0.6875 0.625 0.8125 0.625 0.75 0.625 0.625 0.6875 0.6875 \n", + "\n", + "test_no 10 \n", + "model_name \n", + "model_no1 0.625 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eval_prev_evaluator.get_completeness_score_dist()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "273db18c-13c4-4c86-a4c8-f42e0b0e37c5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.004166666666666667" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eval_prev_evaluator.get_completeness_score_dist().iloc[0].var()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "5a682a42-8807-48c6-9de4-0558838e3ccd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "test_no 1 2 3 4 5 6 7 8 9 10 consistency\n", + "model_name ID \n", + "model_no1 2.1 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 True\n", + " 3.1 1.0 0.5 1.0 1.0 1.0 0.5 0.5 1.0 1.0 1.0 False\n", + " 3.2 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 False\n", + " 4.1 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 True\n", + " 5.1 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 True\n", + " 6.1 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 True\n", + " 6.2 1.0 1.0 1.0 0.5 0.5 1.0 1.0 1.0 1.0 0.5 False\n", + " 8.1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 True" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eval_prev_evaluator.get_consistency_dist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07875448-9c58-4ec0-94b8-de9be8870011", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "ffd6e714-64ce-4413-a58c-9a4b06cfc546", + "metadata": {}, + "source": [ + "### F-test to examine the variance difference" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "8a44f36f-ddd2-4015-87f2-0ba683738d3c", + "metadata": {}, + "outputs": [], + "source": [ + "var_curr = eval_evaluator.get_completeness_score_dist().iloc[0].var()\n", + "var_prev = eval_prev_evaluator.get_completeness_score_dist().iloc[0].var()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "b731a2fe-5b69-4de5-a174-f591eba00231", + "metadata": {}, + "outputs": [], + "source": [ + "df_curr = eval_evaluator.get_completeness_score_dist().shape[1] - 1\n", + "df_prev = eval_prev_evaluator.get_completeness_score_dist().shape[1] - 1" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "3a6c5aab-67d2-42fb-96ce-844bfa0eaa37", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.9591836734693875" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "F = var_prev / var_curr #var_curr / var_prev if var_prev < var_curr else var_prev / var_curr\n", + "F" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "c4764069-43b6-44b8-a7ea-199ef66875d5", + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install scipy" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "76c35604-b7d4-4803-8f87-f527acd3cb72", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "p-value: 0.1654120302834744\n", + "\n", + "2-tail test:\n", + " Failed to reject the null hypothesis: Var(Completeness_Score(Current Version)) == Var(Completeness_Score(Last Week Version))\n" + ] + } + ], + "source": [ + "import scipy\n", + "tail = 2\n", + "alpha = 0.05 #Or whatever you want your alpha to be.\n", + "p_value = 1 - scipy.stats.f.cdf(F, df_prev, df_curr) \n", + "\n", + "print(f\"p-value: {p_value}\")\n", + "print()\n", + "\n", + "print(f\"{tail}-tail test:\")\n", + "if p_value < alpha / 2:\n", + " print(\" Reject the null hypothesis: Var(Completeness_Score(Current Version)) == Var(Completeness_Score(Last Week Version))\")\n", + "else:\n", + " print(\" Failed to reject the null hypothesis: Var(Completeness_Score(Current Version)) == Var(Completeness_Score(Last Week Version))\")" + ] + }, + { + "cell_type": "markdown", + "id": "d54e961f-d19e-47f5-99c4-145fea3e3b23", + "metadata": {}, + "source": [ + "### reference\n", + "https://www.itl.nist.gov/div898/handbook/eda/section3/eda359.html \n", + 
"https://www.statisticshowto.com/probability-and-statistics/hypothesis-testing/f-test/#:~:text=F%20Test%20to%20Compare%20Two%20Variances,-A%20Statistical%20F&text=If%20the%20variances%20are%20equal,when%20running%20an%20F%20Test \n", + "https://stackoverflow.com/questions/21494141/how-do-i-do-a-f-test-in-python" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1a7c9ad-1f31-4134-86c2-1b7036d8e66c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:test-creation]", + "language": "python", + "name": "conda-env-test-creation-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}