In [1]:
from dotenv import load_dotenv

from langchain_community.document_loaders import DirectoryLoader, PythonLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.chains.combine_documents import create_stuff_documents_chain

from langchain.memory import ChatMessageHistory
from langchain_core.messages import AIMessage, HumanMessage

load_dotenv()

from modules.code_analyzer.repo import Repository
from collections import defaultdict
from tqdm import tqdm

from modules.checklist.checklist import Checklist, ChecklistFormat
import json
import pandas as pd

In [2]:
class TestEvaluator:
    def __init__(self, repo_path=None):
        self.repo = None
        self.test_fps = [] # test file paths
        self.test_dir_path = '' # test dir path # FIXME: required by `load_test_dir`
        self.py_splits = []

        # FIXME: Tony's "Checklist - After Engineering" version
        self.checklist = """
            Each test function should have a clear, descriptive name that accurately reflects the test's purpose and the specific functionality or scenario it examines.
            Each test should focus on a single scenario, using only one set of mock data and testing one specific behavior or outcome to ensure clarity and isolate issues.
            Assertions within tests should be focused and narrow. Ensure you are only testing relevant behaviors of complex objects and not including unrelated assertions.
            Keep any modifications to objects and the corresponding assertions close together in your tests to maintain readability and clearly show the cause-and-effect relationship.
            Ensure that data-loading functions correctly load files when they exist and match the expected format, handle non-existent files appropriately, and return the expected results.
            Verify that functions for saving data and figures perform write operations correctly, checking that the operation succeeds and the content matches the expected format.
            Ensure all data files are non-empty and contain the necessary data required for further analysis or processing tasks.
            Verify that the data to be ingested matches the format expected by processing algorithms (like pd.DataFrame for CSVs or np.array for images) and adheres to the expected schema.
            Check that data files are free from unexpected null values and identify any outliers that could affect the analysis. Tests should explicitly state if null values are part of expected data.
            Test that a fixed input to a function or model produces the expected output, focusing on one verification per test to ensure predictable behavior.
            Confirm that the model accepts inputs of the correct shapes and types and produces outputs that meet the expected shapes and types without any errors.
            For parametric models, ensure that the model's weights update correctly per training iteration. For non-parametric models, verify that the data fits correctly into the model.
            Ensure the shape of the model's output aligns with the expected structure based on the task, such as matching the number of labels in a classification task.
            Verify that the model's output values are appropriate for its task, such as outputting probabilities that sum to 1 for classification tasks.
            If using gradient descent for training, verify that a single gradient step on a batch of data results in a decrease in the model's training loss.
            Confirm that there is no leakage of data between training, validation, and testing sets, or across cross-validation folds, to ensure the integrity of the splits.
        """
        self.system_message = []
        self.model = 'gpt-3.5-turbo'
        self.temperature = 0
        self.chain = None

        # self.evaluation_message = """
        #     Your task is to answer each question in the checklist using only the provided test functions.
        #     If an answer to the question is provided, it must be annotated with a citation of the test function(s) in the Observation session.
        #     Then, decide the completion score in a fraction format based on your answers. The denominator should be the number of checklist items.
        #     Desired format:
        #         Checklist Evaluation:
        #             ID: 
        #             Title:
        #             Requirement:
        #             Observation:
        #             Evaluation: Satisfied/Partially Satisfied/Not Satisfied
        #             Score: (1 for Satisfied / 0.5 for Partially Satisfied / 0 for Not Satisfied)
        #         Completion Score: Number of satisfied requirements/Number of requirements
        #             Number of satisfied requirements:
        #             Number of partially satisfied requirements:
        #             Number of not satisfied requirements:
        # """
        self.evaluation_message = """
            Your task is to answer each question in the checklist using only the provided test functions.
            If an answer to the question is provided, it must be annotated with a citation of the test function(s) in the Observation session.
            Output a JSON format:
                {
                    "ID": 
                    "Title":
                    "Requirement":
                    "Observation":
                    "Functions": [ ... ]
                    "Evaluation": Satisfied/Partially Satisfied/Not Satisfied
                    "Score": (1 for Satisfied / 0.5 for Partially Satisfied / 0 for Not Satisfied)
                }
        """

        self.evaluation_result = None

        if repo_path is not None:
            self.load_repo(repo_path)
        
    def load_repo(self, repo_path):
        self.repo = Repository(repo_path)
        self.test_fps = self.repo.list_test_files()['Python']

    def load_test_file(self, file_path, overwrite=True):
        loader = PythonLoader(file_path)
        py = loader.load()
        py_splits = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0).split_documents(py)
        
        if overwrite:
            self.py_splits = py_splits
        
        return py_splits

    # def load_all_test_files(self):
    #     self.py_splits = []
    #     for fp in self.test_fps:
    #         self.py_splits += self.load_test_file(fp, overwrite=False)

    def load_test_dir(self, dir_path):
        self.test_dir_path = dir_path
        
        loader = DirectoryLoader(
            dir_path,
            glob="**/*.py", 
            show_progress=True, 
            loader_cls=PythonLoader
        )
        docs = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
        self.py_splits = text_splitter.split_documents(docs)

    def load_checklist(self, checklist_path):
        raw_checklist = Checklist(checklist_path, checklist_format=ChecklistFormat.YAML)
        
        checklist = []
        for item in raw_checklist.get_all_tests():
            checklist.append({
                'ID': item['ID'],
                'Title': item['Title'],
                'Requirement': item['Requirement']
            })
            
        self.checklist = json.dumps(checklist).replace('{', '[').replace('}', ']')

    def init_system_message(self):
        if len(self.checklist) == 0:
            self.load_checklist()
            
        self.system_message = [
            ("system", "You are a senior machine learning engineer who specializes in performing Machine Learning system testing. Extract and analyze the test functions from the codes:\n\n{context}"),
            ("system", f"Here is the Machine Learning system testing checklist delimited by triple quotes '''{self.checklist}'''")
        ]

    def init_chain(self, system_message=None, model=None):
        if system_message is None:
            if len(self.system_message) == 0:
                self.init_system_message()
            system_message = self.system_message
        else:
            self.system_message = system_message

        if model is None:
            model = self.model
        else:
            self.model = model
            
        prompt = ChatPromptTemplate.from_messages(
            system_message + [
                MessagesPlaceholder(variable_name="messages")
            ]
        )
        chat = ChatOpenAI(model=model, temperature=self.temperature)

        chain = create_stuff_documents_chain(chat, prompt)
        self.chain = chain
        return chain

    def get_ai_response(self, message, context, history=None):
        if self.chain is None:
            self.init_chain()

        if history is None:
            history = ChatMessageHistory()

        history.add_user_message(message)
        
        response = self.chain.invoke({
            "context": context, 
            "messages": history.messages
        })
        history.add_ai_message(response)

        return response, history

    def get_evaluation_response(self, py_splits=None):
        if py_splits is None:
            py_splits = self.py_splits
            
        return self.get_ai_response(
            message=self.evaluation_message, 
            context=py_splits
        )

    # FIXME: combine evaluation
    # to be tested
    def extract_json(self, response, start='[', end=']'):
        start_idx = response.index(start)
        end_idx = response[::-1].index(end)
        if end_idx == 0:
            string = response[start_idx:]
        else:
            string = response[start_idx:-end_idx]
        return json.loads(string)

    def evaluate(self, on_file=True):
        result = []
        if on_file:
            for fp in tqdm(self.test_fps):
                print(fp)
                self.load_test_file(fp)
                print(f"# splits: {len(self.test_fps)}")
                response, history = self.get_evaluation_response() # FIXME: it sometimes tests only part of the checklist items
                # print(response)
                report = self.extract_json(response)
                # print(report)
                for item in report:
                    item['file'] = fp
                result += [{
                    'file': fp,
                    'report': report,
                    'history': history
                }]
        else:
            load_test_dir(self.test_dir_path)
            response, history = self.get_evaluation_response()
            report = self.extract_json(response)
            for item in report:
                item['file'] = self.test_dir_path
            result += [{
                'file': self.test_dir_path,
                'report': report,
                'history': history
            }]

        self.evaluation_result = result
        return

    def get_completeness_score(self):
        report_df = pd.DataFrame(self.evaluation_result)['report'].explode('report').apply(pd.Series)
        report_df = report_df[report_df['Title'] != 'Dummy Title']
        report_df = report_df.groupby(['ID']).max('Score')
        score = f'{report_df['Score'].sum()}/{report_df['Score'].count()}'
        print(f'Score: {score}')
        return score

In [3]:
repo_path = "../../../lightfm/"
checklist_path = '../../checklist/checklist.yaml'
test = TestEvaluator(repo_path)
test.load_checklist(checklist_path)

In [23]:
#test.load_test_dir('../../data/raw/openja/lightfm/tests/')

In [4]:
test.evaluate()

  0%|                                                                                                  | 0/6 [00:00<?, ?it/s]

../../../lightfm/tests/test_fast_functions.py
# splits: 6


 17%|███████████████                                                                           | 1/6 [00:08<00:43,  8.69s/it]

../../../lightfm/tests/test_movielens.py
# splits: 6


 33%|██████████████████████████████                                                            | 2/6 [00:48<01:47, 26.85s/it]

../../../lightfm/tests/test_datasets.py
# splits: 6


 50%|█████████████████████████████████████████████                                             | 3/6 [01:17<01:23, 27.96s/it]

../../../lightfm/tests/test_cross_validation.py
# splits: 6


 67%|████████████████████████████████████████████████████████████                              | 4/6 [01:50<00:59, 29.75s/it]

../../../lightfm/tests/test_evaluation.py
# splits: 6


 83%|███████████████████████████████████████████████████████████████████████████               | 5/6 [02:23<00:30, 30.99s/it]

../../../lightfm/tests/test_data.py
# splits: 6


100%|██████████████████████████████████████████████████████████████████████████████████████████| 6/6 [02:52<00:00, 28.81s/it]


In [5]:
test.get_completeness_score()

Score: 5.5/16


'5.5/16'

In [56]:
reports = []
for res in test.evaluation_result:
    reports += res['report']

evaluationdf = pd.DataFrame(reports)
report = df.groupby(['ID', 'Title']).agg({
    'Score': ['max', 'count'],
    'Functions': ['sum'],
})

report

Unnamed: 0_level_0,Unnamed: 1_level_0,Score,Score,Functions
Unnamed: 0_level_1,Unnamed: 1_level_1,max,count,sum
ID,Title,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1.1,Write Descriptive Test Names,1,6,"[test_in_positives, test_movielens_accuracy, t..."
1.2,Keep Tests Focused,1,6,"[test_in_positives, test_movielens_accuracy, t..."
1.3,Prefer Narrow Assertions in Unit Tests,1,6,"[test_in_positives, test_movielens_accuracy, t..."
1.4,Keep Cause and Effect Clear,1,6,"[test_in_positives, test_movielens_accuracy, t..."
2.1,Ensure Data File Loads as Expected,0,5,[]
2.2,Ensure Saving Data/Figures Function Works as Expected,0,5,[]
3.1,Files Contain Data,0,5,[]
3.2,Data in the Expected Format,1,5,"[test_basic_fetching_movielens, test_basic_fet..."
3.3,Data Does Not Contain Null Values or Outliers,0,5,[]
4.1,Cleaning and Transformation Functions Work as Expected,0,5,[]


In [50]:
df.query('ID == "1.1"')['file'].str[-30:]

0     m/tests/test_fast_functions.py
4     ightfm/tests/test_movielens.py
22    lightfm/tests/test_datasets.py
40    tests/test_cross_validation.py
56    ghtfm/tests/test_evaluation.py
74    nja/lightfm/tests/test_data.py
92    enja/lightfm/tests/test_api.py
Name: file, dtype: object

In [51]:
df.query('ID == "5.5"')['file'].str[-30:]

18     ightfm/tests/test_movielens.py
36     lightfm/tests/test_datasets.py
54     tests/test_cross_validation.py
70     ghtfm/tests/test_evaluation.py
88     nja/lightfm/tests/test_data.py
106    enja/lightfm/tests/test_api.py
Name: file, dtype: object

In [56]:
test.evaluation_result[0]['report']

[{'ID': '1.1',
  'Title': 'Write Descriptive Test Names',
  'Requirement': "Each test function should have a clear, descriptive name that accurately reflects the test's purpose and the specific functionality or scenario it examines.",
  'Observation': "The test function 'test_in_positives' has a descriptive name that indicates it is testing for positive cases related to the input data.",
  'Functions': ['test_in_positives'],
  'Evaluation': 'Satisfied',
  'Score': 1,
  'file': '../../data/raw/openja/lightfm/tests/test_fast_functions.py'},
 {'ID': '1.2',
  'Title': 'Keep Tests Focused',
  'Requirement': 'Each test should focus on a single scenario, using only one set of mock data and testing one specific behavior or outcome to ensure clarity and isolate issues.',
  'Observation': "The test function 'test_in_positives' focuses on testing the behavior of the '__test_in_positives' function with a specific matrix setup for positive cases.",
  'Functions': ['test_in_positives'],
  'Evaluatio

In [60]:
report[('Score', 'max')].mean()

0.6111111111111112

In [5]:
test.load_test_file(test.test_fps[2])
#test.load_all_test_files()
len(test.py_splits)

4

In [6]:
report, history = test.get_evaluation_response(test.py_splits)

In [10]:
result = 

<function str.index>

In [13]:
test.extract_json(report)

[{'ID': '1.1',
  'Title': 'Write Descriptive Test Names',
  'Requirement': "Each test function should have a clear, descriptive name that accurately reflects the test's purpose and the specific functionality or scenario it examines.",
  'Observation': "The test functions have descriptive names such as 'test_basic_fetching_movielens' and 'test_basic_fetching_stackexchange'.",
  'Functions': ['test_basic_fetching_movielens',
   'test_basic_fetching_stackexchange'],
  'Evaluation': 'Satisfied',
  'Score': 1},
 {'ID': '1.2',
  'Title': 'Keep Tests Focused',
  'Requirement': 'Each test should focus on a single scenario, using only one set of mock data and testing one specific behavior or outcome to ensure clarity and isolate issues.',
  'Observation': 'The test functions focus on specific scenarios related to fetching data from movielens and stackexchange datasets.',
  'Functions': ['test_basic_fetching_movielens',
   'test_basic_fetching_stackexchange'],
  'Evaluation': 'Satisfied',
  'Sco

In [106]:
print(report)

Checklist Evaluation:

1. Requirement Title: Write Descriptive Test Names
   Requirement: Each test function should have a clear, descriptive name that accurately reflects the test's purpose and the specific functionality or scenario it examines.
   Observation: The test functions are named `test_basic_fetching_movielens` and `test_basic_fetching_stackexchange`, which are reasonably descriptive.
   Evaluation: Satisfied

2. Requirement Title: Keep Tests Focused
   Requirement: Each test should focus on a single scenario, using only one set of mock data and testing one specific behavior or outcome to ensure clarity and isolate issues.
   Observation: The tests cover multiple scenarios within a single function, such as different configurations for fetching data and different datasets.
   Evaluation: Not Satisfied

3. Requirement Title: Prefer Narrow Assertions in Unit Tests
   Requirement: Assertions within tests should be focused and narrow. Ensure you are only testing relevant behavior

In [107]:
test.checklist

"[[id: 0, Title: Write Descriptive Test Names, Requirement: Each test function should have a clear, descriptive name that accurately reflects the test's purpose and the specific functionality or scenario it examines.\\n], [id: 1, Title: Keep Tests Focused, Requirement: Each test should focus on a single scenario, using only one set of mock data and testing one specific behavior or outcome to ensure clarity and isolate issues.\\n], [id: 2, Title: Prefer Narrow Assertions in Unit Tests, Requirement: Assertions within tests should be focused and narrow. Ensure you are only testing relevant behaviors of complex objects and not including unrelated assertions.\\n], [id: 3, Title: Keep Cause and Effect Clear, Requirement: Keep any modifications to objects and the corresponding assertions close together in your tests to maintain readability and clearly show the cause-and-effect relationship.\\n], [id: 4, Title: Ensure Data File Loads as Expected, Requirement: Ensure that data-loading functions

In [95]:
tmp

"[\\{id: 0, Title: Write Descriptive Test Names, Requirement: Each test function should have a clear, descriptive name that accurately reflects the test's purpose and the specific functionality or scenario it examines.\\n\\}, \\{id: 1, Title: Keep Tests Focused, Requirement: Each test should focus on a single scenario, using only one set of mock data and testing one specific behavior or outcome to ensure clarity and isolate issues.\\n\\}, \\{id: 2, Title: Prefer Narrow Assertions in Unit Tests, Requirement: Assertions within tests should be focused and narrow. Ensure you are only testing relevant behaviors of complex objects and not including unrelated assertions.\\n\\}, \\{id: 3, Title: Keep Cause and Effect Clear, Requirement: Keep any modifications to objects and the corresponding assertions close together in your tests to maintain readability and clearly show the cause-and-effect relationship.\\n\\}, \\{id: 4, Title: Ensure Data File Loads as Expected, Requirement: Ensure that data