In [55]:
from dotenv import load_dotenv

from langchain_community.document_loaders import DirectoryLoader, PythonLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.chains.combine_documents import create_stuff_documents_chain

from langchain.memory import ChatMessageHistory
from langchain_core.messages import AIMessage, HumanMessage

load_dotenv()

from modules.repo import Repository
from collections import defaultdict
from tqdm import tqdm

In [72]:
class TestEvaluator:
    """Score a repository's Python tests against an ML-testing checklist using a chat model.

    Typical flow, as used by the methods below:
      1. load test sources with `load_test_file` or `load_test_dir`; the split
         chunks are stored on `self.py_splits`;
      2. `init_system_message` / `init_chain` build the prompt and the chain
         (both are invoked lazily when needed);
      3. `evaluate` sends `self.py_splits` as context together with
         `self.evaluation_message` and returns the response and chat history.
    """

    def __init__(self, repo_path=None):
        """Set defaults and, when `repo_path` is given, load the repository.

        Args:
            repo_path: optional path handed to `load_repo`. When None, no
                repository is loaded and `self.repo` stays None.
        """
        self.repo = None
        self.test_fps = [] # test file paths
        self.py_splits = []  # document chunks sent to the model as context

        self.checklist = ''
        self.system_message = []
        self.model = 'gpt-4o'
        self.temperature = 0.1
        self.chain = None

        # Earlier JSON-style evaluation prompt, kept for reference only (unused).
        # self.evaluation_message = """
        #     Evaluate whether the codes has fulfilled the requirements and deliver a completion score. Do not include a summary evaluation.
        #     Desired JSON format:
        #         {
        #             "Requirement Title":
        #             "Requirement":
        #             "Observation":
        #             "Related Functions": [ ... ]
        #             "Evaluation": (1 for Fulfilled / 0.5 for Partially fulfilled / 0 for Not fulfilled)
        #         }
        # """
        # NOTE(review): "Observation session" below probably means "Observation
        # section". Left untouched because editing it changes the prompt the
        # model receives; confirm before fixing.
        self.evaluation_message = """
            Your task is to answer each question in the checklist using only the provided test functions.
            If an answer to the question is provided, it must be annotated with a citation of the test function(s) in the Observation session.
            Then, decide the completion score in a fraction format based on your answers. The denominator should be the number of checklist items.
            Desired format:
                Checklist Evaluation:
                    Requirement Title:
                    Requirement:
                    Observation:
                    Evaluation: Satisfied/Partially Satisfied/Not Satisfied
                Completion Score: Number of satisfied requirements/Number of requirements
                    Number of satisfied requirements:
                    Number of partially satisfied requirements:
                    Number of not satisfied requirements:
        """

        if repo_path is not None:
            self.load_repo(repo_path)

    def load_repo(self, repo_path):
        """Wrap `repo_path` in a `Repository` and record its Python test file paths.

        Args:
            repo_path: path passed straight to `Repository`.
        """
        self.repo = Repository(repo_path)
        # The result of list_test_files() is indexed by 'Python'; presumably it
        # maps language name -> list of paths -- confirm against modules.repo.
        self.test_fps = self.repo.list_test_files()['Python']

    @staticmethod
    def _split_documents(docs, chunk_size, chunk_overlap):
        """Split loaded documents with a RecursiveCharacterTextSplitter."""
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        return splitter.split_documents(docs)

    @staticmethod
    def _escape_braces(text):
        """Double every curly brace so prompt templating keeps it as literal text."""
        return text.replace("{", "{{").replace("}", "}}")

    def load_test_file(self, file_path, overwrite=True, chunk_size=1000, chunk_overlap=0):
        """Load one Python file with `PythonLoader` and split it into chunks.

        Args:
            file_path: path of the Python file to load.
            overwrite: when True (default), replace `self.py_splits` with the
                chunks from this file; when False, leave `self.py_splits` as is.
            chunk_size: `chunk_size` given to the text splitter (default 1000,
                the value this method has always used).
            chunk_overlap: `chunk_overlap` given to the text splitter.

        Returns:
            The list of chunks produced for this file.
        """
        docs = PythonLoader(file_path).load()
        py_splits = self._split_documents(docs, chunk_size, chunk_overlap)

        if overwrite:
            self.py_splits = py_splits

        return py_splits

    # def load_all_test_files(self):
    #     self.py_splits = []
    #     for fp in self.test_fps:
    #         self.py_splits += self.load_test_file(fp, overwrite=False)

    def load_test_dir(self, dir_path, chunk_size=500, chunk_overlap=0):
        """Load every `*.py` file under `dir_path` and store the split chunks.

        Files are matched with the recursive glob "**/*.py" and loaded with
        `PythonLoader`; a progress bar is shown while loading. The result
        replaces `self.py_splits`.

        Args:
            dir_path: directory to scan.
            chunk_size: `chunk_size` given to the text splitter (default 500,
                the value this method has always used; note it differs from
                `load_test_file`'s default).
            chunk_overlap: `chunk_overlap` given to the text splitter.
        """
        loader = DirectoryLoader(
            dir_path,
            glob="**/*.py",
            show_progress=True,
            loader_cls=PythonLoader
        )
        docs = loader.load()

        self.py_splits = self._split_documents(docs, chunk_size, chunk_overlap)

    def load_checklist(self):
        """Populate `self.checklist` with the built-in ML testing checklist (one item per line)."""
        self.checklist = """
            Each test function should have a clear, descriptive name that accurately reflects the test's purpose and the specific functionality or scenario it examines.
            Each test should focus on a single scenario, using only one set of mock data and testing one specific behavior or outcome to ensure clarity and isolate issues.
            Assertions within tests should be focused and narrow. Ensure you are only testing relevant behaviors of complex objects and not including unrelated assertions.
            Keep any modifications to objects and the corresponding assertions close together in your tests to maintain readability and clearly show the cause-and-effect relationship.
            Ensure that data-loading functions correctly load files when they exist and match the expected format, handle non-existent files appropriately, and return the expected results.
            Verify that functions for saving data and figures perform write operations correctly, checking that the operation succeeds and the content matches the expected format.
            Ensure all data files are non-empty and contain the necessary data required for further analysis or processing tasks.
            Verify that the data to be ingested matches the format expected by processing algorithms (like pd.DataFrame for CSVs or np.array for images) and adheres to the expected schema.
            Check that data files are free from unexpected null values and identify any outliers that could affect the analysis. Tests should explicitly state if null values are part of expected data.
            Test that a fixed input to a function or model produces the expected output, focusing on one verification per test to ensure predictable behavior.
            Confirm that the model accepts inputs of the correct shapes and types and produces outputs that meet the expected shapes and types without any errors.
            For parametric models, ensure that the model's weights update correctly per training iteration. For non-parametric models, verify that the data fits correctly into the model.
            Ensure the shape of the model's output aligns with the expected structure based on the task, such as matching the number of labels in a classification task.
            Verify that the model's output values are appropriate for its task, such as outputting probabilities that sum to 1 for classification tasks.
            If using gradient descent for training, verify that a single gradient step on a batch of data results in a decrease in the model's training loss.
            Confirm that there is no leakage of data between training, validation, and testing sets, or across cross-validation folds, to ensure the integrity of the splits.
        """

    def init_system_message(self):
        """Build the two system messages (role + `{context}` slot, then the checklist).

        Loads the built-in checklist first when `self.checklist` is empty.
        The result is stored on `self.system_message` as `(role, text)` tuples.
        """
        if len(self.checklist) == 0:
            self.load_checklist()

        # These strings are later parsed as prompt templates, so braces coming
        # from the checklist must be doubled or they would be read as template
        # variables. `{context}` in the first message is an intentional variable.
        checklist = self._escape_braces(self.checklist)

        self.system_message = [
            ("system", "You are a senior machine learning engineer who specializes in performing Machine Learning system testing. Extract and analyze the test functions from the codes:\n\n{context}"),
            ("system", f"Here is the Machine Learning system testing checklist delimited by triple quotes '''{checklist}'''")
        ]

    def init_chain(self, system_message=None, model=None):
        """Create the chat chain and cache it on `self.chain`.

        Args:
            system_message: sequence of prompt messages to use. When None, the
                cached `self.system_message` is used (built on demand);
                otherwise the given value also replaces `self.system_message`.
                Custom messages are used as templates verbatim, so literal
                braces in them must already be doubled.
            model: chat model name. When None, `self.model` is used; otherwise
                the given value also replaces `self.model`.

        Returns:
            The chain built by `create_stuff_documents_chain`.
        """
        if system_message is None:
            if len(self.system_message) == 0:
                self.init_system_message()
            system_message = self.system_message
        else:
            self.system_message = system_message

        if model is None:
            model = self.model
        else:
            self.model = model

        # list(...) so that a tuple (or other sequence) of messages also works.
        prompt = ChatPromptTemplate.from_messages(
            list(system_message) + [
                MessagesPlaceholder(variable_name="messages")
            ]
        )
        chat = ChatOpenAI(model=model, temperature=self.temperature)

        chain = create_stuff_documents_chain(chat, prompt)
        self.chain = chain
        return chain

    def get_ai_response(self, message, context, history=None):
        """Send `message` to the chain with `context` and return the reply.

        Args:
            message: user message appended to the history before the call.
            context: value passed to the chain under the "context" key
                (`evaluate` passes the loaded document chunks).
            history: optional chat history object to continue; it is mutated
                in place (user message and reply are appended). A new
                `ChatMessageHistory` is created when None.

        Returns:
            `(response, history)` -- the chain's output and the updated history.
        """
        if self.chain is None:
            self.init_chain()

        if history is None:
            history = ChatMessageHistory()

        history.add_user_message(message)

        response = self.chain.invoke({
            "context": context,
            "messages": history.messages
        })
        history.add_ai_message(response)

        return response, history

    def evaluate(self):
        """Run the checklist evaluation over `self.py_splits`.

        Returns:
            `(response, history)` as returned by `get_ai_response`, using
            `self.evaluation_message` as the user message and a fresh history.
        """
        return self.get_ai_response(
            message=self.evaluation_message,
            context=self.py_splits
        )

In [73]:
# Build the evaluator for the lightfm checkout; passing a path makes __init__ call load_repo.
test = TestEvaluator(repo_path="../../data/raw/openja/lightfm/")

In [75]:
# Load and chunk every *.py file under the tests directory into test.py_splits.
test.load_test_dir(dir_path='../../data/raw/openja/lightfm/tests/')

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 2958.42it/s]


In [76]:
# Disabled alternatives to load_test_dir: load a single discovered test file, or
# all of them (load_all_test_files is itself commented out in the class).
#test.load_test_file(test.test_fps[2])
#test.load_all_test_files()
# Number of chunks currently loaded; evaluate() sends all of them as context.
len(test.py_splits)

119

In [77]:
# Run the checklist evaluation. Returns the model's response and the chat history
# (evaluation prompt + response), which get_ai_response can accept to continue the chat.
report, history = test.evaluate()

In [78]:
# print (not a bare expression) so the multi-line report is shown with real line breaks.
print(report)

Checklist Evaluation:

Requirement Title: Clear and Descriptive Test Names
Requirement: Each test function should have a clear, descriptive name that accurately reflects the test's purpose and the specific functionality or scenario it examines.
Observation: The test functions have clear and descriptive names such as `test_in_positives`, `test_movielens_accuracy`, `test_logistic_precision`, `test_bpr_precision`, etc.
Evaluation: Satisfied

Requirement Title: Single Scenario Focus
Requirement: Each test should focus on a single scenario, using only one set of mock data and testing one specific behavior or outcome to ensure clarity and isolate issues.
Observation: Each test function focuses on a single scenario, such as `test_movielens_accuracy` focusing on the accuracy of the model on the Movielens dataset, and `test_bpr_precision` focusing on the precision of the BPR loss function.
Evaluation: Satisfied

Requirement Title: Focused and Narrow Assertions
Requirement: Assertions within tes