In [1]:
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_community.document_loaders import DirectoryLoader, PythonLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.chains.combine_documents import create_stuff_documents_chain

# Load environment variables from a local .env file into the process environment.
# Prepare the .env file and the API key before running the cells below.
# NOTE(review): presumably the key is OPENAI_API_KEY, since ChatOpenAI/OpenAIEmbeddings are imported — confirm.
load_dotenv()

True

In [5]:
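# One-off setup, only if the package is missing (use %pip so the install targets this kernel's environment):
# %pip install langchain-chroma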

In [6]:
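# NOTE: superseded first experiment, kept commented out for reference only; the active cells below replace it.
# load doc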
# loader = DirectoryLoader(
#     # '../../../lightfm/tests', 
#     '../../../SVD_Compression/llm_test/test', 
#     glob="**/*.py", 
#     show_progress=True, 
#     #use_multithreading=True,
#     loader_cls=PythonLoader
# )
# docs = loader.load()

# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
# all_splits = text_splitter.split_documents(docs)

# # vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())
# # retriever = vectorstore.as_retriever()
# # context = retriever.invoke("How many test functions are there?")

# # define prompt and chat
# prompt = ChatPromptTemplate.from_messages([
#     ("system", "Analyze the test functions from the codes below:\n\n{context}"),
#     MessagesPlaceholder(variable_name="messages")
# ])

# # chat = ChatOpenAI(model='gpt-4-turbo')
# chat = ChatOpenAI(model='gpt-4o', temperature=0.1)

# # combine prompt, chat and doc
# docs_chain = create_stuff_documents_chain(chat, prompt) 

# for chunk in docs_chain.stream({
#     "context": all_splits,
#     "messages": [
#         HumanMessage(content="How many test functions are there? Can you list them all?")
#     ],
# }):
#     print(chunk, end="", flush=True)

### Sample Checklist

In [2]:
# Sample ML-system-testing checklist: five numbered yes/no questions, one per line,
# wrapped in a leading and a trailing newline. The text is inserted verbatim into
# the system prompt, so the wording must not be altered casually.
checklist_sample = (
    "\n"
    "1. Does it contain test case to ensure models are loaded correctly without errors, checking for issues in model setup or initialization?\n"
    "2. Does it contain test case to verify that the output dimensions and values from model predictions match expected outcomes?\n"
    "3. Does it contain test case to confirm the accuracy and correctness of evaluation metrics used within the system, ensuring that metrics such as precision, recall, AUC, etc., are computed correctly?\n"
    "4. Does it contain test case to evaluate the model’s performance over training to identify potential overfitting? This could involve comparing training and validation loss.\n"
    "5. Does it contain test case to define and enforce performance thresholds for crucial metrics to guarantee model performance?\n"
)

In [3]:
# Build the checklist-evaluation chain for the sample checklist:
#   1. load every *.py file found under the test directory,
#   2. also prepare a chunked view of those files,
#   3. combine the chat model and the prompt into a "stuff documents" chain.
loader = DirectoryLoader(
    '../../data/raw/openja/lightfm/tests',
    # Alternative locations used in earlier runs:
    # '../../../lightfm/tests',
    # '../../../SVD_Compression/llm_test/test',
    glob="**/*.py",
    show_progress=True,
    # use_multithreading=True,
    loader_cls=PythonLoader,
)
docs = loader.load()

# Chunked view of the same files: 500-character pieces with no overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
all_splits = text_splitter.split_documents(docs)

chat = ChatOpenAI(model='gpt-4o', temperature=0.1)

# The checklist is bound through a `{checklist}` template variable rather than
# being spliced into the template text with an f-string. With the f-string, any
# `{` or `}` inside the checklist would be parsed by ChatPromptTemplate as a
# template variable and break formatting; bound as a value it is inserted
# verbatim. The rendered prompt is unchanged for checklists without braces.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a senior machine learning engineer who specializes in performing Machine Learning system testing. Extract and analyze the test functions from the codes:\n\n{context}"),
    ("system", "Here is the Machine Learning system testing checklist delimited by triple quotes '''{checklist}'''"),
    MessagesPlaceholder(variable_name="messages"),
]).partial(checklist=checklist_sample)

# `{context}` is filled by the chain from the documents passed at call time.
docs_chain = create_stuff_documents_chain(chat, prompt)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 2027.34it/s]


In [6]:
# Sanity check: number of documents returned by loader.load().
len(docs)

8

In [9]:
# Ask the chain to grade the loaded test files (whole documents as context)
# against the checklist, and echo the reply piece by piece as it streams in.
evaluation_request = HumanMessage(content="""
        Evaluate whether the codes has fulfilled the requirements and deliver a completion score. Do not include a summary evaluation.
        Desired format:
        Checklist Evaluation:
            Requirement Title:
            Requirement:
            Observation:
            Evaluation:
        Completion Score:
        """)

response_stream = docs_chain.stream({
    "context": docs,
    "messages": [evaluation_request],
})
for piece in response_stream:
    # No newline between pieces; flush so the text appears as it is received.
    print(piece, end="", flush=True)

Checklist Evaluation:
1. Requirement Title: Model Initialization
   Requirement: Does it contain test case to ensure models are loaded correctly without errors, checking for issues in model setup or initialization?
   Observation: The code includes multiple test cases that initialize models with various parameters and configurations, ensuring that models are loaded correctly without errors.
   Evaluation: Fulfilled

2. Requirement Title: Output Dimensions and Values
   Requirement: Does it contain test case to verify that the output dimensions and values from model predictions match expected outcomes?
   Observation: The code includes test cases such as `test_predict`, `test_predict_ranks`, and `test_get_representations` that verify the output dimensions and values from model predictions.
   Evaluation: Fulfilled

3. Requirement Title: Evaluation Metrics Accuracy
   Requirement: Does it contain test case to confirm the accuracy and correctness of evaluation metrics used within the syst

### Real Checklist

#### Checklist - Before Engineering

In [10]:
# Project checklist as originally written ("before engineering"): sixteen items, each
# kept as a quoted, comma-terminated phrase inside one multi-line string.
# NOTE: the "After Engineering" cell assigns `checklist_project` again, so whichever
# of the two cells ran last determines the value used by the prompt.
checklist_project = '''
'Every test function should have a clear, descriptive name',
'Each test should only test one scenario, meaning that in each test we should only use one set of mock data.',
'The assertions inside the tests should be narrow, meaning that when checking a complex object, any unrelated behavior should not be tested - Assert on only relevant behaviors.',
"The modifications and the assertions of an object's behavior in a single test should not be far away from each other.",
"Verify the function for loading data files load the file if the files exists with the right format, and doesn't load the file if it doesn't exist, and that it returns the expected results.",
'Verify the functions for saving data and figures can write as expected. They should check the if the write operation is successfully carried out, and the content is in an expected format.',
'Ensure that all data files are non-empty and contain the necessary data to proceed with the analysis or processing tasks.',
'Check that the data to be ingested is in the format expected by the processing algorithms (e.g., Is the CSV loaded as a `pd.DataFrame`? Is the image file loaded as a `np.array`, or a `PIL.Image`?) and that their structure matches the expected schema, any present.',
'Verify that the data files are free of unexpected null values and identify any outliers that may skew the results or affect the analysis. If null values are expected, it must be explicitly stated in the tests.',
'Test input and output so that a fixed input would get an expected output. One such test could be testing the output shap of the data after transformation. Ideally, each test should be limited to test just one verification.',
'Confirm that the model accepts the correct input shapes and types, and produces outputs of the expected shapes and types without errors.',
"If a parametric model is used during the training process, make sure that the model's weights are being updated correctly as expected per training iteration. If a non-parametric model is used, check if the data is fitted into the model.",
'Ensure that the shape of model output aligns with what is expected based the task of the model. For example, in classification task, the shape of the model output should be aligned with the number of labels in the dataset.',
'Verify that the output values are aligned with the task of the model. For example, a classification model would output probabilities which should sum to 1.',
"If the model relies on gradient descent for training, make sure a single gradient step on a batch of data yields a decrease in the model's training loss.",
'Confirm that there is no leakage between train/val/test or CV splits.',
'''

#### Checklist - After Engineering

In [2]:
# Project checklist after prompt engineering: sixteen plain-sentence requirements,
# one per line, wrapped in a leading and a trailing newline. The text is inserted
# verbatim into the system prompt.
checklist_project = (
    "\n"
    "Each test function should have a clear, descriptive name that accurately reflects the test's purpose and the specific functionality or scenario it examines.\n"
    "Each test should focus on a single scenario, using only one set of mock data and testing one specific behavior or outcome to ensure clarity and isolate issues.\n"
    "Assertions within tests should be focused and narrow. Ensure you are only testing relevant behaviors of complex objects and not including unrelated assertions.\n"
    "Keep any modifications to objects and the corresponding assertions close together in your tests to maintain readability and clearly show the cause-and-effect relationship.\n"
    "Ensure that data-loading functions correctly load files when they exist and match the expected format, handle non-existent files appropriately, and return the expected results.\n"
    "Verify that functions for saving data and figures perform write operations correctly, checking that the operation succeeds and the content matches the expected format.\n"
    "Ensure all data files are non-empty and contain the necessary data required for further analysis or processing tasks.\n"
    "Verify that the data to be ingested matches the format expected by processing algorithms (like pd.DataFrame for CSVs or np.array for images) and adheres to the expected schema.\n"
    "Check that data files are free from unexpected null values and identify any outliers that could affect the analysis. Tests should explicitly state if null values are part of expected data.\n"
    "Test that a fixed input to a function or model produces the expected output, focusing on one verification per test to ensure predictable behavior.\n"
    "Confirm that the model accepts inputs of the correct shapes and types and produces outputs that meet the expected shapes and types without any errors.\n"
    "For parametric models, ensure that the model's weights update correctly per training iteration. For non-parametric models, verify that the data fits correctly into the model.\n"
    "Ensure the shape of the model's output aligns with the expected structure based on the task, such as matching the number of labels in a classification task.\n"
    "Verify that the model's output values are appropriate for its task, such as outputting probabilities that sum to 1 for classification tasks.\n"
    "If using gradient descent for training, verify that a single gradient step on a batch of data results in a decrease in the model's training loss.\n"
    "Confirm that there is no leakage of data between training, validation, and testing sets, or across cross-validation folds, to ensure the integrity of the splits.\n"
)

In [3]:
# Build the checklist-evaluation chain for the project checklist:
#   1. load every *.py file found under the test directory,
#   2. also prepare a chunked view of those files,
#   3. combine the chat model and the prompt into a "stuff documents" chain.
loader = DirectoryLoader(
    '../../../lightfm/tests',
    # Alternative location used in earlier runs:
    # '../../../SVD_Compression/llm_test/test',
    glob="**/*.py",
    show_progress=True,
    # use_multithreading=True,
    loader_cls=PythonLoader,
)
docs = loader.load()

# Chunked view of the same files: 500-character pieces with no overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
all_splits = text_splitter.split_documents(docs)

chat = ChatOpenAI(model='gpt-4o', temperature=0.1)

# The checklist is bound through a `{checklist}` template variable rather than
# being spliced into the template text with an f-string. With the f-string, any
# `{` or `}` inside the checklist would be parsed by ChatPromptTemplate as a
# template variable and break formatting; bound as a value it is inserted
# verbatim. The rendered prompt is unchanged for checklists without braces.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a senior machine learning engineer who specializes in performing Machine Learning system testing. Extract and analyze the test functions from the codes:\n\n{context}"),
    ("system", "Here is the Machine Learning system testing checklist delimited by triple quotes '''{checklist}'''"),
    MessagesPlaceholder(variable_name="messages"),
]).partial(checklist=checklist_project)

# `{context}` is filled by the chain from the documents passed at call time.
docs_chain = create_stuff_documents_chain(chat, prompt)

100%|████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 1980.20it/s]


In [4]:
# Grade the test suite against the project checklist using the whole files
# (`docs`) as context, echoing the reply piece by piece as it streams in.
evaluation_request = HumanMessage(content="""
        Your task is to answer each question in the checklist using only the provided test functions.
        If an answer to the question is provided, it must be annotated with a citation of the test function(s) in the Observation session.
        Then, decide the completion score in a fraction format based on your answers. The denominator should be the number of checklist items.
        Desired format:
            Checklist Evaluation:
                Requirement Title:
                Requirement:
                Observation:
                Evaluation: Satisfied/Partially Satisfied/Not Satisfied
            Completion Score: Number of satisfied requirements/Number of requirements
                Number of satisfied requirements:
                Number of partially satisfied requirements:
                Number of not satisfied requirements:
        """)

response_stream = docs_chain.stream({
    "context": docs,
    "messages": [evaluation_request],
})
for piece in response_stream:
    # No newline between pieces; flush so the text appears as it is received.
    print(piece, end="", flush=True)

Checklist Evaluation:

1. Requirement Title: Descriptive Test Function Names
   Requirement: Each test function should have a clear, descriptive name that accurately reflects the test's purpose and the specific functionality or scenario it examines.
   Observation: Test functions like `test_in_positives`, `test_movielens_accuracy`, `test_logistic_precision`, `test_bpr_precision`, etc., have clear and descriptive names.
   Evaluation: Satisfied

2. Requirement Title: Single Scenario Focus
   Requirement: Each test should focus on a single scenario, using only one set of mock data and testing one specific behavior or outcome to ensure clarity and isolate issues.
   Observation: Each test function focuses on a specific scenario, such as `test_movielens_accuracy` focusing on the accuracy of the Movielens dataset.
   Evaluation: Satisfied

3. Requirement Title: Focused Assertions
   Requirement: Assertions within tests should be focused and narrow. Ensure you are only testing relevant behav

In [7]:
# Same request as the whole-file run, but the context passed to the chain is the
# list of 500-character chunks (`all_splits`) instead of the full documents.
evaluation_request = HumanMessage(content="""
        Your task is to answer each question in the checklist using only the provided test functions.
        If an answer to the question is provided, it must be annotated with a citation of the test function(s) in the Observation session.
        Then, decide the completion score in a fraction format based on your answers. The denominator should be the number of checklist items.
        Desired format:
            Checklist Evaluation:
                Requirement Title:
                Requirement:
                Observation:
                Evaluation: Satisfied/Partially Satisfied/Not Satisfied
            Completion Score: Number of satisfied requirements/Number of requirements
                Number of satisfied requirements:
                Number of partially satisfied requirements:
                Number of not satisfied requirements:
        """)

response_stream = docs_chain.stream({
    "context": all_splits,
    "messages": [evaluation_request],
})
for piece in response_stream:
    # No newline between pieces; flush so the text appears as it is received.
    print(piece, end="", flush=True)

Checklist Evaluation:

1. Requirement Title: Clear, Descriptive Test Names
   Requirement: Each test function should have a clear, descriptive name that accurately reflects the test's purpose and the specific functionality or scenario it examines.
   Observation: Test functions like `test_in_positives`, `test_movielens_accuracy`, `test_logistic_precision`, `test_bpr_precision`, etc., have clear and descriptive names.
   Evaluation: Satisfied

2. Requirement Title: Single Scenario Focus
   Requirement: Each test should focus on a single scenario, using only one set of mock data and testing one specific behavior or outcome to ensure clarity and isolate issues.
   Observation: Tests like `test_in_positives`, `test_movielens_accuracy`, and `test_logistic_precision` focus on specific scenarios and behaviors.
   Evaluation: Satisfied

3. Requirement Title: Focused Assertions
   Requirement: Assertions within tests should be focused and narrow. Ensure you are only testing relevant behaviors o