In [1]:
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_community.document_loaders import DirectoryLoader, PythonLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.chains.combine_documents import create_stuff_documents_chain

# Read variables from a local .env file into the process environment
# (presumably the OpenAI API key used by the cells below — confirm).
load_dotenv()

True

In [2]:
# Load every Python file under the lightfm test directory as a document.
loader = DirectoryLoader(
    '../../data/raw/openja/lightfm/tests',
    glob="**/*.py",
    show_progress=True,
    loader_cls=PythonLoader
)
raw_docs = loader.load()

# Split the files into chunks (chunk_size=500, no overlap) and index them.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
all_splits = text_splitter.split_documents(raw_docs)

vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())
# The documented way to set the number of returned chunks is `search_kwargs`;
# a bare `k=4` keyword is not one of the retriever's settings.
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
# `docs` holds only the retrieved chunks; the loaded files stay in `raw_docs`.
docs = retriever.invoke("How many test functions are there?")

# Prompt: the retrieved chunks fill {context}; the conversation fills "messages".
prompt = ChatPromptTemplate.from_messages([
    ("system", "Analyze the test functions from the codes below:\n\n{context}"),
    MessagesPlaceholder(variable_name="messages")
])
chat = ChatOpenAI(model='gpt-4')

# Combine prompt, chat model and documents into one chain.
docs_chain = create_stuff_documents_chain(chat, prompt)

# Stream the answer so it is printed piece by piece as it arrives.
for chunk in docs_chain.stream({
    "context": docs,
    "messages": [
        HumanMessage(content="How many test functions are there? Can you list them all?")
    ],
}):
    print(chunk, end="", flush=True)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 5318.50it/s]


There are three test functions in the provided code. They are:

1. test_basic_fetching_stackexchange
2. test_bpr_precision
3. test_bpr_precision_multithreaded

### Analysis workflow wrapped into functions

In [1]:
from dotenv import load_dotenv

from langchain_community.document_loaders import PythonLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.chains.combine_documents import create_stuff_documents_chain

from langchain.memory import ChatMessageHistory
from langchain_core.messages import AIMessage, HumanMessage

# Read variables from a local .env file into the process environment
# (presumably the OpenAI API key used by the cells below — confirm).
load_dotenv()

True

In [2]:
# Sample ML-system testing checklist: five numbered yes/no questions about
# what a test suite covers. It is passed to evaluate_ml_tests as `checklist`.
CHECKLIST_SAMPLE = '''
1. Does it contain test case to ensure models are loaded correctly without errors, checking for issues in model setup or initialization?
2. Does it contain test case to verify that the output dimensions and values from model predictions match expected outcomes?
3. Does it contain test case to confirm the accuracy and correctness of evaluation metrics used within the system, ensuring that metrics such as precision, recall, AUC, etc., are computed correctly?
4. Does it contain test case to evaluate the model’s performance over training to identify potential overfitting? This could involve comparing training and validation loss.
5. Does it contain test case to define and enforce performance thresholds for crucial metrics to guarantee model performance?
'''

def load_test_file(path):
    """Load one Python source file and split it into chunks.

    Args:
        path: Location of the file, handed to ``PythonLoader``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` yields for
        the loaded file with ``chunk_size=1000`` and ``chunk_overlap=0``.
    """
    source_docs = PythonLoader(path).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return splitter.split_documents(source_docs)

def get_ai_response(message, py_splits, history=None, chain=None):
    """Ask one question about the code chunks and record the exchange.

    Args:
        message: Question text; appended to ``history`` as a user message.
        py_splits: Code chunks passed to the chain as ``context``.
        history: Conversation so far. A new ``ChatMessageHistory`` is created
            when this is ``None``.
        chain: Chain to invoke. When ``None``, one is built from a
            code-analysis system prompt and a ``gpt-4`` ``ChatOpenAI`` model.

    Returns:
        A ``(resp, history, chain)`` tuple, so the caller can feed the same
        history and chain into the next call.
    """
    if chain is None:
        # Default chain: system prompt with the code in {context}, followed by
        # the running conversation.
        analysis_prompt = ChatPromptTemplate.from_messages([
            ("system", "You are a coder analyzer. Please understand the code and answer the question as accurate as possible. Analyze the test functions from the codes below:\n\n{context}"),
            MessagesPlaceholder(variable_name="messages"),
        ])
        chain = create_stuff_documents_chain(ChatOpenAI(model='gpt-4'), analysis_prompt)

    history = ChatMessageHistory() if history is None else history

    # The question is recorded first so it is part of the messages sent.
    history.add_user_message(message)
    chain_input = {"context": py_splits, "messages": history.messages}
    resp = chain.invoke(chain_input)
    history.add_ai_message(resp)

    return resp, history, chain

def get_ai_responses(messages, py_splits, verbose=True):
    """Ask a sequence of questions in one conversation and return its history.

    Args:
        messages: Iterable of question strings, asked in order.
        py_splits: Code chunks forwarded to ``get_ai_response`` for every
            question.
        verbose: When true, print each question and its response.

    Returns:
        The conversation history after all questions. If ``messages`` is
        empty, an empty ``ChatMessageHistory`` is returned so callers can
        still append to it.
    """
    history = None
    chain = None

    for msg in messages:
        if verbose:
            print(f"Q: {msg}")

        # On the first turn both are None, which makes get_ai_response create
        # them; later turns reuse the objects it handed back.
        resp, history, chain = get_ai_response(
            message=msg,
            py_splits=py_splits,
            history=history,
            chain=chain
        )

        if verbose:
            print(f"Response: {resp}")
            print()

    if history is None:
        # No question was asked: return an empty history instead of failing
        # on an unbound variable.
        history = ChatMessageHistory()

    return history

def evaluate_ml_tests(py_splits, history, checklist):
    """Score the test code against a checklist, continuing a conversation.

    Args:
        py_splits: Code chunks passed to the chain as ``context``.
        history: Existing conversation history. It is modified in place: the
            evaluation request and the model's report are appended to it.
        checklist: Checklist text embedded in a system message.

    Returns:
        A ``(report, history)`` tuple: the chain's output and the same
        history object that was passed in.
    """
    # The checklist is spliced into a prompt *template*, where single braces
    # mark template variables. Doubling them keeps checklist braces literal.
    escaped_checklist = str(checklist).replace("{", "{{").replace("}", "}}")

    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a senior machine learning engineer who specializes in performing Machine Learning system testing. Extract and analyze the test functions from the codes:\n\n{context}"),
        ("system", f"Here is the Machine Learning system testing checklist delimited by triple quotes '''{escaped_checklist}'''"),
        MessagesPlaceholder(variable_name="messages")
    ])
    chat = ChatOpenAI(model='gpt-4')
    chain = create_stuff_documents_chain(chat, prompt)

    # This request is added as a message object rather than template text,
    # so the braces in the JSON sketch below are left as written.
    history.add_user_message("""
        Evaluate whether the codes has fulfilled the requirements and deliver a completion score. Do not include a summary evaluation.
        Desired JSON format:
            {
                "Requirement Title":
                "Requirement":
                "Observation":
                "Related Functions": [ ... ]
                "Evaluation": (1 for Fulfilled / 0.5 for Partially fulfilled / 0 for Not fulfilled)
            }
    """)

    report = chain.invoke({
        "context": py_splits,
        "messages": history.messages
    })

    history.add_ai_message(report)

    return report, history

In [3]:
# Chunk a single test module for analysis.
py_splits = load_test_file("../../data/raw/openja/lightfm/tests/test_evaluation.py")

# Ask three warm-up questions first, so the conversation history already
# holds answers about the file's functions when the checklist is evaluated.
history = get_ai_responses(
    [
        "How many functions are defined in the code? list them all",
        "What is each of the functions doing?",
        "Which of them are related to ML pipeline test cases?",
    ],
    py_splits,
    verbose=False,
)

# Continue the same conversation with the checklist evaluation request.
report, history = evaluate_ml_tests(
    py_splits=py_splits,
    history=history,
    checklist=CHECKLIST_SAMPLE,
)

In [4]:
# Show the report text returned by evaluate_ml_tests in the previous cell.
print(report)

[
    {
        "Requirement Title": "Model Loading and Initialization",
        "Requirement": "Test case to ensure models are loaded correctly without errors, checking for issues in model setup or initialization.",
        "Observation": "The code doesn't seem to have specific test cases for model loading and initialization.",
        "Related Functions": [],
        "Evaluation": 0
    },
    {
        "Requirement Title": "Output Validation",
        "Requirement": "Test case to verify that the output dimensions and values from model predictions match expected outcomes.",
        "Observation": "The code includes tests checking the outputs of the model predictions and comparing them with expected values.",
        "Related Functions": ["test_precision_at_k", "test_recall_at_k", "test_auc_score"],
        "Evaluation": 1
    },
    {
        "Requirement Title": "Evaluation Metrics Correctness",
        "Requirement": "Test case to confirm the accuracy and correctness of evaluation 

### Run the evaluation on every Python test file in a repository

In [5]:
from modules.repo import Repository # under ~/src/test_creation/: ln -s ../code_analyzer/modules .
from collections import defaultdict
from tqdm import tqdm

repo = Repository("../../data/raw/openja/lightfm/")
# Only the 'Python' entry of the listing is evaluated.
test_files = repo.list_test_files()['Python']

# Warm-up questions asked before the checklist evaluation. They are the same
# for every file, so the list is built once outside the loop.
warmup_questions = [
    "How many functions are defined in the code? list them all",
    "What is each of the functions doing?",
    "Which of them are related to ML pipeline test cases?"
]

# Plain dict mapping each test file to its report and conversation history.
reports = {}
for test_file in tqdm(test_files):
    print(f"Evaluating {test_file}")
    py_splits = load_test_file(test_file)

    history = get_ai_responses(
        py_splits=py_splits,
        messages=warmup_questions,
        verbose=False
    )

    report, history = evaluate_ml_tests(py_splits, history, CHECKLIST_SAMPLE)
    print(report)
    print()

    reports[test_file] = {
        'report': report,
        'history': history
    }

  0%|                                                                                                                                                                                                                   | 0/5 [00:00<?, ?it/s]

Evaluating ../../data/raw/openja/lightfm/tests/test_datasets.py


 20%|████████████████████████████████████████▌                                                                                                                                                                  | 1/5 [00:59<03:58, 59.62s/it]

[
    {
        "Requirement Title": "Model Loading and Initialization",
        "Requirement": "The test case should ensure models are loaded correctly without errors, checking for issues in model setup or initialization.",
        "Observation": "The provided code does not include any explicit model loading or initialization; it is primarily focused on fetching and structuring datasets.",
        "Related Functions": [],
        "Evaluation": 0
    },
    {
        "Requirement Title": "Model Prediction Outcomes",
        "Requirement": "The test case should verify that the output dimensions and values from model predictions match expected outcomes.",
        "Observation": "The provided code does not include any model prediction functionality, hence there is no test for verifying output dimensions and values from model predictions.",
        "Related Functions": [],
        "Evaluation": 0
    },
    {
        "Requirement Title": "Evaluation Metrics",
        "Requirement": "The te

 40%|█████████████████████████████████████████████████████████████████████████████████▏                                                                                                                         | 2/5 [01:56<02:53, 57.97s/it]

[    
    {
        "Requirement Title": "Model Loading Test",
        "Requirement": "Test case to ensure models are loaded correctly without errors, checking for issues in model setup or initialization.",
        "Observation": "The provided code does not perform any tests related to model loading or initialization.",
        "Related Functions": [],
        "Evaluation": 0
    },
    {
        "Requirement Title": "Output Verification Test",
        "Requirement": "Test case to verify that the output dimensions and values from model predictions match expected outcomes.",
        "Observation": "The provided code does not perform any tests related to verifying the output dimensions and values from model predictions.",
        "Related Functions": [],
        "Evaluation": 0
    },
    {
        "Requirement Title": "Evaluation Metrics Accuracy Test",
        "Requirement": "Test case to confirm the accuracy and correctness of evaluation metrics used within the system.",
        "Obse

 60%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                 | 3/5 [03:11<02:11, 65.80s/it]

[
    {
        "Requirement Title": "Loading and Initializing Models",
        "Requirement": "The system should contain a test case to ensure models are loaded correctly without errors, checking for issues in model setup or initialization.",
        "Observation": "The system initializes the LightFM model in every testing function and there are no explicit tests for model loading or initialization.",
        "Related Functions": ["test_precision_at_k", "test_precision_at_k_with_ties", "test_recall_at_k", "test_auc_score", "test_intersections_check"],
        "Evaluation": 0.5
    },
    {
        "Requirement Title": "Output Dimensions and Values",
        "Requirement": "The system should contain a test case to verify that the output dimensions and values from model predictions match expected outcomes.",
        "Observation": "The system performs checks on the output of evaluation metrics in every testing function.",
        "Related Functions": ["test_precision_at_k", "test_precis

 80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                        | 4/5 [04:02<00:59, 59.87s/it]

[
    {
        "Requirement Title": "ML Pipeline Test Case - Dataset Fitting",
        "Requirement": "A test case should be in place to ensure the dataset fitting process works as expected.",
        "Observation": "The functions 'test_fitting' and 'test_fitting_no_identity' tests the dataset fitting process for different scenarios. They check whether the interactions and features shapes are as expected.",
        "Related Functions": ["test_fitting", "test_fitting_no_identity"],
        "Evaluation": 1
    },
    {
        "Requirement Title": "ML Pipeline Test Case - Exception Handling",
        "Requirement": "A test case should be in place to ensure the ML pipeline can handle unexpected inputs or situations.",
        "Observation": "The function 'test_exceptions' checks if ValueError is raised when trying to build interactions with values outside the specified range. The system recovers without crashing, which is a good practice in ML pipelines.",
        "Related Functions": ["

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [05:31<00:00, 66.28s/it]

Here is the evaluation of the code in the desired JSON format:

```
[
    {
        "Requirement Title": "Loading and Initialization",
        "Requirement": "The model should load and initialize without errors",
        "Observation": "The code contains tests for model loading and initialization. It checks for cases where input data may be empty or have different formats and datatypes.",
        "Related Functions": ["test_empty_matrix", "test_matrix_types", "test_input_dtypes"],
        "Evaluation": 1
    },
    {
        "Requirement Title": "Testing Predictions",
        "Requirement": "The output dimensions and values from model predictions match expected outcomes",
        "Observation": "The code contains tests to check the model's predict function under different conditions including not being fitted, overflow, and standard conditions.",
        "Related Functions": ["test_predict", "test_overflow_predict", "test_predict_not_fitted"],
        "Evaluation": 1
    },
    {
     


