In [4]:
from dotenv import load_dotenv

from langchain_community.document_loaders import PythonLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.chains.combine_documents import create_stuff_documents_chain

from langchain.memory import ChatMessageHistory
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

load_dotenv()

True

In [147]:
# Sample checklist of five numbered yes/no questions about ML test coverage.
# It is passed as the `checklist` argument of `evaluate_ml_tests`, which embeds
# it verbatim in a system prompt between triple quotes.
CHECKLIST_SAMPLE = '''
1. Does it contain test case to ensure models are loaded correctly without errors, checking for issues in model setup or initialization?
2. Does it contain test case to verify that the output dimensions and values from model predictions match expected outcomes?
3. Does it contain test case to confirm the accuracy and correctness of evaluation metrics used within the system, ensuring that metrics such as precision, recall, AUC, etc., are computed correctly?
4. Does it contain test case to evaluate the model’s performance over training to identify potential overfitting? This could involve comparing training and validation loss.
5. Does it contain test case to define and enforce performance thresholds for crucial metrics to guarantee model performance?
'''

def load_test_file(path):
    """Load a Python source file and split it into chunks.

    Args:
        path: Location of the Python file, handed to ``PythonLoader``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` on
        the loaded documents, using ``chunk_size=1000`` and
        ``chunk_overlap=0``.
    """
    documents = PythonLoader(path).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return splitter.split_documents(documents)

def get_ai_response(message, py_splits, history=None, chain=None):
    """Ask one question about the loaded code and record the exchange.

    Args:
        message: Text of the user question.
        py_splits: Documents supplied to the chain under the ``context`` key.
        history: Chat history to extend. It is mutated in place (the user
            message and the reply are appended). A new ``ChatMessageHistory``
            is created when this is ``None``.
        chain: Object whose ``invoke`` is called with the context and the
            history messages. When ``None``, a stuff-documents chain is built
            from a code-analysis system prompt and ``ChatOpenAI(model='gpt-4')``.

    Returns:
        A ``(resp, history, chain)`` tuple: the value returned by
        ``chain.invoke``, the updated history, and the chain that was used,
        so that callers can reuse both on the next turn.
    """
    if chain is None:
        analysis_prompt = ChatPromptTemplate.from_messages([
            ("system", "You are a coder analyzer. Please understand the code and answer the question as accurate as possible. Analyze the test functions from the codes below:\n\n{context}"),
            MessagesPlaceholder(variable_name="messages"),
        ])
        chain = create_stuff_documents_chain(ChatOpenAI(model='gpt-4'), analysis_prompt)

    history = ChatMessageHistory() if history is None else history

    # The question must be in the history before invoking, because the chain
    # only sees the conversation through `history.messages`.
    history.add_user_message(message)
    payload = {"context": py_splits, "messages": history.messages}
    resp = chain.invoke(payload)
    history.add_ai_message(resp)

    return resp, history, chain

def get_ai_responses(messages, py_splits, verbose=True):
    """Ask a series of questions in one conversation and return its history.

    Every message is sent through ``get_ai_response``. The history and chain
    returned by one call are fed into the next, so all questions share a single
    conversation and a single chain.

    Args:
        messages: Iterable of user question strings, asked in order.
        py_splits: Documents supplied to the chain as context on every turn.
        verbose: When true, print each question and its response.

    Returns:
        The chat history containing every question and response. For an empty
        ``messages`` an empty ``ChatMessageHistory`` is returned (previously
        this case raised ``UnboundLocalError``).
    """
    # Starting from None lets get_ai_response create the history and chain on
    # the first turn; later turns reuse whatever it handed back.
    history = None
    chain = None

    for msg in messages:
        if verbose:
            print(f"Q: {msg}")

        resp, history, chain = get_ai_response(
            message=msg,
            py_splits=py_splits,
            history=history,
            chain=chain
        )

        if verbose:
            print(f"Response: {resp}")
            print()

    if history is None:
        # No question was asked; still give callers a usable history object.
        history = ChatMessageHistory()

    return history

def evaluate_ml_tests(py_splits, history, checklist):
    """Ask the model to score the test code against a checklist.

    Builds a stuff-documents chain around ``ChatOpenAI(model='gpt-4')`` with
    two system messages (the code context and the checklist), appends the
    evaluation request to ``history``, invokes the chain and records the reply.

    Args:
        py_splits: Documents supplied to the chain under the ``context`` key.
        history: Existing chat history. It is mutated in place: the evaluation
            request and the model's reply are appended, so calling this
            function repeatedly on the same history keeps growing it.
        checklist: Checklist text inserted into the system prompt between
            triple quotes.

    Returns:
        A ``(report, history)`` tuple: the value returned by ``chain.invoke``
        and the updated history. The reply is requested in JSON form but is
        not validated here.
    """
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a senior machine learning engineer who specializes in performing Machine Learning system testing. Extract and analyze the test functions from the codes:\n\n{context}"),
        # `{checklist}` is a template variable filled in at invoke time. Splicing
        # the text in with an f-string would make any `{` or `}` inside the
        # checklist be parsed as a template placeholder.
        ("system", "Here is the Machine Learning system testing checklist delimited by triple quotes '''{checklist}'''"),
        MessagesPlaceholder(variable_name="messages")
    ])
    chat = ChatOpenAI(model='gpt-4')
    chain = create_stuff_documents_chain(chat, prompt)

    # This message reaches the prompt through MessagesPlaceholder as a message
    # object, so the literal braces below are not treated as template syntax.
    history.add_user_message("""
        Evaluate whether the codes has fulfilled the requirements and deliver a completion score. Do not include a summary evaluation.
        Desired JSON format:
            {
                "Requirement Title":
                "Requirement":
                "Observation":
                "Related Functions": [ ... ]
                "Evaluation": (1 for Fulfilled / 0.5 for Partially fulfilled / 0 for Not fulfilled)
            }
    """)

    report = chain.invoke({
        "context": py_splits,
        "checklist": checklist,
        "messages": history.messages
    })

    history.add_ai_message(report)

    return report, history

def extract_json(string, between='`'):
    """Parse the JSON payload embedded in a model reply.

    The payload is taken from between the first and the last occurrence of
    ``between`` (by default a backtick, which handles Markdown code fences).
    Text before the opening delimiter and after the closing one is ignored,
    and a leading ``json`` language tag is dropped. When the reply has fewer
    than two delimiters, the whole string is parsed instead.

    Args:
        string: Raw reply text.
        between: Delimiter marking the start and end of the payload.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If the extracted text is not valid JSON. This is
            a ``ValueError`` subclass.
    """
    # Local import: the notebook only imports `json` in a later cell, so this
    # keeps the function usable when cells are run top to bottom.
    import json

    first = string.find(between)
    # rfind yields a forward index. The earlier code searched the reversed
    # string and reused that offset on the forward one, which was only right
    # when the reply ended with the delimiter.
    last = string.rfind(between)

    if first == -1 or first == last:
        payload = string
    else:
        payload = string[first + len(between):last]

    # A ``` fence leaves extra delimiter characters at both ends of the slice.
    payload = payload.strip(between).strip()

    # Drop the language tag of a ```json fence. Valid JSON never starts with
    # these letters, so this cannot eat real content.
    if payload[:4].lower() == "json":
        payload = payload[4:]

    return json.loads(payload)

In [148]:
# Single-file run: load one test module, ask three warm-up questions about it,
# then ask for the checklist evaluation in the same conversation.
py_splits = load_test_file(path='../../data/raw/openja/lightfm/tests/test_evaluation.py')

history = get_ai_responses(
    messages=[
        "How many functions are defined in the code? list them all",
        "What is each of the functions doing?",
        "Which of them are related to ML pipeline test cases?"
    ],
    py_splits=py_splits,
    verbose=False,
)

report, history = evaluate_ml_tests(
    py_splits=py_splits,
    history=history,
    checklist=CHECKLIST_SAMPLE,
)

In [45]:
# Show the raw text of `report` exactly as it was returned.
print(report)

[
    {
        "Requirement Title": "Model Loading and Initialization",
        "Requirement": "Test case to ensure models are loaded correctly without errors, checking for issues in model setup or initialization",
        "Observation": "The code doesn't seem to have explicit test cases for model loading and initialization. The model is fit within each test case but there is no explicit verification of successful model setup or initialization.",
        "Related Functions": [],
        "Evaluation": 0
    },
    {
        "Requirement Title": "Output Dimensions and Values from Model Predictions",
        "Requirement": "Test case to verify that the output dimensions and values from model predictions match expected outcomes",
        "Observation": "The code does seem to test the output of model predictions within the test cases for precision, recall, and AUC. However, it does not explicitly verify the dimensions of these outputs.",
        "Related Functions": ["test_precision_at_k",

In [46]:
import json
# Parse the report directly. This only succeeds when the reply is bare JSON;
# a reply wrapped in prose or a ``` fence makes json.loads raise.
json.loads(report)

[{'Requirement Title': 'Model Loading and Initialization',
  'Requirement': 'Test case to ensure models are loaded correctly without errors, checking for issues in model setup or initialization',
  'Observation': "The code doesn't seem to have explicit test cases for model loading and initialization. The model is fit within each test case but there is no explicit verification of successful model setup or initialization.",
  'Related Functions': [],
  'Evaluation': 0},
 {'Requirement Title': 'Output Dimensions and Values from Model Predictions',
  'Requirement': 'Test case to verify that the output dimensions and values from model predictions match expected outcomes',
  'Observation': 'The code does seem to test the output of model predictions within the test cases for precision, recall, and AUC. However, it does not explicitly verify the dimensions of these outputs.',
  'Related Functions': ['test_precision_at_k',
   'test_recall_at_k',
   'test_auc_score'],
  'Evaluation': 0.5},
 {'Re

In [15]:
# Re-run the evaluation and display the returned (report, history) tuple.
# evaluate_ml_tests appends its request and the reply to `history`, so running
# this cell again makes the conversation longer each time.
evaluate_ml_tests(py_splits, history, CHECKLIST_SAMPLE)

Checklist Evaluation:

Requirement Title: Model Loading and Initialization
Requirement: Test case to ensure models are loaded correctly without errors, checking for issues in model setup or initialization.
Observation: The code includes functions to fit a model to the data, but there is no explicit test case to check if the model loads correctly without errors.
Related Functions: N/A
Evaluation: Not fulfilled

Requirement Title: Output Verification
Requirement: Test case to verify that the output dimensions and values from model predictions match expected outcomes.
Observation: The code includes several tests to verify that the outputs of the evaluation metrics match the expected outcomes. However, there isn't a specific test for verifying the dimensions and values of model predictions.
Related Functions: test_precision_at_k, test_precision_at_k_with_ties, test_recall_at_k, test_auc_score
Evaluation: Partially fulfilled

Requirement Title: Evaluation Metrics Accuracy
Requirement: Test 

In [47]:
from modules.repo import Repository
from collections import defaultdict
from tqdm import tqdm

# Evaluate every Python test file of the repository against the checklist.
repo = Repository("../../data/raw/openja/lightfm/")
# list_test_files() is indexed by language name here; only the 'Python' entry
# is used.
test_files = repo.list_test_files()['Python']

# Warm-up questions asked before the checklist evaluation. They are identical
# for every file, so the list is built once rather than on each iteration.
questions = [
    "How many functions are defined in the code? list them all",
    "What is each of the functions doing?",
    "Which of them are related to ML pipeline test cases?"
]

# A plain dict keyed by test-file path. The earlier `defaultdict()` had no
# default factory and therefore behaved exactly like a dict.
reports = {}
for test_file in tqdm(test_files):
    print(f"Evaluating {test_file}")
    py_splits = load_test_file(test_file)

    history = get_ai_responses(
        py_splits=py_splits,
        messages=questions,
        verbose=False
    )

    report, history = evaluate_ml_tests(py_splits, history, CHECKLIST_SAMPLE)
    print(report)
    print()

    # Keep the history as well, so follow-up questions can continue the same
    # conversation for this file later.
    reports[test_file] = {
        'report': report,
        'history': history
    }

  0%|                                                                                                                                                                                                                   | 0/5 [00:00<?, ?it/s]

Evaluating ../../data/raw/openja/lightfm/tests/test_datasets.py


 20%|████████████████████████████████████████▌                                                                                                                                                                  | 1/5 [00:47<03:09, 47.50s/it]

[
    {
        "Requirement Title": "Testing Model Data Loading",
        "Requirement": "The code should contain test cases to ensure models are loaded correctly without errors, checking for issues in model setup or initialization.",
        "Observation": "The code is testing data fetching methods, ensuring that the data loaded is as expected and correctly formatted.",
        "Related Functions": ["test_basic_fetching_movielens", "test_basic_fetching_stackexchange"],
        "Evaluation": 1
    },
    {
        "Requirement Title": "Output Dimensions and Values Verification",
        "Requirement": "The code should contain test cases to verify that the output dimensions and values from model predictions match expected outcomes.",
        "Observation": "The code tests the shape and type of the fetched data. However, it does not test any model predictions.",
        "Related Functions": ["test_basic_fetching_movielens", "test_basic_fetching_stackexchange"],
        "Evaluation": 0.5

 40%|█████████████████████████████████████████████████████████████████████████████████▏                                                                                                                         | 2/5 [01:44<02:38, 52.81s/it]

[
    {
        "Requirement Title": "Test Case for Data Splitting",
        "Requirement": "The code should contain a test case to ensure that the data splitting function correctly splits the dataset into disjoint training and test sets.",
        "Observation": "The function `test_random_train_test_split(test_percentage)` is a test case that checks the data splitting functionality. It asserts that the training and test sets are disjoint and that the size of the test set is as expected. Hence, the requirement is fulfilled.",
        "Related Functions": ["test_random_train_test_split(test_percentage)", "_assert_disjoint(x, y)"],
        "Evaluation": 1
    },
    {
        "Requirement Title": "Test Case for Model Loading and Initialization",
        "Requirement": "The code should contain a test case to ensure that models are loaded correctly without errors, checking for issues in model setup or initialization.",
        "Observation": "The provided code does not contain any test cas

 60%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                 | 3/5 [02:43<01:51, 55.72s/it]

Here is the evaluation in the desired JSON format:

```
[
    {
        "Requirement Title": "Test Case for Model Loading and Initialization",
        "Requirement": "Test case to ensure models are loaded correctly without errors, checking for issues in model setup or initialization.",
        "Observation": "The code does not have a specific test case to check the loading and initialization of the models.",
        "Related Functions": [],
        "Evaluation": 0
    },
    {
        "Requirement Title": "Test Case for Output Dimensions and Values",
        "Requirement": "Test case to verify that the output dimensions and values from model predictions match expected outcomes.",
        "Observation": "The code has test cases to validate the output from the model match the expected outcomes in precision, recall, and AUC.",
        "Related Functions": ["test_precision_at_k", "test_recall_at_k", "test_auc_score"],
        "Evaluation": 1
    },
    {
        "Requirement Title": "Test 

 80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                        | 4/5 [03:25<00:50, 50.47s/it]

[
    {
        "Requirement Title": "Test Fitting",
        "Requirement": "A test case to ensure models are loaded correctly and the fitting method of the Dataset class works correctly.",
        "Observation": "The function `test_fitting` successfully tests the fitting method of the Dataset class and the shapes of the interaction, user features, and item features are as expected.",
        "Related Functions": ["test_fitting"],
        "Evaluation": 1
    },
    {
        "Requirement Title": "Test Fitting with No Identity",
        "Requirement": "A test case to ensure models are loaded correctly and the fitting method of the Dataset class works correctly without identity features.",
        "Observation": "The function `test_fitting_no_identity` successfully tests the fitting method of the Dataset class when identity features are turned off.",
        "Related Functions": ["test_fitting_no_identity"],
        "Evaluation": 1
    },
    {
        "Requirement Title": "Test Exceptio

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:54<00:00, 58.86s/it]

[
    {
        "Requirement Title": "Testing Model Initialization",
        "Requirement": "The code should contain a test case to ensure that models are loaded correctly without errors, checking for issues in model setup or initialization.",
        "Observation": "The code includes several tests such as test_return_self(), test_param_sanity(), test_sklearn_api(), test_predict_not_fitted() that verify the model initialization and setup.",
        "Related Functions": ["test_return_self()", "test_param_sanity()", "test_sklearn_api()", "test_predict_not_fitted()"],
        "Evaluation": 1
    },
    {
        "Requirement Title": "Testing Model Prediction",
        "Requirement": "The code should contain a test case to verify that the output dimensions and values from model predictions match expected outcomes.",
        "Observation": "The test_predict() function checks the model's ability to predict scores for users and items. The test_predict_ranks() function tests the model's abilit




In [42]:
# Show the stored report for tests/test_datasets.py.
print(reports['../../data/raw/openja/lightfm/tests/test_datasets.py']['report'])

Checklist Evaluation:

1. Requirement Title: Model Loading and Initialization
   Requirement: Test case to ensure models are loaded correctly without errors, checking for issues in model setup or initialization.
   Observation: The code does not involve any model loading or initialization. It tests data fetching and data structure.
   Related Functions: None
   Evaluation: Not fulfilled

2. Requirement Title: Output Dimensions and Values
   Requirement: Test case to verify that the output dimensions and values from model predictions match expected outcomes.
   Observation: The code verifies the dimensions of the fetched data. However, there are no model predictions involved in the test.
   Related Functions: test_basic_fetching_movielens, test_basic_fetching_stackexchange
   Evaluation: Partially fulfilled

3. Requirement Title: Evaluation Metrics
   Requirement: Test case to confirm the accuracy and correctness of evaluation metrics used within the system, ensuring that metrics such a

In [64]:
# Show the third stored report (position 2 in insertion order).
print(list(reports.values())[2]['report'])

Here is the evaluation in the desired JSON format:

```
[
    {
        "Requirement Title": "Test Case for Model Loading and Initialization",
        "Requirement": "Test case to ensure models are loaded correctly without errors, checking for issues in model setup or initialization.",
        "Observation": "The code does not have a specific test case to check the loading and initialization of the models.",
        "Related Functions": [],
        "Evaluation": 0
    },
    {
        "Requirement Title": "Test Case for Output Dimensions and Values",
        "Requirement": "Test case to verify that the output dimensions and values from model predictions match expected outcomes.",
        "Observation": "The code has test cases to validate the output from the model match the expected outcomes in precision, recall, and AUC.",
        "Related Functions": ["test_precision_at_k", "test_recall_at_k", "test_auc_score"],
        "Evaluation": 1
    },
    {
        "Requirement Title": "Test 

In [139]:
# Flatten every per-file report into one list of rows, one row per requirement.
# NOTE: `eval` shadows the Python builtin of the same name. The name is kept
# because later cells read it.
eval = []
for r in reports.values():
    try:
        js = json.loads(r['report'])
    except ValueError:
        # json.JSONDecodeError is a ValueError: the reply was not bare JSON
        # (e.g. wrapped in prose or a code fence), so fall back to extraction.
        # Catching only ValueError lets interrupts and unrelated errors surface.
        js = extract_json(r['report'])

    # 'index' is the position of the requirement inside its own report.
    eval.extend({
        'index': i,
        'Requirement Title': d['Requirement Title'],
        'Requirement': d['Requirement'],
        'Evaluation': d['Evaluation']
    } for i, d in enumerate(js))

eval

[{'index': 0,
  'Requirement Title': 'Testing Model Data Loading',
  'Requirement': 'The code should contain test cases to ensure models are loaded correctly without errors, checking for issues in model setup or initialization.',
  'Evaluation': 1},
 {'index': 1,
  'Requirement Title': 'Output Dimensions and Values Verification',
  'Requirement': 'The code should contain test cases to verify that the output dimensions and values from model predictions match expected outcomes.',
  'Evaluation': 0.5},
 {'index': 2,
  'Requirement Title': 'Evaluation Metrics Accuracy',
  'Requirement': 'The code should contain test cases to confirm the accuracy and correctness of evaluation metrics used within the system.',
  'Evaluation': 0},
 {'index': 3,
  'Requirement Title': 'Model Performance Over Training',
  'Requirement': 'The code should contain test cases to evaluate the model’s performance over training to identify potential overfitting.',
  'Evaluation': 0},
 {'index': 4,
  'Requirement Title

In [141]:
# Compare the first requirement (index 0) of each file's report side by side.
# NOTE(review): `pd` is not imported in any visible cell - confirm that
# `import pandas as pd` exists elsewhere, otherwise this fails on a fresh kernel.
pd.DataFrame(eval).query('index == 0')

Unnamed: 0,index,Requirement Title,Requirement,Evaluation
0,0,Testing Model Data Loading,The code should contain test cases to ensure m...,1.0
5,0,Test Case for Data Splitting,The code should contain a test case to ensure ...,1.0
11,0,Test Case for Model Loading and Initialization,Test case to ensure models are loaded correctl...,0.0
16,0,Test Fitting,A test case to ensure models are loaded correc...,1.0
20,0,Testing Model Initialization,The code should contain a test case to ensure ...,1.0


In [142]:
# Compare the second requirement (index 1) of each file's report side by side.
# NOTE(review): `pd` is not imported in any visible cell - confirm that
# `import pandas as pd` exists elsewhere, otherwise this fails on a fresh kernel.
pd.DataFrame(eval).query('index == 1')

Unnamed: 0,index,Requirement Title,Requirement,Evaluation
1,1,Output Dimensions and Values Verification,The code should contain test cases to verify t...,0.5
6,1,Test Case for Model Loading and Initialization,The code should contain a test case to ensure ...,0.0
12,1,Test Case for Output Dimensions and Values,Test case to verify that the output dimensions...,1.0
17,1,Test Fitting with No Identity,A test case to ensure models are loaded correc...,1.0
21,1,Testing Model Prediction,The code should contain a test case to verify ...,1.0


In [144]:
# Show the first stored report (position 0 in insertion order).
print(list(reports.values())[0]['report'])

[
    {
        "Requirement Title": "Testing Model Data Loading",
        "Requirement": "The code should contain test cases to ensure models are loaded correctly without errors, checking for issues in model setup or initialization.",
        "Observation": "The code is testing data fetching methods, ensuring that the data loaded is as expected and correctly formatted.",
        "Related Functions": ["test_basic_fetching_movielens", "test_basic_fetching_stackexchange"],
        "Evaluation": 1
    },
    {
        "Requirement Title": "Output Dimensions and Values Verification",
        "Requirement": "The code should contain test cases to verify that the output dimensions and values from model predictions match expected outcomes.",
        "Observation": "The code tests the shape and type of the fetched data. However, it does not test any model predictions.",
        "Related Functions": ["test_basic_fetching_movielens", "test_basic_fetching_stackexchange"],
        "Evaluation": 0.5

In [143]:
# Show the second stored report (position 1 in insertion order).
print(list(reports.values())[1]['report'])

[
    {
        "Requirement Title": "Test Case for Data Splitting",
        "Requirement": "The code should contain a test case to ensure that the data splitting function correctly splits the dataset into disjoint training and test sets.",
        "Observation": "The function `test_random_train_test_split(test_percentage)` is a test case that checks the data splitting functionality. It asserts that the training and test sets are disjoint and that the size of the test set is as expected. Hence, the requirement is fulfilled.",
        "Related Functions": ["test_random_train_test_split(test_percentage)", "_assert_disjoint(x, y)"],
        "Evaluation": 1
    },
    {
        "Requirement Title": "Test Case for Model Loading and Initialization",
        "Requirement": "The code should contain a test case to ensure that models are loaded correctly without errors, checking for issues in model setup or initialization.",
        "Observation": "The provided code does not contain any test cas

In [84]:
# Continue the third file's stored conversation and ask the model to drop the
# code fence and intro sentence itself. No chain is passed, so get_ai_response
# builds a new one; the stored history object is extended in place.
report, history, _ = get_ai_response(
    "Remove the ``` and the sentence before it.",
    load_test_file(list(reports)[2]),
    list(reports.values())[2]['history'],
)

print(report)

Sure, here is the content:

"[{\"Requirement Title\": \"Test Case for Model Loading and Initialization\", \"Requirement\": \"Test case to ensure models are loaded correctly without errors, checking for issues in model setup or initialization.\", \"Observation\": \"The code does not have a specific test case to check the loading and initialization of the models.\", \"Related Functions\": [], \"Evaluation\": 0}, {\"Requirement Title\": \"Test Case for Output Dimensions and Values\", \"Requirement\": \"Test case to verify that the output dimensions and values from model predictions match expected outcomes.\", \"Observation\": \"The code has test cases to validate the output from the model match the expected outcomes in precision, recall, and AUC.\", \"Related Functions\": [\"test_precision_at_k\", \"test_recall_at_k\", \"test_auc_score\"], \"Evaluation\": 1}, {\"Requirement Title\": \"Test Case for Accuracy of Evaluation Metrics\", \"Requirement\": \"Test case to confirm the accuracy and 

In [127]:
# Parse the third stored report with extract_json, which takes the JSON from
# between the backtick delimiters.
extract_json(list(reports.values())[2]['report'])

[{'Requirement Title': 'Test Case for Model Loading and Initialization',
  'Requirement': 'Test case to ensure models are loaded correctly without errors, checking for issues in model setup or initialization.',
  'Observation': 'The code does not have a specific test case to check the loading and initialization of the models.',
  'Related Functions': [],
  'Evaluation': 0},
 {'Requirement Title': 'Test Case for Output Dimensions and Values',
  'Requirement': 'Test case to verify that the output dimensions and values from model predictions match expected outcomes.',
  'Observation': 'The code has test cases to validate the output from the model match the expected outcomes in precision, recall, and AUC.',
  'Related Functions': ['test_precision_at_k',
   'test_recall_at_k',
   'test_auc_score'],
  'Evaluation': 1},
 {'Requirement Title': 'Test Case for Accuracy of Evaluation Metrics',
  'Requirement': 'Test case to confirm the accuracy and correctness of evaluation metrics used within th

In [116]:
# Manual alternative to extract_json for the third report: skip the first 52
# characters, strip the backticks, and parse what is left.
# NOTE(review): 52 appears to be the length of the intro sentence plus the
# blank line that precede the ``` fence in this one reply - confirm; the offset
# will not carry over to other replies.
json.loads(list(reports.values())[2]['report'][52:].strip('`'))

[{'Requirement Title': 'Test Case for Model Loading and Initialization',
  'Requirement': 'Test case to ensure models are loaded correctly without errors, checking for issues in model setup or initialization.',
  'Observation': 'The code does not have a specific test case to check the loading and initialization of the models.',
  'Related Functions': [],
  'Evaluation': 0},
 {'Requirement Title': 'Test Case for Output Dimensions and Values',
  'Requirement': 'Test case to verify that the output dimensions and values from model predictions match expected outcomes.',
  'Observation': 'The code has test cases to validate the output from the model match the expected outcomes in precision, recall, and AUC.',
  'Related Functions': ['test_precision_at_k',
   'test_recall_at_k',
   'test_auc_score'],
  'Evaluation': 1},
 {'Requirement Title': 'Test Case for Accuracy of Evaluation Metrics',
  'Requirement': 'Test case to confirm the accuracy and correctness of evaluation metrics used within th