In [73]:
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
#from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import AIMessage, HumanMessage
from langchain_community.document_loaders import DirectoryLoader, PythonLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.memory import ChatMessageHistory

load_dotenv()

True

In [79]:
def load_test_file(path, chunk_size=1000, chunk_overlap=0):
    """Load one Python source file and split it into chunks.

    Args:
        path: Location of the Python file; handed to ``PythonLoader``.
        chunk_size: Forwarded as ``chunk_size`` to
            ``RecursiveCharacterTextSplitter``. Defaults to 1000.
        chunk_overlap: Forwarded as ``chunk_overlap`` to
            ``RecursiveCharacterTextSplitter``. Defaults to 0.

    Returns:
        The result of ``split_documents`` applied to the loaded file.
    """
    documents = PythonLoader(path).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

def get_ai_response(message, py_splits, history=None, chain=None, model='gpt-4'):
    """Ask one question about the supplied code and record the turn in a history.

    Args:
        message: The user's question for this turn.
        py_splits: Documents passed to the chain as ``context``.
        history: Chat history to continue. When ``None`` a new
            ``ChatMessageHistory`` is created.
        chain: Chain to invoke. When ``None`` a stuff-documents chain is built
            from the system prompt below and ``ChatOpenAI(model=model)``.
        model: Model name used only when ``chain`` is ``None``.

    Returns:
        A ``(resp, history, chain)`` tuple, so the caller can hand ``history``
        and ``chain`` back in on the next turn.

    Raises:
        Anything raised by ``chain.invoke``. In that case ``history`` is put
        back to the messages it held before this call.
    """
    if chain is None:
        prompt = ChatPromptTemplate.from_messages([
            ("system", "You are a coder analyzer. Please understand the code and answer the question as accurate as possible. Analyze the test functions from the codes below:\n\n{context}"),
            MessagesPlaceholder(variable_name="messages")
        ])
        chat = ChatOpenAI(model=model)
        chain = create_stuff_documents_chain(chat, prompt)

    if history is None:
        history = ChatMessageHistory()

    # Snapshot before touching the history: if the call below fails (or the
    # cell is interrupted) the unanswered question must not stay recorded,
    # otherwise a retry would send it twice.
    prior_messages = list(history.messages)
    history.add_user_message(message)
    try:
        resp = chain.invoke({
            "context": py_splits,
            "messages": history.messages,
        })
    except BaseException:
        # Restore via the history's own clear/add_message methods, then
        # re-raise unchanged.
        history.clear()
        for earlier in prior_messages:
            history.add_message(earlier)
        raise

    history.add_ai_message(resp)

    return resp, history, chain

In [80]:
# Chunk the evaluation test module, then open the conversation. No history or
# chain is passed, so get_ai_response creates both and returns them for reuse.
py_splits = load_test_file('../../data/raw/openja/lightfm/tests/test_evaluation.py')

resp, history, chain = get_ai_response(
    "How many functions are defined in the code? list them all",
    py_splits,
)

print(resp)

In the provided code, there are 11 functions defined. They are:

1. _generate_data
2. _precision_at_k
3. _recall_at_k
4. _auc
5. test_precision_at_k
6. test_precision_at_k_with_ties
7. test_recall_at_k
8. test_auc_score
9. test_intersections_check


In [82]:
# Follow-up turn: pass the existing history and chain back in so the model
# sees the earlier exchange.
resp, history, _ = get_ai_response(
    "What is each of the functions doing?",
    py_splits,
    history=history,
    chain=chain,
)

print(resp)

1. `_generate_data`: This function generates a dataset where every user has interactions in both the training and the testing set.

2. `_precision_at_k`: This function computes the precision at k for a model's predictions. Precision at k is a measure of how many of the top k recommendations are relevant.

3. `_recall_at_k`: This function computes the recall at k for a model's predictions. Recall at k is a measure of how many of the relevant items are included in the top k recommendations.

4. `_auc`: This function computes the Area Under the ROC Curve (AUC) for a model's predictions. The ROC curve is a plot of the true positive rate against the false positive rate, and the AUC measures the entire two-dimensional area underneath this curve.

5. `test_precision_at_k`: This function tests the `precision_at_k` function to ensure it is working correctly.

6. `test_precision_at_k_with_ties`: This function tests the `precision_at_k` function when there are ties in the model's predictions.

7.

In [83]:
# Third turn of the same conversation, again reusing history and chain.
resp, history, _ = get_ai_response(
    "Which of them are related to ML pipeline test cases?",
    py_splits,
    history=history,
    chain=chain,
)

print(resp)

The following functions are related to Machine Learning (ML) pipeline test cases:

1. `test_precision_at_k`: This function tests the precision at k for a model's predictions.

2. `test_precision_at_k_with_ties`: This function tests the precision at k when there are ties in the model's predictions.

3. `test_recall_at_k`: This function tests the recall at k for a model's predictions.

4. `test_auc_score`: This function tests the Area Under the ROC Curve (AUC) for a model's predictions.

5. `test_intersections_check`: This function tests the intersection check in the evaluation functions, which ensures that the training and testing sets do not have any interactions in common.


In [44]:
# Load every .py file found under the lightfm tests directory.
loader = DirectoryLoader(
    '../../data/raw/openja/lightfm/tests',
    glob="**/*.py",
    loader_cls=PythonLoader,
    show_progress=True,
)
docs = loader.load()

# Split the loaded documents with chunk_size=1000 and chunk_overlap=0.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
all_splits = text_splitter.split_documents(docs)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 2016.92it/s]


In [71]:
# Chunk test_evaluation.py on its own and show how many chunks result.
loader = PythonLoader("../../data/raw/openja/lightfm/tests/test_evaluation.py")
py = loader.load()
single_file_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
py_splits = single_file_splitter.split_documents(py)
len(py_splits)

11

In [43]:
# Show the metadata (source path) attached to each loaded document.
[loaded.metadata for loaded in docs]

[{'source': '../../data/raw/openja/lightfm/tests/test_fast_functions.py'},
 {'source': '../../data/raw/openja/lightfm/tests/test_movielens.py'},
 {'source': '../../data/raw/openja/lightfm/tests/test_datasets.py'},
 {'source': '../../data/raw/openja/lightfm/tests/test_cross_validation.py'},
 {'source': '../../data/raw/openja/lightfm/tests/__init__.py'},
 {'source': '../../data/raw/openja/lightfm/tests/test_evaluation_2.py'},
 {'source': '../../data/raw/openja/lightfm/tests/test_evaluation.py'},
 {'source': '../../data/raw/openja/lightfm/tests/test_data.py'},
 {'source': '../../data/raw/openja/lightfm/tests/test_api.py'}]

In [61]:
# Restrict the context to the chunks that came from test_evaluation.py.
# The subset is derived here from `all_splits` so this cell runs in
# top-to-bottom order instead of relying on a cell further down the notebook.
sub_splits = [
    split for split in all_splits
    if split.metadata['source'] == '../../data/raw/openja/lightfm/tests/test_evaluation.py'
]

# define prompt and chat
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a coder analyzer. Please understand the code and answer the question as accurate as possible. Analyze the test functions from the codes below:\n\n{context}"),
    MessagesPlaceholder(variable_name="messages")
])
chat = ChatOpenAI(model='gpt-4-turbo')

# combine prompt, chat and doc
docs_chain = create_stuff_documents_chain(chat, prompt)

# Print each streamed chunk as soon as it is yielded.
for chunk in docs_chain.stream({
    "context": sub_splits,
    "messages": [
        HumanMessage(content="Which many functions are defined in the code? list them all")
    ],
}):
    print(chunk, end="", flush=True)

The code defines the following functions:

1. `_generate_data(num_users, num_items, density=0.1, test_fraction=0.2)`
2. `_precision_at_k(model, ground_truth, k, train=None, user_features=None, item_features=None)`
3. `_recall_at_k(model, ground_truth, k, train=None, user_features=None, item_features=None)`
4. `_auc(model, ground_truth, train=None, user_features=None, item_features=None)`
5. `test_precision_at_k()`
6. `test_precision_at_k_with_ties()`
7. `test_recall_at_k()`
8. `test_auc_score()`
9. `test_intersections_check()`

In [63]:
# Ask the first question with a single blocking call and keep the answer in
# `tmp`; later cells replay it as an AIMessage.
tmp = docs_chain.invoke(
    dict(
        context=sub_splits,
        messages=[
            HumanMessage(content="Which many functions are defined in the code? list them all"),
        ],
    )
)
print(tmp)

The code defines the following functions:

1. `_generate_data(num_users, num_items, density=0.1, test_fraction=0.2)`
2. `_precision_at_k(model, ground_truth, k, train=None, user_features=None, item_features=None)`
3. `_recall_at_k(model, ground_truth, k, train=None, user_features=None, item_features=None)`
4. `_auc(model, ground_truth, train=None, user_features=None, item_features=None)`
5. `test_precision_at_k()`
6. `test_precision_at_k_with_ties()`
7. `test_recall_at_k()`
8. `test_auc_score()`
9. `test_intersections_check()`


In [64]:
# Second turn: resend the first question and its answer (`tmp`) by hand,
# followed by the new question.
tmp2 = docs_chain.invoke(
    dict(
        context=sub_splits,
        messages=[
            HumanMessage(content="Which many functions are defined in the code? list them all"),
            AIMessage(content=tmp),
            HumanMessage(content="What is each of the functions doing?"),
        ],
    )
)
print(tmp2)

Here's a breakdown of what each function in the code does:

1. **`_generate_data(num_users, num_items, density=0.1, test_fraction=0.2)`**
   - This function generates synthetic data for testing in the form of user-item interaction matrices. It creates two matrices: `train` and `test`. Each user is assigned a set of items they interact with, split between the training and testing sets based on the specified `density` of interactions and the `test_fraction`.

2. **`_precision_at_k(model, ground_truth, k, train=None, user_features=None, item_features=None)`**
   - Computes the precision at k for a given model and ground truth data. It optionally takes into account training interactions (to exclude them from the precision calculation), and user and item features. This function manually computes the precision metric, which is used to validate the built-in precision computation of the LightFM library.

3. **`_recall_at_k(model, ground_truth, k, train=None, user_features=None, item_features=N

In [65]:
# Third turn: replay both earlier exchanges (`tmp`, `tmp2`) before asking the
# new question.
tmp3 = docs_chain.invoke(
    dict(
        context=sub_splits,
        messages=[
            HumanMessage(content="Which many functions are defined in the code? list them all"),
            AIMessage(content=tmp),
            HumanMessage(content="What is each of the functions doing?"),
            AIMessage(content=tmp2),
            HumanMessage(content="Which of them are related to ML pipeline test cases?"),
        ],
    )
)
print(tmp3)

The functions related to ML (Machine Learning) pipeline test cases in the provided code are specifically designed to test the performance and behavior of the LightFM model under various conditions. These test case functions validate different aspects of the model's predictions, ensuring that the implementation of the evaluation metrics works as expected. Here are those functions:

1. **`test_precision_at_k()`**
   - Tests the precision at k metric implementation by comparing it against a custom calculation. It checks the metric under different configurations of k and examines how it handles training interactions.

2. **`test_precision_at_k_with_ties()`**
   - Focuses on testing the model's behavior in scenarios where there are ties in the predictions (i.e., all predicted scores are the same). This test is crucial for understanding how the model handles edge cases in ranking predictions.

3. **`test_recall_at_k()`**
   - Similar to the precision test, this function tests the recall at k

In [None]:
# Fresh history object for collecting chat turns. ChatMessageHistory is
# already imported in the notebook's first (imports) cell, so it is not
# re-imported here.
chat_history = ChatMessageHistory()

In [58]:
#[split.metadata['source'] for split in all_splits]

In [60]:
# Keep only the chunks whose source is test_evaluation.py, then display them.
evaluation_source = '../../data/raw/openja/lightfm/tests/test_evaluation.py'
sub_splits = [
    chunk
    for chunk in all_splits
    if chunk.metadata['source'] == evaluation_source
]
sub_splits

[Document(page_content='import numpy as np\n\nimport pytest\n\nimport scipy.sparse as sp\n\nfrom sklearn.metrics import roc_auc_score\n\nfrom lightfm.lightfm import LightFM\nfrom lightfm import evaluation\n\n\ndef _generate_data(num_users, num_items, density=0.1, test_fraction=0.2):\n    # Generate a dataset where every user has interactions\n    # in both the train and the test set.\n\n    train = sp.lil_matrix((num_users, num_items), dtype=np.float32)\n    test = sp.lil_matrix((num_users, num_items), dtype=np.float32)\n\n    for user_id in range(num_users):\n        positives = np.random.choice(\n            num_items, size=int(density * num_items), replace=False\n        )\n\n        for item_id in positives[: int(test_fraction * len(positives))]:\n            test[user_id, item_id] = 1.0\n\n        for item_id in positives[int(test_fraction * len(positives)) :]:\n            train[user_id, item_id] = 1.0\n\n    return train.tocoo(), test.tocoo()', metadata={'source': '../../data/ra