In [1]:
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

load_dotenv()

True

### basic chatbot

In [50]:
from langchain_openai import ChatOpenAI
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant. Answer all questions to the best of your ability."),
    MessagesPlaceholder(variable_name="messages")
])

chat = ChatOpenAI(model='gpt-4')

chain = prompt | chat

chain.invoke({
    "messages": [
        HumanMessage("Translate this sentence from English to French: I love programming."),
        AIMessage(content="J'adore la programmation."),
        HumanMessage(content="What did you just say?"),
    ],
})

AIMessage(content='I said "I love programming" in French, which is "J\'adore la programmation."', response_metadata={'token_usage': {'completion_tokens': 21, 'prompt_tokens': 61, 'total_tokens': 82}, 'model_name': 'gpt-4', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-53b73ad2-9b05-46a4-b643-5f6da841a6d8-0')

In [56]:
from langchain.memory import ChatMessageHistory

chat_history = ChatMessageHistory()

chat_history.add_user_message("hi!")
chat_history.add_ai_message("whats up?")
chat_history.add_user_message(
    "Translate this sentence from English to French: I love programming."
)

chat_history.add_ai_message(
    chain.invoke({"messages": chat_history.messages})
)

chat_history.add_user_message("What did you just say?")

chain.invoke({"messages": chat_history.messages})

AIMessage(content='I just translated "I love programming" to French, which is "J\'adore la programmation."', response_metadata={'token_usage': {'completion_tokens': 22, 'prompt_tokens': 75, 'total_tokens': 97}, 'model_name': 'gpt-4', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-e6cea2bd-cc9e-488d-80f9-b6d552472b8d-0')

In [61]:
for chunk in chain.stream({"messages": chat_history.messages}):
    print(chunk.content, end="", flush=True)

I translated "I love programming" into French, which is "J'adore la programmation."

### load documents

In [72]:
from langchain_community.document_loaders import DirectoryLoader, PythonLoader
loader = DirectoryLoader(
    '../../data/raw/openja/lightfm/tests', 
    glob="**/*.py", 
    show_progress=True, 
    #use_multithreading=True,
    loader_cls=PythonLoader
)

docs = loader.load()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 2237.86it/s]


In [75]:
!ls ../../data/raw/openja/lightfm/tests

__init__.py              test_data.py             test_fast_functions.py
test_api.py              test_datasets.py         test_movielens.py
test_cross_validation.py test_evaluation.py


In [83]:
print(docs[0].page_content)
docs[0].metadata

import numpy as np

import scipy.sparse as sp


from lightfm import _lightfm_fast


def test_in_positives():

    mat = sp.csr_matrix(np.array([[0, 1], [1, 0]])).astype(np.float32)

    assert not _lightfm_fast.__test_in_positives(0, 0, _lightfm_fast.CSRMatrix(mat))
    assert _lightfm_fast.__test_in_positives(0, 1, _lightfm_fast.CSRMatrix(mat))

    assert _lightfm_fast.__test_in_positives(1, 0, _lightfm_fast.CSRMatrix(mat))
    assert not _lightfm_fast.__test_in_positives(1, 1, _lightfm_fast.CSRMatrix(mat))



{'source': '../../data/raw/openja/lightfm/tests/test_fast_functions.py'}

In [85]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
all_splits = text_splitter.split_documents(docs)
len(all_splits)

119

In [97]:
#!pip install langchain_chroma

In [98]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())

In [105]:
# k is the number of chunks to retrieve
retriever = vectorstore.as_retriever(k=4)

docs = retriever.invoke("How many test functions are there?")

docs

[Document(page_content='@pytest.mark.skip(reason="Runs out of memory in CI")\ndef test_basic_fetching_stackexchange():\n\n    test_fractions = (0.2, 0.5, 0.6)\n\n    for test_fraction in test_fractions:\n        data = fetch_stackexchange(\n            "crossvalidated",\n            min_training_interactions=0,\n            test_set_fraction=test_fraction,\n        )\n\n        train = data["train"]\n        test = data["test"]\n\n        assert isinstance(train, sp.coo_matrix)\n        assert isinstance(test, sp.coo_matrix)', metadata={'source': '../../data/raw/openja/lightfm/tests/test_datasets.py'}),
 Document(page_content='assert train.shape == test.shape\n\n        frac = float(test.getnnz()) / (train.getnnz() + test.getnnz())\n        assert abs(frac - test_fraction) < 0.01\n\n    for dataset in ("crossvalidated", "stackoverflow"):', metadata={'source': '../../data/raw/openja/lightfm/tests/test_datasets.py'}),
 Document(page_content='assert test.nnz / float(data.nnz) == test_perc

### combine chatbot and document loader

In [113]:
from langchain_openai import ChatOpenAI
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_community.document_loaders import DirectoryLoader, PythonLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.chains.combine_documents import create_stuff_documents_chain

# load doc
loader = DirectoryLoader(
    '../../data/raw/openja/lightfm/tests', 
    glob="**/*.py", 
    show_progress=True, 
    #use_multithreading=True,
    loader_cls=PythonLoader
)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
all_splits = text_splitter.split_documents(docs)

vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever(k=4)
docs = retriever.invoke("How many test functions are there?")

# define prompt and chat
prompt = ChatPromptTemplate.from_messages([
    ("system", "Analyze the test functions from the codes below:\n\n{context}"),
    MessagesPlaceholder(variable_name="messages")
])
chat = ChatOpenAI(model='gpt-4')

# combine prompt, chat and doc
docs_chain = create_stuff_documents_chain(chat, prompt)

for chunk in docs_chain.stream({
    "context": docs,
    "messages": [
        HumanMessage(content="How many test functions are there? Can you list them all?")
    ],
}):
    print(chunk, end="", flush=True)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 1811.99it/s]


There are three test functions in the given code snippets. However, there are duplicate appearances of each function, so if we consider unique function names, there are only three. The test functions are:

1. test_bpr_precision()
2. test_bpr_precision_multithreaded()
3. test_warp_kos_precision()

In [40]:
llm = ChatOpenAI(model='gpt-4')
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a ChatGPT 4.0, a large language model trained by OpenAI. Answer as concisely as possible.\nKnowledge cutoff: 2023-12-31\nCurrent date: {CurrentDate}"),
    ("user", "{input}")
])

output_parser = StrOutputParser()

chain = prompt | llm | output_parser

output_str = chain.invoke({
    "input": "can you write me a sample python code, please?",
    "CurrentDate": "2024-05-01"
})

type(output_str)

str

In [46]:
from langchain_core.messages import HumanMessage
llm.invoke([HumanMessage('Hi')])

AIMessage(content='Hello! How can I assist you today?', response_metadata={'token_usage': {'completion_tokens': 9, 'prompt_tokens': 8, 'total_tokens': 17}, 'model_name': 'gpt-4', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-e1d64633-08b6-48d4-86aa-353603a55bcc-0')

In [48]:
llm.invoke('Hi')

AIMessage(content='Hello! How can I assist you today?', response_metadata={'token_usage': {'completion_tokens': 9, 'prompt_tokens': 8, 'total_tokens': 17}, 'model_name': 'gpt-4', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-e8064db8-a961-4bb8-b3db-f07b09dc7189-0')

In [9]:
# def main():
#     llm = ChatOpenAI()

#     prompt = ChatPromptTemplate.from_messages([
#         ("system", "You are a world class Machine Learning engineer."),
#         ("user", "{input}")
#     ])

#     output_parser = StrOutputParser()

#     chain = prompt | llm | output_parser

#     output_str = chain.invoke({"input": "Hello LLM World! I can't wait to use you to write a test suite for my ML pipeline!"})

#     print(output_str)


# if __name__ == "__main__":
#     main()
