In [1]:
# from git import Repo
from langchain.text_splitter import Language
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser

In [2]:
import dotenv

# Load variables from a .env file into the process environment before any
# client objects are created.
# NOTE(review): presumably this is what supplies the OpenAI credentials used by
# the embedding and chat cells further down - confirm the expected .env keys.
dotenv.load_dotenv()

True

In [3]:
# Root of the repository to index; only its backend/ subtree is loaded below.
repo_path = "~/projects/personal/inferred"

# Match everything under backend/ ("**/*"), keep only files ending in .py, and
# parse each one with the Python LanguageParser (parser_threshold=500).
loader = GenericLoader.from_filesystem(
    f"{repo_path}/backend",
    glob="**/*",
    suffixes=[".py"],
    parser=LanguageParser(language=Language.PYTHON, parser_threshold=500),
)

documents = loader.load()

# Fail fast: if nothing was loaded (e.g. repo_path is wrong on this machine),
# the empty list would otherwise flow silently into the splitting and
# embedding cells that consume `documents`.
assert documents, (
    f"No .py files were loaded from {repo_path}/backend - check repo_path"
)

# Displayed as the cell output: number of parsed documents.
len(documents)

29

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters, named so they are easy to find and tune.
CHUNK_SIZE = 2000
CHUNK_OVERLAP = 200

# Splitter configured for Python sources via from_language.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    Language.PYTHON,
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

# Break the loaded documents into chunks for embedding.
texts = python_splitter.split_documents(documents)

# Displayed as the cell output: number of chunks produced.
len(texts)

42

In [9]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Embed every chunk and index the vectors in a Chroma store.
embeddings = OpenAIEmbeddings(disallowed_special=())
db = Chroma.from_documents(documents=texts, embedding=embeddings)

# Retriever over the store: MMR search returning k=8 chunks per query.
# Also test "similarity" as the search_type.
retriever = db.as_retriever(
    search_kwargs={"k": 8},
    search_type="mmr",
)

In [12]:
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationSummaryMemory

# One chat model is shared by the memory and by the QA chain itself.
llm = ChatOpenAI(model_name="gpt-4")

# Conversation memory, stored under the "chat_history" key and returned as
# message objects (return_messages=True).
memory = ConversationSummaryMemory(
    llm=llm,
    memory_key="chat_history",
    return_messages=True,
)

# Conversational retrieval chain wired to the retriever and memory above.
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [13]:
# Ask the chain a question about the indexed code base.
question = "what are the endpoints available?"
result = qa(question)

# Displayed as the cell output: just the answer text from the chain's result.
result["answer"]

'The available endpoints are:\n\n1. Base endpoint for SensorPredictionsViewSet (basename="sensor_predictions").\n2. "/simulation_models" for SimulationModelViewSet.\n3. "/dimensions" for DimensionViewSet.\n4. "/sensor_reads" for SensorReadViewSet (basename="sensor_reads").\n\nThese are all registered to the DefaultRouter in Django.'