In [2]:
# from git import Repo
from langchain.text_splitter import Language
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser

In [3]:
import dotenv

# Load key/value pairs from a local .env file into the process environment.
# Presumably this is where OPENAI_API_KEY comes from for the OpenAI clients
# created in later cells -- TODO confirm.
dotenv.load_dotenv()

True

In [4]:
import os

# Root of the repository checkout to index. Set INFERRED_REPO_PATH (in the
# environment or in the .env file loaded earlier) to point at your own
# checkout; the fallback is the original author's local path.
# "~" is expanded here so the resolved location is explicit in repo_path.
repo_path = os.path.expanduser(
    os.environ.get("INFERRED_REPO_PATH", "~/projects/personal/inferred")
)

# Collect every *.py file below <repo>/backend and parse it as Python.
# NOTE(review): parser_threshold=500 is presumably the minimum file size for
# the language-aware split into functions/classes -- confirm in LanguageParser docs.
loader = GenericLoader.from_filesystem(
    f"{repo_path}/backend",
    glob="**/*",
    suffixes=[".py"],
    parser=LanguageParser(language=Language.PYTHON, parser_threshold=500)
)

documents = loader.load()

# Fail fast with a readable message instead of letting an empty corpus
# flow into the splitter / vector store cells further down.
if not documents:
    raise FileNotFoundError(
        f"No .py files were loaded from {repo_path}/backend; "
        "check INFERRED_REPO_PATH or the default repo_path."
    )

len(documents)

29

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Python-aware splitter: chunk size 2000 with an overlap of 200 between
# neighbouring chunks, then break the loaded documents into chunks.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    Language.PYTHON, chunk_size=2000, chunk_overlap=200
)
texts = python_splitter.split_documents(documents)

# Number of chunks produced (shown as the cell output).
len(texts)

42

In [6]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

# Embed every chunk with OpenAI embeddings and index the vectors in Chroma.
# NOTE(review): disallowed_special=() presumably stops the tokenizer from
# rejecting special-token text that appears in source code -- confirm.
embeddings = OpenAIEmbeddings(disallowed_special=())
db = Chroma.from_documents(documents=texts, embedding=embeddings)

# Retriever returning 8 chunks per query using "mmr" search
# (also test "similarity").
retriever = db.as_retriever(search_type="mmr", search_kwargs=dict(k=8))

In [7]:
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain

# The same chat model is used for answering and for the memory's summaries.
llm = ChatOpenAI(model_name="gpt-4")

# Conversation memory exposed to the chain under the "chat_history" key,
# returned as message objects.
memory = ConversationSummaryMemory(
    llm=llm,
    memory_key="chat_history",
    return_messages=True,
)

# Retrieval-backed conversational chain over the indexed code chunks.
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [8]:
# question = "what are the endpoints available?"
# result = qa(question)
# result['answer']

In [9]:
# README-generation request, split across lines for readability; the pieces
# concatenate to exactly the same single string.
prompt = (
    "Write a fancy, funny, but not cringe readme for this project. "
    "explain steps to setup and run the project. "
    "explain that it is a project for my masters degree thesis. "
    "explain the stack. "
    "topic of the thesis is 'Dashboard for Assessing the Adequacy of "
    "Simulation Models in Digital Twin Applications'"
)
# Send the README prompt through the chain; the call returns a dict and the
# generated text lives under the 'answer' key (displayed as the cell output).
result = qa(prompt)
result['answer']

'# Project Inferred: A Masterpiece for Masters \n\nHello there, fellow tech aficionado!\n\nWelcome to the GitHub repository of *Project Inferred* — the pièce de résistance of my Master\'s degree thesis. This project is a perfect blend of technology, humor, and more importantly, top-notch coding practices, all served on a platter of Django and Python. So, grab a cup of your favorite beverage, and let\'s dive right in!\n\n## Introduction\n\nProject Inferred is a dashboard for assessing the adequacy of simulation models in digital twin applications. It\'s like the Sherlock Holmes of the digital universe, always on the lookout for discrepancies in digital twins. If you are wondering, "Digital what...?", let me explain. Digital twins are virtual replicas of physical entities that data scientists and IT pros can use to run simulations before actual devices are built and deployed. And our project is kind of a detective that ensures these twins are not going rogue. Cool, right?\n\n## Tech Stac

In [13]:
# Reset the chain's conversation memory so the next question starts without
# the earlier README exchange in chat_history.
qa.memory.clear()

In [14]:
# Follow-up question asked against the freshly cleared memory.
# NOTE(review): the loader cell only indexes .py files under backend/, which
# may be why the answer reports no setup steps in the context -- confirm.
qa("how to run the project?")

{'question': 'how to run the project?',
 'chat_history': [SystemMessage(content='', additional_kwargs={})],
 'answer': 'The context provided does not include information on the steps to run the project.'}