## Prepare environment

In [None]:
%pip install python-dotenv

In [None]:
import os
from pathlib import Path
# Resolve the notebook's working directory, its parent directory, and the
# sample repository checkout located under that parent.
path_dir_script = Path.cwd()
path_dir_root = path_dir_script.parent
path_code_repo_1 = path_dir_root / 'data/ios-1000kB/open-in-place'

In [None]:
from dotenv import load_dotenv, find_dotenv
# Point python-dotenv at the `.env` file that sits next to the notebook's
# parent directory. The call stays last so the notebook displays its result.
path_file_dotenv = path_dir_root / '.env'
load_dotenv(path_file_dotenv)

## Load data

In [None]:
%pip install GitPython

In [None]:
from langchain.document_loaders import GitLoader # https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/git.html

In [None]:
# Loader over the local checkout on branch "main". The filter keeps only
# paths ending in .swift, .h or .m; `str.endswith` accepts a tuple, so a
# single call covers all three suffixes.
loader = GitLoader(
    repo_path=path_code_repo_1,
    branch="main",
    file_filter=lambda file_path: file_path.endswith((".swift", ".h", ".m")),
)

In [None]:
# Run the loader and keep its result; the splitting step further down consumes `data`.
data = loader.load()

In [None]:
# A notebook cell displays its last expression: this shows how many items were loaded.
len(data)

## Split text

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_docs(documents, chunk_size=1000, chunk_overlap=20):
    """Split `documents` into chunks with a `RecursiveCharacterTextSplitter`.

    Args:
        documents: Documents to split; passed straight to the splitter's
            `split_documents` method.
        chunk_size: Forwarded to the splitter's `chunk_size` option.
        chunk_overlap: Forwarded to the splitter's `chunk_overlap` option.

    Returns:
        Whatever `split_documents` returns for `documents`.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

# Chunk the loaded documents and report how many chunks were produced.
texts = split_docs(data)
print(len(texts))
# Preview the first chunk. Guard against an empty result (for example when no
# file matched the loader's filter) so the cell reports the problem instead of
# failing with an IndexError.
if texts:
    print(texts[0].page_content)
else:
    print("No chunks produced - check the repository path, branch and file filter.")

## Embedding

In [None]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

# Instruction-based embedding model. The query instruction is handed to the
# model together with each query text, so it has to be spelled correctly
# ("retrieval").
embeddings = HuggingFaceInstructEmbeddings(
    query_instruction="Represent the query for retrieval:"
)

In [None]:
#text = "This is a test document."
#query_result = embeddings.embed_query(text)
#dimension = len(query_result)
#print(dimension)

## Init ChromaDB

In [None]:
%pip install chromadb

In [None]:
from langchain.vectorstores import Chroma

# Build the Chroma vector store from the chunks, using `embeddings` to embed them.
vectordb = Chroma.from_documents(documents=texts, embedding=embeddings)

## Create the Chain

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import VectorDBQA

# `VectorDBQA` is deprecated in LangChain in favour of `RetrievalQA`, which is
# given a retriever rather than the vector store itself. Imported here so this
# cell brings the name into scope on its own.
from langchain.chains import RetrievalQA

# Chat model used to write the answers.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.7,
    max_tokens=1000,
)

# Question-answering chain over the Chroma store; `qa.run(query)` is used by
# the cells below.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectordb.as_retriever(),
)

In [None]:
# First question for the QA chain; `result` is saved and printed by the next cell.
query = "What happens in the SceneDelegate?"
result = qa.run(query)

In [None]:
from utils.save_as_markdown import save_as_markdown
# Hand the answer to the project's `save_as_markdown` helper (target folder:
# <path_dir_root>/output), then echo it in the notebook.
# NOTE(review): base_name is "" here but "answer" in the later save cells — confirm this is intentional.
save_as_markdown(result, base_folder=Path(path_dir_root, "output"), base_name="", extension="md")
print(result)

In [None]:
# Second question; overwrites `query` and `result` from the previous run.
query = "Show explanation and most important lines of code for the XCallbackOpener"
result = qa.run(query)

In [None]:
# Save the current answer via `save_as_markdown` and echo it in the notebook.
# NOTE(review): the last save cell uses the same base_name "answer" — verify the helper does not overwrite this file.
save_as_markdown(result, base_folder=Path(path_dir_root, "output"), base_name="answer", extension="md")
print(result)

In [None]:
# Third question: the prompt text itself asks for markdown output. Overwrites `query` and `result` again.
query = "Your task is to answer the following query, and give the useful query extracts for it. 10 ways the XCallbackOpener can be improved. Output format: markdown"
result = qa.run(query)

In [None]:
# Save the current answer via `save_as_markdown` and echo it in the notebook.
# NOTE(review): same base_name "answer" as the previous save cell — verify the helper does not overwrite the earlier file.
save_as_markdown(result, base_folder=Path(path_dir_root, "output"), base_name="answer", extension="md")
print(result)