## Prepare environment

In [None]:
%pip install python-dotenv

In [None]:
import os
from pathlib import Path
# Working directory of the notebook, the directory one level above it, and
# the repository path that is later handed to the loader.
path_dir_script = Path.cwd()
path_dir_root = path_dir_script.parent
path_code_repo_1 = path_dir_root / 'data/tuist'

In [None]:
from dotenv import load_dotenv, find_dotenv
# Read environment variables from the .env file that sits in the root directory.
path_file_dotenv = path_dir_root / '.env'
load_dotenv(dotenv_path=path_file_dotenv)

## Load data

In [None]:
%pip install GitPython

In [None]:
from langchain.document_loaders import GitLoader # https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/git.html

In [None]:
# Only files whose names end in .swift, .h or .m pass the filter.
loader = GitLoader(
    repo_path=path_code_repo_1,
    branch="main",
    file_filter=lambda file_path: file_path.endswith((".swift", ".h", ".m")),
)

In [None]:
# Run the loader; the result is split into chunks and embedded further down.
data = loader.load()

In [None]:
# Display how many items the loader returned.
len(data)

## Split text

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_docs(documents, chunk_size=1000, chunk_overlap=20):
    """Split documents into chunks with RecursiveCharacterTextSplitter.

    Args:
        documents: Documents handed to the splitter's ``split_documents``.
        chunk_size: ``chunk_size`` value passed to the splitter.
        chunk_overlap: ``chunk_overlap`` value passed to the splitter.

    Returns:
        Whatever ``split_documents`` returns for ``documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

# Chunk the loaded data with the default sizes, then print the number of
# chunks and the content of the first one.
# NOTE(review): texts[0] fails if the split produced no chunks — confirm the
# loader always returns at least one document.
texts = split_docs(data)
print(len(texts))
print(texts[0].page_content)

## Init ChromaDB

In [None]:
import os
from langchain.embeddings import OpenAIEmbeddings

# Create the embeddings object with the API key taken from the environment.
# The os.environ[...] lookup raises KeyError when OPENAI_API_KEY is not set.
embeddings = OpenAIEmbeddings(
    openai_api_key=os.environ['OPENAI_API_KEY']
)

In [None]:
!pip install chromadb

In [None]:
import os

def check_folder_exists(folder_path):
    """Return True when ``folder_path`` exists and is a directory."""
    # isdir() is already False for a path that does not exist, so a single
    # call answers both questions.
    return os.path.isdir(folder_path)

In [None]:
from langchain.vectorstores import Chroma

# Set to True to embed the chunks again even if a persisted store exists.
force_reembed = False
persist_directory = "chroma_db/"

if check_folder_exists(persist_directory) and not force_reembed:
    # A persisted store is already on disk: open it instead of re-embedding.
    vectordb = Chroma(
        embedding_function=embeddings,
        persist_directory=persist_directory,
    )
else:
    # NOTE(review): when force_reembed is True and the directory already
    # exists, this presumably writes into the existing store — confirm
    # whether the directory should be cleared first.
    vectordb = Chroma.from_documents(
        documents=texts,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
    vectordb.persist()


## Create the Chain

In [None]:
from typing import List

from langchain.schema import BaseRetriever, Document
from langchain.vectorstores import Chroma

class HardcodedRetriever(BaseRetriever):
    """Retriever that hands back a fixed list of documents for every query."""

    def __init__(self, documents: List[Document]):
        """Store the documents to return on each retrieval call.

        Args:
            documents: Documents returned, unchanged, for any query.
        """
        self.documents = documents

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Return the stored documents; ``query`` is intentionally ignored."""
        return self.documents

    async def aget_relevant_documents(self, query: str) -> List[Document]:
        """Async variant of ``get_relevant_documents`` with the same result."""
        # Delegate so the sync and async paths cannot drift apart.
        return self.get_relevant_documents(query)


In [None]:
# Run one similarity search for a fixed query and keep the top results.
similarity_query = "Tuist plugins"
num_retrieved_files = 5

docs = vectordb.similarity_search(query=similarity_query, k=num_retrieved_files)

# Uncomment to inspect the retrieved chunks:
# for doc in docs:
#     print("DOC")
#     print(doc.page_content)

# The retriever always returns these documents, whatever question is asked.
hardcodedRetriever = HardcodedRetriever(documents=docs)

In [None]:
#from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Chat model used to answer questions over the retrieved documents.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.7, max_tokens=1000)

# You can test how an empty retriever makes our LLM ignorant about the context
# hardcodedRetriever = HardcodedRetriever([])

# Build the question-answering chain on top of the fixed-document retriever.
qa = RetrievalQA.from_chain_type(
    retriever=hardcodedRetriever,
    llm=llm,
    chain_type="stuff",
)

In [None]:
# Ask the QA chain a short definitional question.
llm_query = "What's localPlugin?"
qa.run(llm_query)

In [None]:
# Ask the QA chain for a list with examples.
llm_query = "List all tuist features that you can call from the terminal, with examples."
qa.run(llm_query)

In [None]:
# Ask the QA chain a refactoring question.
llm_query = "Can you refactor DependenciesController to remove Carthage?"
qa.run(llm_query)

In [None]:
# Same refactoring request, this time asking for code in markdown format.
llm_query = "Show code to refactor DependenciesController to remove Carthage. Output format: markdown"
qa.run(llm_query)