In [None]:
import os
from dotenv import load_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
import chromadb
# Load variables from a .env file into the process environment before any
# client object is constructed.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()

In [None]:
# User configuration: fill these in before running the rest of the notebook.

# Root directory of the project's source files; it is walked recursively
# (os.walk) when the files are collected.
PROJECT_SOURCE_FILES_PATH = ""
# Root directory of the project's include/header files; walked recursively
# in the same way and scanned before the source directory.
PROJECT_INCLUDE_FILES_PATH = "" # for cpp project
# Free-text description of the project. It is placed, followed by ".\n", at
# the very start of the text that is split and embedded.
# NOTE(review): the name is spelled "PROPMT"; other cells reference it as-is,
# so any rename has to be applied everywhere at once.
PROJECT_DESCRIPTION_PROPMT = ""

In [None]:
# Directories to scan, in order: include files first, then source files.
project_source = [PROJECT_INCLUDE_FILES_PATH, PROJECT_SOURCE_FILES_PATH]

# Language model used by the QA chain.
# Alternative chat wrapper kept for reference (ChatOpenAI is not imported):
# llm = ChatOpenAI(model_name='gpt-4')
# NOTE(review): max_tokens=-1 presumably means "no explicit completion cap"
# in LangChain's OpenAI wrapper — confirm for the installed version.
llm = OpenAI(
    model_name="gpt-4",
    max_tokens=-1,
)

# Directory on disk where the Chroma vector store is kept.
persist_directory = "./storage"
chroma_client = chromadb.PersistentClient(path=persist_directory)

In [None]:
def iterate_over_files(project_source):
    """Lazily yield the path of every file below the given directories.

    Args:
        project_source: Iterable of directory paths. Each one is traversed
            recursively with ``os.walk``, in the order given.

    Yields:
        str: ``os.path.join(directory, filename)`` for each file encountered,
        in the order ``os.walk`` reports them.
    """
    for base_dir in project_source:
        for current_dir, _subdirs, filenames in os.walk(base_dir):
            yield from (
                os.path.join(current_dir, filename) for filename in filenames
            )

In [None]:
# Build two text accumulators in a single pass over the project files:
#   concatenated_contents: the project description followed by every file,
#                          each introduced by a "//<path>" line
#   all_files_content:     the text of the ".cpp" files only, each followed
#                          by a newline
# Note that no newline is inserted after a file's text in
# concatenated_contents, so the next "//<path>" marker directly follows
# whatever the previous file ended with.
all_files_content = ""
concatenated_contents = PROJECT_DESCRIPTION_PROPMT + ".\n"
for file in iterate_over_files(project_source):
    print(f"Opening file: {file}")
    with open(file, mode="r", encoding="iso-8859-1") as source_file:
        concatenated_contents += f"//{file}\n"
        file_text = source_file.read()
    concatenated_contents += file_text
    if file.endswith(".cpp"):
        all_files_content += file_text + "\n"

In [None]:
# Split the combined project text into chunks (chunk_size=1024, no overlap),
# embed the chunks with OpenAI embeddings and store them in a Chroma
# collection kept under `persist_directory`.
text_splitter = CharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=0,
)
docs = text_splitter.split_text(concatenated_contents)

embeddings = OpenAIEmbeddings()
vectordb = Chroma.from_texts(
    texts=docs,
    embedding=embeddings,
    persist_directory=persist_directory,
)
vectordb.persist()

In [None]:
# Expose the vector store as a retriever and plug it, together with `llm`,
# into a RetrievalQA chain of type "stuff".
retriever = vectordb.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

In [None]:
# For every project file, ask the retrieval-QA chain to add documentation to
# the code and save the answer in the current working directory.
# Processing stops at the first exception; the error is printed, not raised.
try:
    for file in iterate_over_files(project_source):
        print("Opening file: " + file)
        # Read the whole file first so the handle is not held open while
        # waiting for the model.
        with open(file, "r", encoding="iso-8859-1") as f:
            source_code = f.read()

        query = "Provided the following code: " + source_code
        query += "\nPlease add to the code the necessary documentation to make code more clear, only where things are not clear from name or existing comments."

        # Keep the original extension: with the bare stem, "foo.h" and
        # "foo.cpp" would both be written to "foo" and the later one would
        # silently overwrite the earlier one.
        # Files sharing a base name in different directories still collide.
        name = os.path.basename(file)

        # Output goes to the working directory; never clobber the file that
        # was just read (possible when the project lives in that directory).
        if os.path.realpath(name) == os.path.realpath(file):
            print("Skipping file (output would overwrite the source): " + file)
            continue

        llm_response = qa(query)

        # Context manager closes the output even if the write fails; UTF-8 is
        # explicit because the model may return characters that the platform
        # default encoding cannot represent.
        with open(name, "w", encoding="utf-8") as output_file:
            output_file.write(llm_response["result"])
except Exception as err:
    print('Exception occurred. Please try again', str(err))
        