In [18]:
from git import Repo
from langchain.text_splitter import Language
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
# from langchain.memory import ConversationSummaryMemory
# from langchain.chains import ConversationalRetrievalChain
import os
from dotenv import load_dotenv
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain.schema import AIMessage, HumanMessage

## get keys

In [6]:
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Fail early with an explicit message. The previous
# `os.environ[k] = os.environ.get(k)` round-trip was a no-op when the key was
# present and raised an opaque TypeError (None is not a str) when it was absent.
missing_keys = [
    key_name
    for key_name in ("OPENAI_API_KEY", "GOOGLE_API_KEY")
    if key_name not in os.environ
]
if missing_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(missing_keys)
        + ". Define them in the shell or in a .env file."
    )

## Clone a repository

In [2]:
repo_path = "test_repo/"

# git refuses to clone into a non-empty directory, so re-running this cell
# after the first successful clone used to raise. Reuse the existing checkout
# when one is already present; otherwise clone it.
if os.path.isdir(os.path.join(repo_path, ".git")):
    repo = Repo(repo_path)
else:
    repo = Repo.clone_from(
        "https://github.com/Sumegh20/myIPYNBrenderer.git", to_path=repo_path
    )

# Last expression of the cell: display the Repo object, as before.
repo

<git.repo.base.Repo '/home/sumgh/Data Science/projects/code-analyzer/research/test_repo/.git'>

## Load data 

In [3]:
# Only files ending in ".py" under the package directory are loaded; they are
# parsed with the Python LanguageParser (parser_threshold=500).
python_parser = LanguageParser(language=Language.PYTHON, parser_threshold=500)
loader = GenericLoader.from_filesystem(
    path="test_repo/src/myIPYNBrenderer",
    glob="**/*",
    suffixes=[".py"],
    parser=python_parser,
)

documents = loader.load()

## chunking

In [4]:
# Python-aware recursive splitter: chunk_size=1000 with chunk_overlap=200.
documents_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=1000,
    chunk_overlap=200,
)

# NOTE: despite its name, `text_splitter` holds the resulting chunks (the
# output of split_documents), not a splitter object.
text_splitter = documents_splitter.split_documents(documents)

## Embedding

In [7]:
# OpenAI embedding model; disallowed_special=() is passed explicitly
# (presumably so special-token-like text in source code is not rejected — confirm).
embeddings = OpenAIEmbeddings(
    disallowed_special=(),
)

## Create VectorDB

In [8]:
# Directory the FAISS index is written to.
db_path = "faiss_index"

# Embed the chunks into a FAISS index, then persist it under db_path.
vectordb = FAISS.from_documents(
    text_splitter,
    embeddings,
)
vectordb.save_local(db_path)

In [9]:
# Reload the persisted index from disk.
# SECURITY: allow_dangerous_deserialization=True opts in to pickle-based
# loading; only load index directories that this notebook created itself.
kn_base = FAISS.load_local(
    "faiss_index",
    embeddings,
    allow_dangerous_deserialization=True,
)

In [10]:
# Retriever over the loaded index using the "mmr" search type with k=3.
retriever = kn_base.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 3},
)

## Prompt

In [14]:
# Prompt text with three placeholders that the chain must supply:
# {question}, {context} and {chat_history}.
template = """You are an assistant for question-answering tasks while prioritizing a seamless user experience.
            Use the following pieces of retrieved context to answer the question.
            If you don't know the answer, just say that you don't know.
            Use five sentences maximum and keep the answer concise.
            You should be able to remember and reference the last three conversations between you and the user.
            Maintain a friendly, positive, and professional tone throughout interactions.
            Question: {question}
            Context: {context}
            Chat history: {chat_history}
            Answer:
        """

# Wrap the raw text in a chat prompt template for use in the chain.
prompt = ChatPromptTemplate.from_template(template)


## LLM

In [11]:
# Chat model used to generate answers; temperature=0 is set explicitly.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
)

## RAG chain

In [15]:
def input_handeler(input: dict):
    """Return the value stored under the 'question' key of the chain input.

    Args:
        input: Mapping passed to the chain; must contain a 'question' key.

    Returns:
        Whatever is stored under 'question'.

    Raises:
        KeyError: If 'question' is absent.
    """
    question_text = input['question']
    return question_text

In [16]:
# The chain is invoked with a dict carrying "question" and "chat_history".
# `assign` keeps the incoming keys and adds "context", produced by pulling the
# question out of the dict and handing it to the retriever.
context_lookup = input_handeler | retriever

rag_chain = (
    RunnablePassthrough.assign(context=context_lookup)
    | prompt
    | llm
    | StrOutputParser()
)

In [17]:
# Conversation log passed to the chain on every call; starts out empty.
# Re-running this cell discards the accumulated history.
chat_history = list()

In [24]:
question = "What was my second previous question?"

# The chain needs both the new question and the history collected so far.
chain_input = {"question": question, "chat_history": chat_history}
responce = rag_chain.invoke(chain_input)

print(responce)

Your second previous question was about explaining the render_site function.


In [22]:
# Append the latest question/answer pair to the running history.
# Re-running this cell without asking a new question appends a duplicate pair.
user_turn = HumanMessage(role="user", content=question)
assistant_turn = AIMessage(role="assistant", content=responce)
chat_history.extend([user_turn, assistant_turn])

## Final Pipeline

In [None]:
# Clone a GitHub repository
def repo_ingestion(repo_url, repo_path="repo/"):
    """Clone ``repo_url`` into ``repo_path`` and return the local path.

    The function is safe to call repeatedly: when ``repo_path`` already holds a
    clone of the same URL, that clone is reused instead of cloning again
    (cloning into a non-empty directory fails).

    Args:
        repo_url: URL of the git repository to clone.
        repo_path: Local directory to clone into. Defaults to "repo/".

    Returns:
        ``repo_path``, so the result can be passed straight to ``load_repo``.

    Raises:
        ValueError: If ``repo_path`` already contains a clone of a different
            repository.
        git.exc.InvalidGitRepositoryError: If ``repo_path`` is non-empty but is
            not a git repository.
    """
    os.makedirs(repo_path, exist_ok=True)

    if os.listdir(repo_path):
        # Non-empty target: only accept it if it is already a clone of repo_url.
        existing_repo = Repo(repo_path)
        known_urls = [
            url for remote in existing_repo.remotes for url in remote.urls
        ]
        if repo_url not in known_urls:
            raise ValueError(
                f"{repo_path!r} already contains a different repository "
                f"(remotes: {known_urls}); remove it or choose another repo_path."
            )
        return repo_path

    Repo.clone_from(repo_url, to_path=repo_path)
    return repo_path


#Loading repositories as documents
def load_repo(repo_path):
    """Load the ".py" files found under ``repo_path`` as documents.

    Args:
        repo_path: Directory to search (glob "**/*", suffix ".py").

    Returns:
        The documents produced by the loader.
    """
    python_parser = LanguageParser(language=Language.PYTHON, parser_threshold=500)
    loader = GenericLoader.from_filesystem(
        repo_path,
        glob="**/*",
        suffixes=[".py"],
        parser=python_parser,
    )
    return loader.load()


#Creating text chunks 
def text_splitter(documents):
    """Split ``documents`` into chunks with a Python-aware splitter.

    Uses chunk_size=1000 and chunk_overlap=200.

    Args:
        documents: Documents to split.

    Returns:
        The chunks returned by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter.from_language(
        language=Language.PYTHON,
        chunk_size=1000,
        chunk_overlap=200,
    )
    return splitter.split_documents(documents)


#loading embeddings model
def load_embedding():
    """Create and return an ``OpenAIEmbeddings`` instance.

    The instance is built with ``disallowed_special=()``.
    """
    return OpenAIEmbeddings(disallowed_special=())


# Creating the knowledge base
def create_knowledgebase(texts, db_path="faiss_index"):
    """Build a FAISS index from ``texts`` and save it under ``db_path``.

    Args:
        texts: Document chunks to embed and index.
        db_path: Directory the index is written to. Defaults to "faiss_index".

    Returns:
        None. The index is only persisted to disk.
    """
    index = FAISS.from_documents(texts, load_embedding())
    index.save_local(db_path)


#loading knowledgebase
def get_knowledge_base(db_path, embedding):
    """Load a previously saved FAISS index from ``db_path``.

    Args:
        db_path: Directory containing the saved index.
        embedding: Embedding object passed to ``FAISS.load_local``.

    Returns:
        The loaded FAISS vector store.
    """
    # SECURITY: allow_dangerous_deserialization=True opts in to pickle-based
    # loading; only point db_path at index directories you created yourself.
    return FAISS.load_local(
        db_path,
        embedding,
        allow_dangerous_deserialization=True,
    )


#Create Prompt

#Create RAG chain

#user_input