In [2]:
from git import Repo
# For cloning github repo

from langchain.text_splitter import Language
# Required for context-aware splitting: the code is split into chunks along language constructs, so each function or class block is chunked on its own.
# E.g. def main() --> the code inside the function is split into chunks of code

from langchain.document_loaders.generic import GenericLoader
# GenericLoader is required in order to load the files of the cloned GitHub repository

from langchain.document_loaders.parsers import LanguageParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain

In [1]:
!pip install -q GitPython

### Cloning the repository

In [4]:
import os

# Local directory that holds the checkout of the repository to be indexed.
repo_path = "test_repo/"

# Clone only when the checkout is missing: a fresh "Restart & Run All" then
# gets the sources, and re-running the cell does not fail on an existing directory.
if not os.path.isdir(repo_path):
    Repo.clone_from("https://github.com/Sainivedhana/llama", to_path=repo_path)

In [5]:
# Build a loader over the cloned repository that walks every path under
# repo_path but keeps only ".py" files, parsing them as Python source.
# NOTE(review): parser_threshold=500 is presumably the minimum file size before
# syntax-aware segmentation is applied -- confirm against the LanguageParser docs.
loader = GenericLoader.from_filesystem(
    repo_path,
    glob="**/*",
    suffixes=[".py"],
    parser=LanguageParser(
        language=Language.PYTHON,
        parser_threshold=500,
    ),
)

In [6]:
# Read every file matched by the loader into a list of Document objects.
documents = loader.load()
# Show the loaded documents as the cell output.
documents

[Document(page_content='', metadata={'source': 'test_repo\\main.py', 'language': <Language.PYTHON: 'python'>}),
 Document(page_content='from langchain import PromptTemplate\nfrom langchain.chains import RetrievalQA\nfrom langchain.embeddings import HuggingFaceEmbeddings\nfrom langchain.vectorstores import FAISS\nfrom langchain.document_loaders import PyPDFLoader, DirectoryLoader\nfrom langchain.text_splitter import RecursiveCharacterTextSplitter\nfrom langchain.llms import CTransformers\nfrom src.helper import *\nfrom flask import Flask, render_template, jsonify, request\n\n#Load the PDF File\nloader=DirectoryLoader(\'data/\',\n                       glob="*.pdf",\n                       loader_cls=PyPDFLoader)\n\ndocuments=loader.load()\n\n#Split Text into Chunks\ntext_splitter=RecursiveCharacterTextSplitter(\n                                             chunk_size=500,\n                                             chunk_overlap=50)\ntext_chunks=text_splitter.split_documents(documents

### Context-aware chunking

In [7]:
# Splitter configured for Python sources: chunks of up to 2000 characters,
# with 200 characters shared between neighbouring chunks.
documents_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=2000,
    chunk_overlap=200,
)

# Break the loaded documents into chunks and report how many were produced.
texts = documents_splitter.split_documents(documents)
len(texts)

7

### LLM Wrapper

In [17]:
import os
from getpass import getpass

from langchain import HuggingFaceHub

# SECURITY: never hardcode an access token in a notebook -- it ends up in version
# control and in shared copies. Any token previously committed here must be
# revoked. Read it from the environment, and prompt interactively if it is unset.
HUGGING_FACE_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN") or getpass("Hugging Face API token: ")

# Hosted LLM used both for answering questions and for summarising the chat history.
llm = HuggingFaceHub(repo_id="microsoft/Phi-3-mini-4k-instruct",huggingfacehub_api_token=HUGGING_FACE_TOKEN)

In [9]:
from langchain.embeddings import SentenceTransformerEmbeddings
# Embedding model for the code chunks; this same object is passed to Chroma both
# when the vector store is built and when it is reopened from disk.
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")



In [10]:
# Build the Chroma vector store from the code chunks.
from langchain.vectorstores import Chroma

# Directory in which Chroma keeps its on-disk data.
persist_directory = './data'

# Embed every chunk in `texts` and index it in a store backed by persist_directory.
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory=persist_directory,
)

In [11]:
# Save the created vector store to disk

vectordb.persist()
# Drop the in-memory handle; the store is reopened from persist_directory in the next cell.
vectordb = None

In [12]:
# Reopen the vector store saved under persist_directory, using the same
# embedding function that was used to build it.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings,
)

In [18]:
# Conversation memory that uses the LLM to summarise the dialogue; the summary
# is exposed to the chain under the "chat_history" key as message objects.
memory = ConversationSummaryMemory(
    llm=llm,
    memory_key="chat_history",
    return_messages=True,
)

In [19]:
# Question-answering chain: retrieves 3 chunks from the vector store with MMR
# search and answers with the LLM, carrying conversation state in `memory`.
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=vectordb.as_retriever(
        search_type="mmr",
        search_kwargs={"k": 3},
    ),
    memory=memory,
)

In [20]:
# Question put to the retrieval chain.
question = "what is the value of B_INST?"

In [21]:
# Run the conversational retrieval chain on the question.
result = qa(question)
# Print the 'answer' entry of the chain's result.
print(result['answer'])

Number of requested results 20 is greater than number of elements in index 7, updating n_results = 7


Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

# This file contains the default and custom system prompt. This needs to be imported in main file to access the prompt

DEFAULT_SYSTEM_PROMPT="""\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. 
Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. 
Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of 
answering something not correct. If you don't know the answer to a question,
please don't share false information."""

CUSTOM_SYSTEM_PROMPT="""\
You are an advanced assistant that provides summarization given any book name"""

template="""Use the following pieces of information to answer the u