<a href="https://colab.research.google.com/github/ThetSweLynn/RAG-githubRepoReader/blob/main/RagAgent_with_githubRepoReader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install langchain langchain-community huggingface-hub openai google-search-results tiktoken wikipedia

In [None]:
import os
from getpass import getpass

# Keep a key that is already configured; otherwise prompt for it so the secret
# is never written into the notebook (assigning "" here would clobber an
# existing OPENAI_API_KEY).
if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
from langchain import OpenAI, Wikipedia
from langchain.agents import initialize_agent, Tool, AgentType
from langchain.agents.react.base import DocstoreExplorer
from langchain.utilities import SerpAPIWrapper

In [None]:
!pip -q install chromadb pypdf sentence_transformers

In [None]:
import os
import uuid
from langchain import PromptTemplate
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.document_loaders import DirectoryLoader, NotebookLoader

In [None]:
# Mount Google Drive at /content/drive; repo_path below points inside this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# To clone your GitHub repo into a new Google Drive folder, uncomment the lines below (once uncommented, `%%bash` must be the first line of the cell)

#%%bash
#cd /content/drive/MyDrive/
#mkdir -p MyToDos
#cd MyToDos
#git clone https://github.com/ThetSweLin/MyToDos.git # Change repository url here

In [None]:
# Root directory of the repository to index; it lives under the Drive mount
# point and is passed to load_and_split_files.
repo_path = "/content/drive/MyDrive/MyToDos/MyToDos" # Change repository path here

In [None]:
!pip install unstructured

In [None]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document # Import Document class

def load_and_split_files(path):
  """Load every supported text/source file under `path` and split it into chunks.

  Args:
    path: Root directory that is searched recursively, one glob per extension.

  Returns:
    A flat list of chunk documents. Each chunk keeps the metadata of the
    document it was split from, so downstream code can report where an
    answer came from.
  """
  extensions = ['txt', 'md', 'markdown', 'rst', 'py', 'js', 'java', 'c', 'cpp', 'cs', 'go', 'rb', 'php', 'scala', 'html', 'htm', 'xml', 'yaml', 'yml', 'ini', 'toml', 'cfg', 'conf', 'sh', 'bash', 'css', 'scss', 'sql', 'gitignore', 'dockerignore', 'editorconfig', 'dart'] # json and ipynb excluded

  # The splitter settings are the same for every extension, so build it once.
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=200)

  documents = []

  for ext in extensions:
    glob_pattern = f'**/*.{ext}'
    try:
      # Use the `path` argument (not a module-level global) so the function
      # works for any directory it is called with.
      loaded_documents = DirectoryLoader(path, glob=glob_pattern).load()
      # Split the loaded documents directly instead of re-wrapping their text
      # in fresh Document objects, which would discard their metadata.
      documents.extend(text_splitter.split_documents(loaded_documents))
    except Exception as e:
      # Best effort: report the failing pattern and carry on with the rest.
      print(f"Error loading files with pattern '{glob_pattern}': {e}")
      continue

  return documents

# Load and chunk the repository files; the function returns a single list of chunks
documents = load_and_split_files(repo_path)

In [None]:
# Show how many chunks were produced and peek at one of them.
print(len(documents))
if documents:
  # Small repositories can yield fewer than 11 chunks; clamp the index
  # instead of raising IndexError.
  print(documents[min(10, len(documents) - 1)])

In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document

# Directory handed to Chroma as its persist location.
persist_directory = 'db'

# Embedding model used to vectorise every chunk.
embedding = OpenAIEmbeddings()

# Build the vector store from the chunked repository documents.
vectordb = Chroma.from_documents(
    documents=documents,
    embedding=embedding,
    persist_directory=persist_directory,
)



In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Chat model shared by the QA chain and, later, the agent.
llm = ChatOpenAI(model_name='gpt-4')

# Retriever over the vector store, configured with k=3.
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

# return_source_documents=True so that process_llm_response can list the
# "source_documents" alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [None]:
## Cite sources

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    """Wrap `text` to `width` columns while keeping its existing line breaks.

    Each newline-separated line is wrapped on its own with `textwrap.fill`,
    and the wrapped pieces are joined back together with newlines.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

def process_llm_response(llm_response):
    """Print a QA answer followed by the source of each supporting document.

    Args:
        llm_response: Mapping with a 'result' string and a 'source_documents'
            iterable whose items expose a `metadata` dict.
    """
    print(wrap_text_preserve_newlines(llm_response['result']))
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        # Not every document carries a 'source' entry; print a placeholder
        # instead of failing with KeyError halfway through the listing.
        print(source.metadata.get('source', '<unknown source>'))

In [None]:
#Updating by adding serpAPI and wikipedia

from getpass import getpass

from langchain.utilities import SerpAPIWrapper

# Keep a key that is already configured; otherwise prompt for it so the secret
# is never written into the notebook (assigning "" here would clobber an
# existing SERPAPI_API_KEY).
if not os.environ.get("SERPAPI_API_KEY"):
  os.environ["SERPAPI_API_KEY"] = getpass("SerpAPI API key: ")

In [None]:
from langchain.agents import load_tools
from langchain.agents import initialize_agent
from langchain.tools import Tool

In [None]:
#to use qa_chain as a tool
def _answer_from_repo(input_text):
  """Run `qa_chain` on `input_text` and return only the answer text."""
  response = qa_chain({"query": input_text})
  return response['result']

# The helper has its own name so that `qa_chain_tool` only ever refers to the
# Tool object (previously the same name was bound to the function first and
# then rebound to the Tool).
qa_chain_tool = Tool(
    name="qa_chain",
    func=_answer_from_repo,
    description="A tool to answer questions about the content that contains in the github repository"
)

In [None]:
# Tools loaded for "serpapi" and "wikipedia" come first; the repository QA tool goes last.
tools = [*load_tools(["serpapi", "wikipedia"], llm=llm), qa_chain_tool]

In [None]:
# Show the name and description of the tool at index 2
# (presumably the appended qa_chain tool - confirm from the displayed name).
tools[2].name, tools[2].description

In [None]:
# Build the agent over all tools, selecting the agent type via the AgentType enum.
agent = initialize_agent(
    tools,
    llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)

In [None]:
# Display the prompt template string of the agent's LLM chain.
agent.agent.llm_chain.prompt.template

In [None]:
# Ask the agent a question. The question is passed as a plain string
# (not a {"query": ...} dict) and the return value is kept so it can be shown.
question = "What is the purpose of this repository?"
agent_answer = agent.run(question)

# process_llm_response() reads 'result' and 'source_documents' from a QA-chain
# response, so it is not used on the agent's answer; wrap and print it instead.
print(wrap_text_preserve_newlines(agent_answer))