In [1]:
!pip install langchain
!pip install openai
!pip install chromadb
!pip install pypdf
!pip install gpt4all
!pip install langchain_community

Collecting langchain
  Downloading langchain-0.1.16-py3-none-any.whl (817 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/817.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/817.7 kB[0m [31m6.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-community<0.1,>=0.0.32 (from langchain)
  Downloading langchain_community-0.0.34-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m69.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core<0.2.0,>=0.1.42 (from langchain)
  Downl

### For the creation of RAG, the documents provided were our own resumes, which were chunked and added to a vector storage

In [1]:
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
def load_pdfs(path):
    document_loader = PyPDFDirectoryLoader(path)
    return document_loader.load()

In [2]:
documents = load_pdfs('/content/drive/MyDrive/Colab Notebooks/Resume for RAG')

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

def split_documents(documents: list[Document]):
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,
                                                 chunk_overlap=50,
                                                 length_function=len,
                                                 is_separator_regex= False)
  return text_splitter.split_documents(documents)


In [4]:
chunks = split_documents(documents)

### GPT4All Embeddings were used for word embedding as they are open source and do not require a seperate api call to access

In [5]:
from langchain_community.embeddings import GPT4AllEmbeddings
def get_embeddings_function():
  embeddings = GPT4AllEmbeddings()
  return embeddings

In [6]:
def calculate_chunk_ids(chunks):

    # This will create IDs like Page Source : Page Number : Chunk Index
    last_page_id = None
    current_chunk_index = 0

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"

        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0
        chunk_id = f"{current_page_id}:{current_chunk_index}"
        last_page_id = current_page_id

        # Add it to the page meta-data.
        chunk.metadata["id"] = chunk_id

    return chunks

### Creating a vector store to retain chunks of data that were created from the dataset provided

In [7]:
from langchain.vectorstores import Chroma
CHROMA_PATH = "/content/drive/MyDrive/Colab Notebooks/Chromadb"
def add_to_db(chunks: list[Document]):
  embeddings = get_embeddings_function()
  db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embeddings, collection_metadata={"hnsw:space": "cosine"})
  chunks_with_ids = calculate_chunk_ids(chunks)

  # Add or Update the documents.
  existing_items = db.get(include=[])  # IDs are always included by default
  existing_ids = set(existing_items["ids"])
  print(f"Number of existing documents in DB: {len(existing_ids)}")

  # Only add documents that don't exist in the DB.
  new_chunks = []
  for chunk in chunks_with_ids:
      if chunk.metadata["id"] not in existing_ids:
          new_chunks.append(chunk)

  if len(new_chunks):
      print(f"Adding new documents: {len(new_chunks)}")
      new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
      db.add_documents(new_chunks, ids=new_chunk_ids)
      db.persist()
  else:
      print("No new documents to add")
  db.add_documents(chunks)
  return db

In [8]:
import os
import shutil
CHROMA_PATH = "/content/drive/MyDrive/Colab Notebooks/Chromadb"
def clear_database():
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

In [9]:
clear_database()

In [10]:
db = add_to_db(chunks)

Number of existing documents in DB: 0
Adding new documents: 8


In [14]:
cosine_similarity = db.similarity_search_with_relevance_scores(query = 'What is the name of the student?', k =1)

##### Cosine Similarity based on just the vector datastore

In [15]:
#Source retrieved based on just the db
cosine_similarity[0][0]

Document(page_content='foundational education equipped them with a solid understanding of key concepts in \nDBMS, OOPS, and Business Communications, laying the  groundwork for their future \nendeavors in data science.  \n \nJoining Globe Life as a Data Science Intern, Omkar Patade  embraced the opportunity to \nwork alongside seasoned professionals, delving into the world of data pipelines and \npredictive modeling. Leveraging their expertise in Python and SQL, Omkar Patade', metadata={'id': '/content/drive/MyDrive/Colab Notebooks/Resume for RAG/Info for RAG.pdf:0:2', 'page': 0, 'source': '/content/drive/MyDrive/Colab Notebooks/Resume for RAG/Info for RAG.pdf'})

In [16]:
cosine_similarity[0][1]

0.3087102938043259

In [13]:
!pip install transformers



### Microsoft phi was taken as the model of choice as it's small size is suited when amount of data for context generation is low and it takes us less resources

In [17]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

hf = HuggingFacePipeline.from_model_id(
    model_id="microsoft/Phi-3-mini-4k-instruct",
    task="text-generation",
    pipeline_kwargs={"max_new_tokens": 256},
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


The repository for microsoft/Phi-3-mini-4k-instruct contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/microsoft/Phi-3-mini-4k-instruct.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
The repository for microsoft/Phi-3-mini-4k-instruct contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/microsoft/Phi-3-mini-4k-instruct.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [22]:
from langchain.prompts import ChatPromptTemplate
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}
---
Answer the question based on the above context: {question}
"""

def query_rag(query_text):
    # Prepare the DB.
    embedding_function = get_embeddings_function()
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Search the DB.
    results = db.similarity_search_with_score(query_text, k = 3)

    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    # print(prompt)

    chain = hf
    response_text = hf.invoke(prompt)

    sources = [doc.metadata.get("id", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    return response_text

In [29]:
answer = query_rag("Tell me the degree's that the student has")

In [30]:
print(answer)

Human: 
Answer the question based only on the following context:

Oklahoma State University, USA, where they achieved a remarkable GPA of 3.9/4.0. Their 
coursework in Data Warehousing, Statistics, and Predictive Analytics provided them with a 
strong theoretical background, which they have since applied in real -world settings.  
 
Prior to their master's degree, Omkar Patade  completed a Bachelor of Engineering in 
Electronics Engineering at Mumbai University, India, with a GPA of 3.36/4.0. This

---

Oklahoma State University, USA, where they achieved a remarkable GPA of 3.9/4.0. Their 
coursework in Data Warehousing, Statistics, and Predictive Analytics provided them with a 
strong theoretical background, which they have since applied in real -world settings.  
 
Prior to their master's degree, Omkar Patade  completed a Bachelor of Engineering in 
Electronics Engineering at Mumbai University, India, with a GPA of 3.36/4.0. This

---

ahead and confident in their ability to make a m