In [None]:
%pip install -q einops
%pip install -q chromadb
%pip install -q langchain
%pip install -q accelerate
# %pip install -q bitsandbytes
%pip install -q transformers

In [None]:
import os
import torch
import accelerate
import transformers
from time import time
from torch import cuda, bfloat16
from dotenv import load_dotenv, find_dotenv
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain.llms import HuggingFacePipeline
from langchain.llms import LlamaCpp
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Environment setup: load the .env file located by find_dotenv(), then read the
# Hugging Face API key from the process environment (KeyError if it is not set).

load_dotenv(dotenv_path=find_dotenv())
HF_KEY = os.environ['HUGGINGFACE_API_KEY']

In [None]:
# Pick the current CUDA device when one is available, otherwise fall back to CPU.
# The device string must be "cuda:<index>" with no space after the colon.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
device

In [None]:
import PyPDF2

# Reading PDF and extracting ToC
def extract_ToC(pdf_path, start_page, end_page):
  """Return the text lines found on a range of PDF pages.

  Args:
    pdf_path: Path of the PDF file to open.
    start_page: Index into the reader's page list of the first page to read.
    end_page: Index of the last page to read (inclusive).

  Returns:
    A flat list with one string per extracted text line, in page order.
  """
  toc_entries = []

  with open(pdf_path, 'rb') as file:
    pdf_reader = PyPDF2.PdfReader(file)

    for page_index in range(start_page, end_page + 1):
      text = pdf_reader.pages[page_index].extract_text()
      # Remove the "viii"/"vii" markers (presumably roman-numeral page labels).
      # "viii" must be removed first: stripping "vii" first leaves a stray "i".
      text = text.replace("viii", "").replace("vii", "")
      # Kept from the original clean-up: drops an "i" fused onto "17".
      text = text.replace("i17", "17")
      toc_entries.extend(text.splitlines())

  return toc_entries

# Pull the table-of-contents lines from page indices 7 through 8 of the book.
pdf_path = "Yoga Education for Children Vol 1.pdf"
toc = extract_ToC(pdf_path=pdf_path, start_page=7, end_page=8)
toc

In [None]:
# Loading documents
from langchain.document_loaders import PyPDFLoader

# Load the whole book through LangChain's PDF loader.
loader = PyPDFLoader(file_path="Yoga Education for Children Vol 1.pdf")
pages = loader.load()

In [None]:
# Topics to embed: ToC lines at indices 18..28 of `toc`.
# NOTE(review): these indices are tied to this PDF's ToC layout - confirm if the file changes.
topics = [toc[line_index] for line_index in range(18, 29)]

# Separating topics and their pages
topics_page = []
for entry in topics:
  # The last whitespace-separated token is taken as the page number (kept as a
  # string); everything before it is the topic title.
  parts = entry.split()
  topics_page.append((' '.join(parts[:-1]), parts[-1]))

# Display the parsed (topic, page) pairs built above rather than the raw lines again.
topics_page

### Using ChromaDB Server Hosted on Docker

In [None]:
# Connecting to Chroma DB server through HTTP client
# (the server is expected to be listening on localhost:8000)
import chromadb

chroma_client = chromadb.HttpClient(host="localhost", port=8000)
# Print the collections currently on the server
print(chroma_client.list_collections())

In [None]:
# Create the collection, or reuse it when it already exists, so that re-running
# this cell does not fail on an existing "test_collection".
collection = chroma_client.get_or_create_collection(name="test_collection")
print(chroma_client.list_collections())

In [None]:
# Fetch the collection by name and show a sample of what it holds
collection = chroma_client.get_collection(name="test_collection")
collection.peek()

## Inserting data into collection

In [109]:
# Embedding model
import chromadb.utils.embedding_functions as embedding_functions

# Chroma embedding function that uses the all-MiniLM-L6-v2 sentence-transformers
# model, authenticated with the Hugging Face API key read from the environment
huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    api_key=HF_KEY,
)

In [None]:
# Function to extract text from specified topics
def extract_text_by_topic(topic, start_page, end_page,
                          pdf_path="Yoga Education for Children Vol 1.pdf"):
  """Return the concatenated text of PDF pages start_page..end_page-1.

  Args:
    topic: Topic title. Not used for extraction; kept so existing callers
      can keep passing it.
    start_page: Index into the reader's page list of the first page to read.
    end_page: Index one past the last page to read (exclusive).
    pdf_path: PDF file to read. Defaults to the book used in this notebook.

  Returns:
    The extracted text of the selected pages joined together, or '' when
    the range is empty.
  """
  with open(pdf_path, 'rb') as file:
    pdf_reader = PyPDF2.PdfReader(file)
    # join() is consumed while the file is still open
    return ''.join(
        pdf_reader.pages[page_num].extract_text()
        for page_num in range(start_page, end_page)
    )

In [None]:
import uuid

# NOTE(review): +8 presumably converts the printed page numbers from the ToC
# into PDF page indices (front matter precedes page 1) - confirm for this file.
PAGE_OFFSET = 8

collection = chroma_client.get_collection(name="test_collection", embedding_function=huggingface_ef)

# Pair every topic with its successor: a topic's text runs from its own start
# page up to (not including) the next topic's start page. The last topic has no
# successor to bound it, so it is not inserted.
for (topic, start_page), (_, end_page) in zip(topics_page, topics_page[1:]):
    text = extract_text_by_topic(topic, int(start_page) + PAGE_OFFSET, int(end_page) + PAGE_OFFSET)
    doc_id = str(uuid.uuid1())  # named doc_id so the builtin id() is not shadowed
    collection.add(ids=[doc_id], documents=[text], metadatas=[{'topic': topic}])

In [None]:
# Show a single stored record to check the result of the insertion
collection.peek(limit=1)

## Querying Database

In [None]:
# Ask Chroma for the three stored documents closest to the query text,
# returning documents, metadata and distances alongside the ids.
query = "Yoga Techniques for Classroom"
result = collection.query(
    query_texts=[query],
    n_results=3,
    include=["documents", "metadatas", "distances"],
)
result

In [None]:
# Each result field holds one list per query text: walk the queries, then the hits of each.
for hit_ids, hit_docs, hit_dists in zip(result['ids'], result['documents'], result['distances']):
    for hit_id, doc, dist in zip(hit_ids, hit_docs, hit_dists):
        # NOTE(review): 1 - distance is only a similarity under cosine distance - confirm the collection's metric
        print(f"ID: {hit_id}, Doc: {doc}, Similarity: {1-dist}")
        print("\n")

## Testing RAG

In [103]:
from langchain.llms import TextGen
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_chroma import Chroma
from sentence_transformers import SentenceTransformer

In [138]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

def query_based_docs_extraction(query):
    """Print the documents of "test_collection" returned by a similarity search.

    On every call this builds a SentenceTransformerEmbeddings embedder for
    'all-MiniLM-L6-v2' and a Chroma wrapper around the notebook-level
    `chroma_client`, runs `similarity_search(query)`, and prints each hit
    followed by a dotted separator line. Returns None.
    """
    separator = "..................."
    vector_store = Chroma(
        client=chroma_client,
        collection_name="test_collection",
        embedding_function=SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2'),
    )

    for hit in vector_store.similarity_search(query):
        print(hit)
        print(separator)

# Retrieval-only check: print the stored passages closest to the question
query = "Provide steps for Surya Namaskar."
query_based_docs_extraction(query=query)

page_content='17620\nSurya Namaskara\n SALUTATIONS TO THE SUN\nPosition 1: Pranamasana (prayer pose)\nFace the sun if possible.\nStand erect with feet together, palms together resting on the centre of the chest, arms relaxed against the body.Breathe in and out with awareness until the breath is normal.The whole body should be completely relaxed, spinal column straight but not rigid.\nBenefits: This asana establishes a state of concentration, \ncalmness and awareness of the practice being performed.BSY ©\n177\nBSY © BSY ©\nPosition 2 Position 3\nPosition 2: Hasta Utthanasana (raised arms pose)\nInhale deeply while slowly raising the arms above the \nhead. Keep the arms separated, shoulder width apart.Tilt the pelvis while arching the back and bending the head back as far as is comfortable. The spinal column is arched slightly in the beginning. The amount of the arch can increase with practice.\nBenefits : This asana stretches all the abdominal organs fully, \nexercises the arms and shou

In [132]:
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceEndpoint

In [None]:
# Vector store over the existing "test_collection" on the Chroma server
embedding_func = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')
db = Chroma(
    client=chroma_client,
    embedding_function=embedding_func,
    collection_name="test_collection",
)

# LLM served through the Hugging Face endpoint, authenticated with HF_KEY
llm = HuggingFaceEndpoint(
    huggingfacehub_api_token=HF_KEY,
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
)

# Retrieval-augmented QA chain: the vector store's retriever feeds the LLM
qa = RetrievalQA.from_chain_type(llm=llm, retriever=db.as_retriever())

In [136]:
# Run the question through the RetrievalQA chain and display the returned dict
query = "Provide steps for Surya Namaskar."
result = qa.invoke(input={"query": query})
result

{'query': 'Provide steps for Surya Namaskar.',
 'result': ' Sure, here are the steps for Surya Namaskar:\n1. Pranamasana (prayer pose)\n2. Hasta Utthanasana (raised arms pose)\n3. Padahastasana (hand to foot pose)\n4. Ashwa Sanchalanasana (equestrian pose)\n5. Parvatasana (mountain pose)\n6. Ashtanga Namaskara (salute with 8 limbs)\n7. Bhujangasana (cobra pose)\n8. Parvatasana (mountain pose)\n9. Ashwa Sanchalanasana (equestrian pose)\n10. Padahastasana (hand to foot pose)\n11. Hasta Utthanasana (raised arms pose)\n12. Pranamasana (prayer pose)\nPositions 13–24 : Positions 1–12 constitute half a round. The second half round consists of the same twelve positions. The only difference is in position 16 where the left leg is taken back, and in position 21 where the right foot is brought forward between the hands. Start the second half round when the heartbeat and breath have returned to normal. When a number of full rounds have been completed, lie in shavasana to rest and relax the body an

## Deleting data from ChromaDB

In [None]:
# Deleting items in collection
# NOTE(review): recent chromadb releases reject delete() when no ids or filters
# are given - confirm against the installed version. Fetching the stored ids and
# deleting them explicitly works either way; an empty collection is skipped.
stored_ids = collection.get()["ids"]
if stored_ids:
    collection.delete(ids=stored_ids)

In [None]:
# Deleting the whole "test_collection" collection from the Chroma server
chroma_client.delete_collection(name='test_collection')