In [None]:
%pip install -q einops
%pip install -q chromadb
%pip install -q langchain
%pip install -q accelerate
# %pip install -q bitsandbytes
%pip install -q transformers

In [1]:
import os
import torch
import accelerate
import transformers
from time import time
from torch import cuda, bfloat16
from dotenv import load_dotenv, find_dotenv
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain.llms import HuggingFacePipeline
from langchain.llms import LlamaCpp
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Setting up environment variables
# Load the nearest .env file into the process environment, then read the
# Hugging Face API token (a KeyError is raised when the variable is absent).
load_dotenv(dotenv_path=find_dotenv())
HF_KEY = os.environ["HUGGINGFACE_API_KEY"]

In [3]:
# Pick the compute device: the current CUDA device when a GPU is visible,
# otherwise the CPU. The string must read "cuda:<index>" with no space after
# the colon, otherwise torch does not accept it as a device.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
device

'cpu'

In [4]:
import PyPDF2

# Reading PDF and extracting ToC
def extract_ToC(pdf_path, start_page, end_page):
  """Collect the table-of-contents lines from a range of PDF pages.

  Args:
    pdf_path: Path of the PDF file to read.
    start_page: Zero-based index of the first page to read.
    end_page: Zero-based index of the last page to read (inclusive).

  Returns:
    A list holding every text line of the selected pages, in page order.
  """
  toc_entries = []

  with open(pdf_path, 'rb') as file:
    pdf_reader = PyPDF2.PdfReader(file)

    for page_index in range(start_page, end_page + 1):
      text = pdf_reader.pages[page_index].extract_text()
      # Drop the roman-numeral page labels. "viii" has to be removed before
      # "vii": removing "vii" first turns "viii" into a stray "i" that the
      # second replace can no longer match.
      text = text.replace("viii", "").replace("vii", "")
      # Earlier workaround for that stray "i" in front of chapter 17; kept so
      # existing results stay the same.
      text = text.replace("i17", "17")
      toc_entries.extend(text.splitlines())

  return toc_entries

# The contents listing is read from PDF page indices 7 and 8 (both inclusive).
pdf_path = "Yoga Education for Children Vol 1.pdf"
toc = extract_ToC(pdf_path, start_page=7, end_page=8)
toc

['Contents',
 'Introduction  1',
 'Yoga and Education  ',
 ' 1. The Need for a Y oga-Based Education System  13',
 ' 2. Yoga and Children’s Problems  22',
 ' 3. Yoga with Pre-School Children  25',
 ' 4. Yoga Lessons Begin at Age Eight  31',
 ' 5. Student Unr est and Its Remedy  34',
 ' 6. Yoga and the Youth Problem  39',
 ' 7. Better Ways of Educatio n 45',
 ' 8. Yoga at School  50',
 ' 9. Yoga and Education  57',
 '10. Questions and Answers  65',
 'Yoga as Therapy  ',
 '11. Yoga for Emotional Disturbances  77',
 '12. Yoga for the Disabled  83',
 '13. Yoga Benefits Juvenile Diabetes  87',
 'Practices  ',
 '14. Yoga Techniques for Pre-School Children  93',
 '15. Yoga Techniques for 7–14 Y ear-Olds  101',
 '16. Yoga Techniques for the Classroom  110',
 '17. Introduction to Asana  133',
 '18. Pawanmuktasana Series  139',
 'Pawanmuktasana 1: Anti-Rheumatic Asanas  141',
 'Pawanmuktasana 2: Anti-Gastric Asanas  156',
 'Pawanmuktasana 3: Energizing Asanas  165',
 '19.  Eye Exercises  171',
 

In [5]:
# Loading documents
from langchain.document_loaders import PyPDFLoader

# Load the book through LangChain's PDF loader; `pages` holds whatever
# `loader.load()` returns for the file.
loader = PyPDFLoader(file_path="Yoga Education for Children Vol 1.pdf")
pages = loader.load()

In [12]:
# Topics to perform embedding: the ToC entries at indices 18 through 28
topics = [toc[idx] for idx in range(18, 29)]

# Separating topics and their pages
topics_page = []
for entry in topics:
  # The last whitespace-separated token is the printed page number (kept as a
  # string); everything before it, re-joined with single spaces, is the title.
  words = entry.split()
  title = ' '.join(words[:-1])
  topics_page.append((title, words[-1]))
topics

['14. Yoga Techniques for Pre-School Children  93',
 '15. Yoga Techniques for 7–14 Y ear-Olds  101',
 '16. Yoga Techniques for the Classroom  110',
 '17. Introduction to Asana  133',
 '18. Pawanmuktasana Series  139',
 'Pawanmuktasana 1: Anti-Rheumatic Asanas  141',
 'Pawanmuktasana 2: Anti-Gastric Asanas  156',
 'Pawanmuktasana 3: Energizing Asanas  165',
 '19.  Eye Exercises  171',
 '20. Surya Namaskara: Salutations to the Sun  176',
 '21. Chandra Namaskara: Salutations to the Moon  182']

### Using ChromaDB Server Hosted on Docker

In [9]:
# Connecting to Chroma DB server through HTTP client
import chromadb

# HTTP client for the Chroma server at localhost:8000 (per the section heading,
# the server is hosted in Docker).
chroma_client = chromadb.HttpClient(host="localhost", port=8000)
# Print the collections that currently exist on that server.
print(chroma_client.list_collections())

[]


In [52]:
# Creating a new collection
# get_or_create_collection keeps this cell re-runnable: the Chroma server keeps
# its data between kernel restarts, and create_collection raises when
# "test_collection" is already there.
collection = chroma_client.get_or_create_collection(name="test_collection")
print(chroma_client.list_collections())

[Collection(name=test_collection)]


In [11]:
# Viewing collection data
# Fetch the collection by name and display a sample of its stored records.
collection = chroma_client.get_collection(name="test_collection")
collection.peek()

{'ids': [],
 'embeddings': [],
 'metadatas': [],
 'documents': [],
 'data': None,
 'uris': None}

## Inserting data into collection

In [13]:
# Embedding model
import chromadb.utils.embedding_functions as embedding_functions

# Embedding function
# Built with Chroma's HuggingFaceEmbeddingFunction helper for the
# sentence-transformers/all-MiniLM-L6-v2 model, authenticated with HF_KEY
# (the HUGGINGFACE_API_KEY environment variable read earlier in the notebook).
huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
    api_key=HF_KEY,
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

In [53]:
# Function to extract text from specified topics
def extract_text_by_topic(topic, start_page, end_page,
                          pdf_path="Yoga Education for Children Vol 1.pdf"):
  """Return the concatenated text of a page range of the PDF.

  Args:
    topic: Title of the topic. It is not used for the extraction itself and is
      accepted so existing callers can keep passing it.
    start_page: Zero-based index of the first page to read.
    end_page: Zero-based index at which reading stops (exclusive).
    pdf_path: Path of the PDF to read. Defaults to the book used throughout
      this notebook.

  Returns:
    The extracted text of pages start_page .. end_page - 1 joined in order;
    an empty string when the range is empty.
  """
  with open(pdf_path, 'rb') as file:
    pdf_reader = PyPDF2.PdfReader(file)
    # join() consumes the generator before the file is closed.
    return ''.join(
      pdf_reader.pages[page_num].extract_text()
      for page_num in range(start_page, end_page)
    )

In [54]:
import uuid

# Printed page numbers from the ToC are shifted by this many pages to obtain
# PDF page indices (presumably the front matter before page 1 — TODO confirm).
PAGE_OFFSET = 8

collection = chroma_client.get_collection(name="test_collection", embedding_function=huggingface_ef)

# Each topic spans from its own start page up to (not including) the start
# page of the next topic, so the final topic has no known end page and is not
# inserted.
for (topic, start_page), (_, end_page) in zip(topics_page, topics_page[1:]):
    text = extract_text_by_topic(topic, int(start_page) + PAGE_OFFSET, int(end_page) + PAGE_OFFSET)
    # Named `doc_id` rather than `id` so the builtin id() is not overwritten
    # in the notebook namespace.
    doc_id = uuid.uuid1()
    metadata = {'topic': topic}
    collection.add(ids=[str(doc_id)], documents=[text], metadatas=[metadata])

In [65]:
# Display a single stored record to confirm the inserts above reached the collection.
collection.peek(limit=1)

{'ids': ['ea04a9e6-fa4f-11ee-89e4-3b528f51ba85'],
 'embeddings': [[0.03178082033991814,
   -0.014308194629848003,
   0.020488321781158447,
   -0.018543666228652,
   -0.011784251779317856,
   0.07936473935842514,
   0.0047219134867191315,
   0.003363583702594042,
   0.07102047652006149,
   0.04601683095097542,
   0.0948043093085289,
   0.03229439631104469,
   -0.010760650038719177,
   0.044999610632658005,
   0.06365380436182022,
   -0.0009564429055899382,
   0.07723387330770493,
   -0.0021310183219611645,
   -0.024194015190005302,
   0.026054035872220993,
   0.02135578542947769,
   0.03273766115307808,
   0.059486791491508484,
   -0.026067141443490982,
   0.046340443193912506,
   0.05970329791307449,
   0.03996829688549042,
   -0.12109069526195526,
   0.047929294407367706,
   0.01855282671749592,
   0.06116781011223793,
   -0.02389104850590229,
   0.03870520368218422,
   -0.010221825912594795,
   -0.07133954763412476,
   0.08384060859680176,
   0.05009103938937187,
   -0.00223168777301

## Querying Database

In [57]:
# Ask for the three stored chapters closest to the query text, returning the
# documents, their metadata and the distances.
query = "Yoga Techniques for Classroom"
result = collection.query(
    query_texts=[query],
    n_results=3,
    include=["documents", "metadatas", "distances"],
)
result

{'ids': [['f2bd4085-fa4f-11ee-89e4-3b528f51ba85',
   'ea04a9e6-fa4f-11ee-89e4-3b528f51ba85',
   'f2bd4084-fa4f-11ee-89e4-3b528f51ba85']],
 'distances': [[0.444814380825631, 0.5984121937430923, 0.6299941230113644]],
 'embeddings': None,
 'metadatas': [[{'topic': '16. Yoga Techniques for the Classroom'},
   {'topic': '14. Yoga Techniques for Pre-School Children'},
   {'topic': '15. Yoga Techniques for 7–14 Y ear-Olds'}]],
 'documents': [['11016\nYoga Techniques for the \nClassroom\nSwami Yogabhakti Saraswati\nThe techniques described in this chapter are for use in  \nnormal school classrooms, even those with fixed desks \nand seats. Most of the exercises can be done while standing \nand moving between the desks, others can even be done while sitting at the desk.\nLimbering up and developing body awareness\nThe following exercises are described in chapter 18. In the \nclassroom it is not even necessary for the children to remove their shoes in order to perform them.\n 1. Ankle bending (5 

In [63]:
# The query result holds one list per query text; walk each of them and print
# every hit with its id, document text and score.
# NOTE(review): `1 - dist` is only a cosine similarity if the collection uses
# cosine distance; the collection is created without distance settings, so
# Chroma's default metric applies — confirm.
for ids, docs, dists in zip(result['ids'], result['documents'], result['distances']):
    # `doc_id` rather than `id`, so the builtin id() is not overwritten.
    for doc_id, doc, dist in zip(ids, docs, dists):
        print(f"ID: {doc_id}, Doc: {doc}, Similarity: {1-dist}")
        print("\n")

ID: f2bd4085-fa4f-11ee-89e4-3b528f51ba85, Doc: 11016
Yoga Techniques for the 
Classroom
Swami Yogabhakti Saraswati
The techniques described in this chapter are for use in  
normal school classrooms, even those with fixed desks 
and seats. Most of the exercises can be done while standing 
and moving between the desks, others can even be done while sitting at the desk.
Limbering up and developing body awareness
The following exercises are described in chapter 18. In the 
classroom it is not even necessary for the children to remove their shoes in order to perform them.
 1. Ankle bending (5 times).
 2. Ankle rotation (5 times clockwise then 5 times anti-clock -
wise). Either both together or one at a time, depending upon the time available.
 3. Knee bending (5 times each leg).
 4. Squat and rise pose (5 times).
 5. Hand clenching (5 tim es).
 6. Wrist bending (5 times).
 7. Wrist rotation (5 times clockwise/anti-clockwise). Each 
wrist individually or both together.
 8. Elbow bending (5 t

## Testing RAG

## Deleting data in ChromaDB

In [None]:
# Deleting items in collection
# Chroma rejects delete() when it is given no ids / where / where_document, so
# look up the stored ids first and delete exactly those. Nothing is done when
# the collection is already empty.
stored_ids = collection.get()["ids"]
if stored_ids:
    collection.delete(ids=stored_ids)

In [51]:
# Deleting collection
# Removes the collection named "test_collection" from the Chroma server.
chroma_client.delete_collection(name='test_collection')