<a href="https://colab.research.google.com/github/Sweta-Das/LangChain-HuggingFace-LLM/blob/SentenceTransformers/Text_Embedding%26Search_with_Weaviate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's third-party dependencies into the kernel's environment (-q keeps pip quiet).
# NOTE(review): no versions are pinned, so a fresh run may pull releases with different APIs;
# the `weaviate.Client(...)` / `client.schema` / `client.data_object` calls used below should be
# checked against whichever weaviate-client version gets installed.
%pip install -q PyPDF2
%pip install -q langchain
%pip install -q transformers
%pip install -q huggingface-hub
%pip install -q weaviate-client
%pip install -q sentence-transformers

In [None]:
# Importing libraries
import os
import json
import torch
import PyPDF2
import weaviate
import numpy as np
from google.colab import drive
from google.colab import userdata
from weaviate.embedded import EmbeddedOptions
from transformers import AutoTokenizer, AutoModel

In [None]:
# Adding Weaviate and HuggingFace
# The Hugging Face token is read from Colab's secret store under the name "HF_TOKEN".
HF_KEY = userdata.get("HF_TOKEN")
# Create a client backed by embedded Weaviate and attach API keys as request headers.
# NOTE(review): the same Hugging Face token is also sent as "X-Mistral-Api-Key";
# presumably the Mistral module expects a Mistral-issued key — confirm.
client = weaviate.Client(
    embedded_options=EmbeddedOptions(),
    additional_headers = {
        "X-HuggingFace-Api-Key": HF_KEY,
        "X-Mistral-Api-Key": HF_KEY
    }
)
# Mount Google Drive
drive.mount('/content/drive/')
# Relative path to a GGUF model file under the Drive folder.
# NOTE(review): `model` is not referenced by any other cell visible in this notebook.
model = 'drive/MyDrive/LLM_Model/mistral-7b-instruct-v0.1.Q3_K_S.gguf'

embedded weaviate is already listening on port 8079
Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Ask the client whether the Weaviate instance is ready before creating the schema and storing data
client.is_ready()

True

In [None]:
# Reading PDF and extracting ToC
def extract_ToC(pdf_path, start_page, end_page):
  """Return the table-of-contents lines found on a range of PDF pages.

  Args:
    pdf_path: Path of the PDF file to read.
    start_page: 0-based index of the first page to read.
    end_page: 0-based index of the last page to read (inclusive).

  Returns:
    A list with one string per text line extracted from the pages, in page
    order, with the roman-numeral page labels "vii" / "viii" removed.
  """
  toc_entries = []

  with open(pdf_path, 'rb') as file:
    pdf_reader = PyPDF2.PdfReader(file)

    for page_index in range(start_page, end_page + 1):
      text = pdf_reader.pages[page_index].extract_text()

      # Remove the longer numeral first: stripping "vii" before "viii" would
      # turn every "viii" into a stray "i" that then has to be patched up.
      # Plain substring removal is blunt (it also hits "vii" inside words),
      # which is acceptable for contents pages but not for body text.
      text = text.replace("viii", "").replace("vii", "")

      toc_entries.extend(text.splitlines())

  return toc_entries

# Source PDF on the mounted Drive; the contents are read from page indices 7 and 8 (0-based, inclusive)
pdf_path = "drive/MyDrive/LLM_Model//Yoga Education for Children Vol 1.pdf"
toc = extract_ToC(pdf_path, start_page=7, end_page=8)

In [None]:
# Display the extracted table-of-contents lines (the list positions are used to pick topics below)
toc

['Contents',
 'Introduction  1',
 'Yoga and Education  ',
 ' 1. The Need for a Y oga-Based Education System  13',
 ' 2. Yoga and Children’s Problems  22',
 ' 3. Yoga with Pre-School Children  25',
 ' 4. Yoga Lessons Begin at Age Eight  31',
 ' 5. Student Unr est and Its Remedy  34',
 ' 6. Yoga and the Youth Problem  39',
 ' 7. Better Ways of Educatio n 45',
 ' 8. Yoga at School  50',
 ' 9. Yoga and Education  57',
 '10. Questions and Answers  65',
 'Yoga as Therapy  ',
 '11. Yoga for Emotional Disturbances  77',
 '12. Yoga for the Disabled  83',
 '13. Yoga Benefits Juvenile Diabetes  87',
 'Practices  ',
 '14. Yoga Techniques for Pre-School Children  93',
 '15. Yoga Techniques for 7–14 Y ear-Olds  101',
 '16. Yoga Techniques for the Classroom  110',
 '17. Introduction to Asana  133',
 '18. Pawanmuktasana Series  139',
 'Pawanmuktasana 1: Anti-Rheumatic Asanas  141',
 'Pawanmuktasana 2: Anti-Gastric Asanas  156',
 'Pawanmuktasana 3: Energizing Asanas  165',
 '19.  Eye Exercises  171',
 

In [None]:
# Select the ToC entries at list positions 18 through 28 as the topics to extract text for
topics = [toc[position] for position in range(18, 29)]

In [None]:
# Display the selected ToC entries (each still ends with its page number)
topics

['14. Yoga Techniques for Pre-School Children  93',
 '15. Yoga Techniques for 7–14 Y ear-Olds  101',
 '16. Yoga Techniques for the Classroom  110',
 '17. Introduction to Asana  133',
 '18. Pawanmuktasana Series  139',
 'Pawanmuktasana 1: Anti-Rheumatic Asanas  141',
 'Pawanmuktasana 2: Anti-Gastric Asanas  156',
 'Pawanmuktasana 3: Energizing Asanas  165',
 '19.  Eye Exercises  171',
 '20. Surya Namaskara: Salutations to the Sun  176',
 '21. Chandra Namaskara: Salutations to the Moon  182']

In [None]:
# Split every ToC entry into (title, page number): the last whitespace-separated
# token is the page number, everything before it is re-joined with single spaces.
topics_page = []
for entry in topics:
  words = entry.split()
  title = ' '.join(words[:-1])
  topics_page.append((title, words[-1]))

In [None]:
# Display the (topic, page number) pairs; page numbers are still strings at this point
topics_page

[('14. Yoga Techniques for Pre-School Children', '93'),
 ('15. Yoga Techniques for 7–14 Y ear-Olds', '101'),
 ('16. Yoga Techniques for the Classroom', '110'),
 ('17. Introduction to Asana', '133'),
 ('18. Pawanmuktasana Series', '139'),
 ('Pawanmuktasana 1: Anti-Rheumatic Asanas', '141'),
 ('Pawanmuktasana 2: Anti-Gastric Asanas', '156'),
 ('Pawanmuktasana 3: Energizing Asanas', '165'),
 ('19. Eye Exercises', '171'),
 ('20. Surya Namaskara: Salutations to the Sun', '176'),
 ('21. Chandra Namaskara: Salutations to the Moon', '182')]

In [None]:
# Function to extract text from specified topics
def extract_text_by_topic(topic, start_page, end_page):
  """Concatenate the extracted text of PDF pages start_page .. end_page - 1.

  The PDF is the one named by the notebook-level `pdf_path`. `topic` is
  accepted for the caller's convenience but is not used by the extraction.

  Args:
    topic: Topic title (unused here).
    start_page: 0-based index of the first page to read.
    end_page: 0-based index at which to stop (exclusive).

  Returns:
    The page texts joined into a single string, in page order.
  """
  with open(pdf_path, 'rb') as pdf_file:
    reader = PyPDF2.PdfReader(pdf_file)
    page_texts = [
        reader.pages[index].extract_text()
        for index in range(start_page, end_page)
    ]
  return ''.join(page_texts)

In [None]:
# Text Vectorization
# Load the tokenizer and the encoder for the named sentence-embedding model via
# `from_pretrained`; both globals are used by `generate_embedding`.
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
embedding_model = AutoModel.from_pretrained(embedding_model_name)

In [None]:
# Defining Weaviate class schema
# One "TopicContent" object is stored per topic, holding its title and its extracted text.
class_schema = {
    "class": "TopicContent",
    "description": "A Document class to store topic content",
    "properties": [
        {
          # The schema read back later in this notebook reports this property as
          # 'text' with 'whitespace' tokenization rather than "string".
          "dataType": ["string"],
          "name": "topic",
          "description": "Topic name"
        },
        {
           "dataType": ["text"],
           "name": "content",
           "description": "Content related to topic"
        },
      ],
    # Vectorizer module named for this class.
    "vectorizer": "text2vec-huggingface",
    # Generative module settings.
    # NOTE(review): the saved output of the generate query in this notebook reports a
    # missing OpenAI API key, so presumably this module is not the one actually used — confirm.
    "moduleConfig":{
        "generative-mistral": {"model": "mistral-medium-latest"}
    }
}

# Creating class based on given schema
# NOTE(review): presumably fails if "TopicContent" already exists; the last cell of the
# notebook deletes the class — confirm re-run behaviour.
client.schema.create_class(class_schema)

Embedded weaviate wasn't listening on ports http:8079 & grpc:50060, so starting embedded weaviate again
Started /root/.cache/weaviate-embedded: process ID 26114


In [None]:
def generate_embedding(text):
  """Encode text into a sentence embedding with the notebook's tokenizer and model.

  The embedding is the attention-masked mean of the token embeddings followed
  by L2 normalisation, which is the pooling documented for
  sentence-transformers/all-mpnet-base-v2 (taking only the first token's
  hidden state is not how that model produces sentence vectors).

  Args:
    text: The string to embed.

  Returns:
    A nested list with one embedding per input row, i.e. shape
    (1, hidden_size) for a single string.
  """
  # truncation=True keeps over-long fragments within the model's maximum
  # sequence length instead of failing inside the encoder; padding=True is a
  # no-op for a single string and keeps batched input rectangular.
  encoded_input = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

  with torch.no_grad():
    output = embedding_model(**encoded_input)

  token_embeddings = output.last_hidden_state
  # Expand the mask to (batch, seq, 1) so padded positions contribute nothing.
  mask = encoded_input["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
  summed = (token_embeddings * mask).sum(dim=1)
  token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
  sentence_embeddings = torch.nn.functional.normalize(summed / token_counts, p=2, dim=1)

  return sentence_embeddings.tolist()

In [None]:
def get_sentence_embedding(text):
  """Embed a passage as the average of its per-sentence embeddings.

  The text is split on ". "; fragments that are empty or whitespace-only
  (e.g. the piece after a trailing ". ") are skipped so they do not pull the
  average towards the embedding of an empty string.

  Args:
    text: The passage to embed.

  Returns:
    A numpy array holding the element-wise mean of the values returned by
    `generate_embedding` for each kept sentence.
  """
  sentences = [fragment for fragment in text.split(". ") if fragment.strip()]
  if not sentences:
    # Nothing usable after splitting: embed the raw text so callers still get
    # a vector rather than the NaN that averaging an empty list would produce.
    sentences = [text]

  embeddings = [generate_embedding(sentence) for sentence in sentences]

  # Returning average of sentence embeddings
  return np.mean(embeddings, axis=0)

In [None]:
def createStore_doc(topic, text_content):
  """Embed a topic's text and store it as a "TopicContent" object in Weaviate.

  The locally computed embedding is sent along with the object; previously
  it was computed and then thrown away.

  NOTE(review): the class also names a server-side vectorizer; if
  vector search by text is added later, confirm that query vectors come from
  the same embedding model as the vectors stored here.

  Args:
    topic: Topic title, stored in the `topic` property.
    text_content: Extracted text, stored in the `content` property.

  Returns:
    The value returned by `client.data_object.create` (the new object's id).
  """
  # Generating vector embedding for the content
  embedding = get_sentence_embedding(text_content)
  # The averaged embedding carries a leading batch axis; Weaviate takes a flat vector.
  vector = np.asarray(embedding, dtype=float).ravel().tolist()

  # Creating class obj
  obj = {
      'topic': topic,
      'content': text_content
  }
  return client.data_object.create(
      class_name="TopicContent",
      data_object=obj,
      vector=vector,
  )

In [None]:
# Walk consecutive (topic, next topic) pairs: each topic's text runs from its own
# page up to, but not including, the next topic's page, so the final entry only
# serves as an end marker and is not stored itself.
# NOTE(review): the +8 presumably converts printed page numbers into PDF page
# indices (front-matter offset) — confirm against the document.
for (topic, start_page), (_, end_page) in zip(topics_page, topics_page[1:]):
  text = extract_text_by_topic(topic, int(start_page) + 8, int(end_page) + 8)
  createStore_doc(topic, text)

print("Text data stored successfully in Weaviate!")

Text data stored successfully in Weaviate!


In [None]:
# Fetch the `topic` property of the stored TopicContent objects to check what was written
result = client.query.get(
    class_name = 'TopicContent',
    properties = 'topic'
).do()
result

{'data': {'Get': {'TopicContent': [{'topic': '17. Introduction to Asana'},
    {'topic': 'Pawanmuktasana 2: Anti-Gastric Asanas'},
    {'topic': '18. Pawanmuktasana Series'},
    {'topic': 'Pawanmuktasana 1: Anti-Rheumatic Asanas'},
    {'topic': '15. Yoga Techniques for 7–14 Y ear-Olds'},
    {'topic': '16. Yoga Techniques for the Classroom'},
    {'topic': '14. Yoga Techniques for Pre-School Children'},
    {'topic': 'Pawanmuktasana 3: Energizing Asanas'},
    {'topic': '19. Eye Exercises'},
    {'topic': '20. Surya Namaskara: Salutations to the Sun'}]}}}

In [None]:
# Getting all data from class
# Requests each object's topic plus its `_additional` id and vector.
# NOTE(review): "id vector" is passed as a single list element; the saved output shows
# both fields coming back, but the full vectors make this cell's output very long.
query = (
    client.query.get('TopicContent', ['topic'])
    .with_additional(["id vector"])
).do()
query

{'data': {'Get': {'TopicContent': [{'_additional': {'id': '03d89476-ede1-4c57-8eac-3115f192a904',
      'vector': [-0.2750278,
       0.026527535,
       0.007987334,
       -0.27482182,
       -0.12607503,
       0.4649102,
       0.2874446,
       0.38982862,
       -0.24813212,
       -0.708228,
       0.054335758,
       0.39623767,
       0.13863927,
       0.10461487,
       0.0016810372,
       0.38891032,
       0.37098622,
       -0.019133374,
       0.06832895,
       -0.42119747,
       -0.16151181,
       0.092318535,
       0.1472773,
       0.11944567,
       -0.038154326,
       -0.049132094,
       -0.04337929,
       0.21700737,
       -0.11090485,
       -0.05912646,
       0.27229625,
       -0.039764587,
       -0.10721911,
       0.026357919,
       0.45576724,
       -0.48115656,
       -0.25598478,
       0.114236474,
       0.28507984,
       -0.13034727,
       0.14093241,
       0.17413259,
       -0.1612202,
       0.09195686,
       0.023440123,
       -0.47

In [None]:
# Read the stored class definition back from Weaviate and display it.
# Note: this rebinds `class_schema`, which earlier held the dict passed to create_class.
class_schema = client.schema.get('TopicContent')
class_schema

{'class': 'TopicContent',
 'description': 'A Document class to store topic content',
 'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
  'cleanupIntervalSeconds': 60,
  'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
 'moduleConfig': {'generative-mistral': {'model': 'mistral-medium-latest'},
  'text2vec-huggingface': {'vectorizeClassName': True}},
 'multiTenancyConfig': {'enabled': False},
 'properties': [{'dataType': ['text'],
   'description': 'Topic name',
   'indexFilterable': True,
   'indexSearchable': True,
   'moduleConfig': {'text2vec-huggingface': {'skip': False,
     'vectorizePropertyName': False}},
   'name': 'topic',
   'tokenization': 'whitespace'},
  {'dataType': ['text'],
   'description': 'Content related to topic',
   'indexFilterable': True,
   'indexSearchable': True,
   'moduleConfig': {'text2vec-huggingface': {'skip': False,
     'vectorizePropertyName': False}},
   'name': 'content',
   'tokenization': 'word'}],
 'replicationConfig': 

In [None]:
# Summarise one specific topic with Weaviate's generative search.
# In a single prompt, `{...}` placeholders are filled from the properties of the
# retrieved object, so they must be property names (`topic`, `content`); the
# topic title itself is not a valid placeholder. The `where` filter selects the
# intended object instead of relying on whichever one `limit 1` returns first.
# NOTE(review): the saved output of this cell shows an OpenAI API key error, so
# the generative module enabled on the server should be confirmed separately.
target_topic = "18. Pawanmuktasana Series"
generate_prompt = "Summarize content: {content} of topic: {topic} in 5 lines."
res = (
    client.query.get("TopicContent", ["topic", "content"])
    .with_where({
        "path": ["topic"],
        "operator": "Equal",
        "valueText": target_topic,
    })
    .with_generate(single_prompt=generate_prompt)
    .with_limit(1)
    .do()
)

In [None]:
# Display the raw response; any generated text or error appears under `_additional.generate`
res

{'data': {'Get': {'TopicContent': [{'_additional': {'generate': {'error': 'OpenAI API Key: no api key found neither in request header: X-Openai-Api-Key nor in environment variable under OPENAI_APIKEY',
       'singleResult': None}},
     'content': '13918\nPawanmuktasana Series\nPawanmuktasana is a group of exercises which release  \nwind and gases from the body. Pawan means ‘wind’; \nmukta means ‘release’, asana  means ‘posture’. The pawan -\nmuktasana series is very simple, yet it is most effective in \nregulating what are referred to in India as the humours: phlegm or kapha , wind or vata and acid/bile, pitta.\n According to the ancient medical science known as \nayurveda, these three humours control all the functions of the body. If any irregularity arises in their functions, negative reactions take place in the metabolism of the body and disease results.\nPhysical yoga \nPawanmuktasana are simple exercises for the development \nof body awareness. During the practice of each exerci

In [None]:
# Deleting class and its objects
# Cleanup step: removes "TopicContent" from the schema at the end of the notebook.
client.schema.delete_class("TopicContent")