<a href="https://colab.research.google.com/github/Sweta-Das/LangChain-HuggingFace-LLM/blob/SentenceTransformers/LangChain_Weaviate_Integration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install -q PyPDF2
%pip install -q langchain
%pip install -q transformers
%pip install -q huggingface-hub
%pip install -U weaviate-client
%pip install -q sentence-transformers

In [None]:
# Importing libraries
import os
import json
import torch
import PyPDF2
import weaviate
import numpy as np
from google.colab import drive
from google.colab import userdata
from weaviate.embedded import EmbeddedOptions
from weaviate.util import generate_uuid5
from transformers import AutoTokenizer, AutoModel

In [None]:
# Connect to an embedded Weaviate instance, forwarding the HuggingFace token
# (read from the Colab secret store) as a request header.
HF_KEY = userdata.get("HF_TOKEN")
hf_headers = {"X-HuggingFace-Api-Key": HF_KEY}
client = weaviate.Client(
    embedded_options=EmbeddedOptions(),
    additional_headers=hf_headers,
)

# Mount Google Drive so files under drive/MyDrive become readable.
drive.mount('/content/drive/')

# NOTE(review): relative path — presumably resolved against /content, the
# directory the Drive is mounted under; confirm the working directory.
model = 'drive/MyDrive/LLM_Model/mistral-7b-instruct-v0.1.Q3_K_S.gguf'

embedded weaviate is already listening on port 8079
Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Sanity check: ask the Weaviate client whether the instance is ready.
client.is_ready()

True

In [None]:
# Reading PDF and extracting ToC
def extract_ToC(pdf_path, start_page, end_page):
  """Return the text lines found on a range of PDF pages.

  Args:
    pdf_path: Path of the PDF file to read.
    start_page: Index into the reader's page list of the first page to read.
    end_page: Index of the last page to read (inclusive).

  Returns:
    A list of strings, one per extracted text line, in page order.
  """
  toc_entries = []

  with open(pdf_path, 'rb') as file:
    pdf_reader = PyPDF2.PdfReader(file)

    for page_index in range(start_page, end_page + 1):
      text = pdf_reader.pages[page_index].extract_text()

      # Strip the roman-numeral page markers. "viii" has to be removed before
      # "vii": removing "vii" first turns "viii" into a stray "i".
      text = text.replace("viii", "").replace("vii", "")
      # Workaround kept from the earlier replacement order, where that stray
      # "i" ended up glued to the following entry number.
      text = text.replace("i17", "17")

      toc_entries.extend(text.splitlines())

  return toc_entries

# Source PDF on the mounted Drive; also read as a global by later cells.
pdf_path = "drive/MyDrive/LLM_Model//Yoga Education for Children Vol 1.pdf"

# Pages 7 and 8 (indices into the PDF's page list, both included) are read
# as the table of contents.
toc = extract_ToC(pdf_path, start_page=7, end_page=8)

In [None]:
# Table of Content: display the raw lines extracted from the ToC pages
toc

['Contents',
 'Introduction  1',
 'Yoga and Education  ',
 ' 1. The Need for a Y oga-Based Education System  13',
 ' 2. Yoga and Children’s Problems  22',
 ' 3. Yoga with Pre-School Children  25',
 ' 4. Yoga Lessons Begin at Age Eight  31',
 ' 5. Student Unr est and Its Remedy  34',
 ' 6. Yoga and the Youth Problem  39',
 ' 7. Better Ways of Educatio n 45',
 ' 8. Yoga at School  50',
 ' 9. Yoga and Education  57',
 '10. Questions and Answers  65',
 'Yoga as Therapy  ',
 '11. Yoga for Emotional Disturbances  77',
 '12. Yoga for the Disabled  83',
 '13. Yoga Benefits Juvenile Diabetes  87',
 'Practices  ',
 '14. Yoga Techniques for Pre-School Children  93',
 '15. Yoga Techniques for 7–14 Y ear-Olds  101',
 '16. Yoga Techniques for the Classroom  110',
 '17. Introduction to Asana  133',
 '18. Pawanmuktasana Series  139',
 'Pawanmuktasana 1: Anti-Rheumatic Asanas  141',
 'Pawanmuktasana 2: Anti-Gastric Asanas  156',
 'Pawanmuktasana 3: Energizing Asanas  165',
 '19.  Eye Exercises  171',
 

In [None]:
# Topics to extract text from: ToC entries at positions 18 through 28
topics = [toc[entry_index] for entry_index in range(18, 29)]

In [None]:
# Display the ToC entries selected for extraction
topics

['14. Yoga Techniques for Pre-School Children  93',
 '15. Yoga Techniques for 7–14 Y ear-Olds  101',
 '16. Yoga Techniques for the Classroom  110',
 '17. Introduction to Asana  133',
 '18. Pawanmuktasana Series  139',
 'Pawanmuktasana 1: Anti-Rheumatic Asanas  141',
 'Pawanmuktasana 2: Anti-Gastric Asanas  156',
 'Pawanmuktasana 3: Energizing Asanas  165',
 '19.  Eye Exercises  171',
 '20. Surya Namaskara: Salutations to the Sun  176',
 '21. Chandra Namaskara: Salutations to the Moon  182']

In [None]:
# Separating topics and their pages: the last whitespace-separated token of
# each entry is the page number, everything before it (re-joined with single
# spaces) is the topic title.
topics_page = [
    (' '.join(words[:-1]), words[-1])
    for words in (entry.split() for entry in topics)
]

In [None]:
# Display the (topic, page number) pairs; page numbers are still strings here
topics_page

[('14. Yoga Techniques for Pre-School Children', '93'),
 ('15. Yoga Techniques for 7–14 Y ear-Olds', '101'),
 ('16. Yoga Techniques for the Classroom', '110'),
 ('17. Introduction to Asana', '133'),
 ('18. Pawanmuktasana Series', '139'),
 ('Pawanmuktasana 1: Anti-Rheumatic Asanas', '141'),
 ('Pawanmuktasana 2: Anti-Gastric Asanas', '156'),
 ('Pawanmuktasana 3: Energizing Asanas', '165'),
 ('19. Eye Exercises', '171'),
 ('20. Surya Namaskara: Salutations to the Sun', '176'),
 ('21. Chandra Namaskara: Salutations to the Moon', '182')]

In [None]:
# Function to extract text from specified topics
def extract_text_by_topic(topic, start_page, end_page):
  """Concatenate the extracted text of a range of pages of the global PDF.

  Args:
    topic: Topic title; accepted for the caller's convenience but not used.
    start_page: Index into the reader's page list of the first page to read.
    end_page: Index one past the last page to read (exclusive).

  Returns:
    The page texts joined into a single string ('' for an empty range).
  """
  with open(pdf_path, 'rb') as pdf_file:
    reader = PyPDF2.PdfReader(pdf_file)
    page_texts = [
        reader.pages[page_index].extract_text()
        for page_index in range(start_page, end_page)
    ]
  return ''.join(page_texts)

In [None]:
# Text Vectorization
# Load the tokenizer and the encoder from the same checkpoint so that token
# ids match the model's vocabulary; both are used by generate_embedding.
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
embedding_model = AutoModel.from_pretrained(embedding_model_name)

In [None]:
# Defining Weaviate class schema: one object per topic, holding the topic
# title and the text extracted for it.
class_schema = {
    "class": "TopicContent",
    "description": "A Document class to store topic content",
    "vectorizer": "text2vec-huggingface",
    "properties": [
        {
          "dataType": ["string"],
          "name": "topic",
          "description": "Topic name"
        },
        {
           "dataType": ["text"],
           "name": "content",
           "description": "Content related to topic"
        },
      ]
}

# Creating class based on given schema. Creating a class that already exists
# is an error, so skip the call when the cell is re-run against an instance
# that still holds the class.
if not client.schema.exists(class_schema["class"]):
  client.schema.create_class(class_schema)

In [None]:
def generate_embedding(text):
  """Embed text with the global tokenizer / embedding_model pair.

  Args:
    text: A string (or list of strings) to embed.

  Returns:
    A nested list with one embedding vector per input sequence, i.e. a
    single string yields a list containing one vector.
  """
  # Truncate to the tokenizer's maximum length so long inputs cannot exceed
  # what the model accepts; padding only matters for batched input.
  encoded_input = tokenizer(
      text, padding=True, truncation=True, return_tensors="pt"
  )

  with torch.no_grad():
    output = embedding_model(**encoded_input)

  # Sentence embedding = mean of the token embeddings, weighted by the
  # attention mask so padding tokens do not contribute. This is the pooling
  # the sentence-transformers model card prescribes, rather than taking only
  # the first token's vector.
  token_embeddings = output.last_hidden_state
  mask = encoded_input["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
  summed = (token_embeddings * mask).sum(dim=1)
  counts = mask.sum(dim=1).clamp(min=1e-9)
  return (summed / counts).tolist()

In [None]:
def get_sentence_embedding(text):
  """Embed a text as the average of its per-sentence embeddings.

  Args:
    text: The text to embed; sentences are delimited by ". ".

  Returns:
    A numpy array: the element-wise mean over the sentences of the values
    returned by generate_embedding.
  """
  # Splitting by sentences; blank fragments (e.g. after a trailing ". ")
  # carry no content and would only skew the average.
  sentences = [part for part in text.split(". ") if part.strip()]
  if not sentences:
    # Nothing but whitespace: embed the text as-is, as before.
    sentences = [text]

  embeddings = [generate_embedding(sentence) for sentence in sentences]

  # Returning average of sentence embeddings
  return np.mean(embeddings, axis=0)

In [None]:
def createStore_doc(topic, text_content):
  """Embed a topic's text and store it as a TopicContent object in Weaviate.

  Args:
    topic: Topic title, stored in the 'topic' property.
    text_content: Text of the topic, stored in the 'content' property.

  Returns:
    Whatever client.data_object.create returns (the id of the stored object).
  """
  # Generating vector embedding for the content
  embedding = get_sentence_embedding(text_content)

  # Creating class obj
  obj = {
      'topic': topic,
      'content': text_content
  }
  return client.data_object.create(
      class_name="TopicContent",
      data_object=obj,
      # Deterministic id derived from the object itself. This previously
      # referenced an undefined name (data_object) and raised NameError.
      uuid=generate_uuid5(obj),
      # Store the locally computed embedding with the object; flattened to a
      # plain 1-D list of floats.
      vector=np.asarray(embedding).ravel().tolist(),
  )

In [None]:
# A topic's text runs from its own start page up to the start page of the
# next topic, so the last entry (which has no successor) is not stored.
# NOTE(review): 8 is presumably the shift between the printed page numbers
# and the PDF's page indices — confirm against the document.
page_offset = 8
for i, (topic, start_page) in enumerate(topics_page[:-1]):
  _, end_page = topics_page[i + 1]
  text = extract_text_by_topic(
      topic, int(start_page) + page_offset, int(end_page) + page_offset
  )
  createStore_doc(topic, text)

print("Text data stored successfully in Weaviate!")

In [None]:
# Fetch the 'topic' property of the stored TopicContent objects.
topic_query = client.query.get(
    class_name='TopicContent',
    properties='topic',
)
result = topic_query.do()
result

{'data': {'Get': {'TopicContent': [{'topic': '15. Yoga Techniques for 7–14 Y ear-Olds'},
    {'topic': '20. Surya Namaskara: Salutations to the Sun'},
    {'topic': '19. Eye Exercises'},
    {'topic': 'Pawanmuktasana 1: Anti-Rheumatic Asanas'},
    {'topic': '18. Pawanmuktasana Series'},
    {'topic': 'Pawanmuktasana 2: Anti-Gastric Asanas'},
    {'topic': 'Pawanmuktasana 3: Energizing Asanas'},
    {'topic': '14. Yoga Techniques for Pre-School Children'},
    {'topic': '17. Introduction to Asana'},
    {'topic': '16. Yoga Techniques for the Classroom'}]}}}

In [None]:
# Getting all data from class: topic plus the additional id and vector fields
objects_query = client.query.get('TopicContent', ['topic'])
objects_query = objects_query.with_additional(["id vector"])
query = objects_query.do()
query

{'data': {'Get': {'TopicContent': [{'_additional': {'generate': {'error': 'OpenAI API Key: no api key found neither in request header: X-Openai-Api-Key nor in environment variable under OPENAI_APIKEY',
       'singleResult': None}}},
    {'_additional': {'generate': {'error': 'OpenAI API Key: no api key found neither in request header: X-Openai-Api-Key nor in environment variable under OPENAI_APIKEY',
       'singleResult': None}}},
    {'_additional': {'generate': {'error': 'OpenAI API Key: no api key found neither in request header: X-Openai-Api-Key nor in environment variable under OPENAI_APIKEY',
       'singleResult': None}}}]}}}

In [None]:
# Read the TopicContent schema back from Weaviate and display it.
# Note: this rebinds class_schema, which earlier held the dict passed to
# create_class.
class_schema = client.schema.get('TopicContent')
class_schema

In [None]:
# Deleting class from Weaviate
# Deleting the class removes it together with the objects stored in it, so
# no separate object deletion is needed. The previous follow-up call,
# client.data_object.delete(class_name=...), omitted the required uuid
# argument and targeted a class that no longer existed.
client.schema.delete_class(class_name="TopicContent")