In [None]:
%pip install PyMuPDF fastembed qdrant-client nltk

Import Libraries

In [None]:
import fitz  # PyMuPDF for PDF parsing
from fastembed import TextEmbedding
from nltk.tokenize import sent_tokenize
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, VectorParams, Distance

# fastembed's documented text-embedding entry point is `TextEmbedding`. The
# original `FastEmbed` import is kept, but guarded, so this cell no longer
# aborts (and skips the imports after it) when the installed release does not
# export that name.
try:
    from fastembed import FastEmbed
except ImportError:
    FastEmbed = None

# For NLTK sentence tokenization
import nltk

# `sent_tokenize` loads the Punkt sentence model. Older NLTK releases read it
# from the "punkt" resource, while recent releases (3.9+) read it from
# "punkt_tab"; fetch both so tokenization works with whichever NLTK version the
# unpinned install above resolved to.
nltk.download('punkt')
nltk.download('punkt_tab')


Extract Text from PDF

In [20]:
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text("text")`` output joined
        with no separator; an empty string when the document has no pages.

    Raises:
        Whatever ``fitz.open`` raises when the file cannot be opened.
    """
    # The context manager closes the document even if extraction fails part-way,
    # so the file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string with += on every page.
        return "".join(page.get_text("text") for page in doc)

# Run the extraction on the sample guide; the file name is resolved relative
# to the notebook's working directory.
pdf_path = "choose-health-be-active-a-physical-guide-for-older-australians.pdf"
extracted_text = extract_text_from_pdf(pdf_path=pdf_path)


Preprocess the Text

In [21]:
def preprocess_text(text):
    """Split ``text`` into sentences.

    Args:
        text: The raw text to segment.

    Returns:
        The result of NLTK's ``sent_tokenize`` applied to ``text``, unchanged.
    """
    return sent_tokenize(text)

# Segment the extracted PDF text into sentences for the embedding step.
processed_text = preprocess_text(text=extracted_text)


Initialize FastEmbed and the Qdrant Client

In [None]:
# Text embedder from fastembed, using the library's default model.
# `TextEmbedding` is fastembed's documented class; it has no `dim` attribute
# and is not callable, so vectors come from `embedder.embed(...)`.
embedder = TextEmbedding()

# Measure the vector size from one probe embedding instead of hard-coding it,
# so the collection always matches whichever model the embedder loaded.
embedding_dim = len(next(iter(embedder.embed(["dimension probe"]))))

qdrant_client = QdrantClient(":memory:")  # Use ":memory:" for in-memory storage or provide a path for persistent storage

# Create a collection in Qdrant.
# `recreate_collection` is deprecated in qdrant-client; drop any existing
# collection explicitly and create it again so re-running this cell (e.g. with
# persistent storage) still starts from an empty collection.
collection_name = "pdf_embeddings"
if qdrant_client.collection_exists(collection_name):
    qdrant_client.delete_collection(collection_name)
qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE),
)


Generate Embeddings and Store Them in Qdrant

In [None]:
# Generate embeddings for each sentence.
# This expects `embedder` to expose fastembed's `TextEmbedding.embed`, which
# yields one numpy array per input text lazily. Materialise the generator and
# convert each array to a plain list of floats, the vector form PointStruct
# accepts.
embeddings = [vector.tolist() for vector in embedder.embed(processed_text)]

# Store the embeddings in Qdrant, pairing every sentence with its own vector.
# The sentence's position in `processed_text` is used as the point id.
points = [
    PointStruct(id=idx, vector=vector, payload={"text": sentence})
    for idx, (sentence, vector) in enumerate(zip(processed_text, embeddings))
]

qdrant_client.upsert(
    collection_name=collection_name,
    points=points
)

Query the Qdrant Database

In [None]:
# Example query
query_text = "diabetes"

# Embed the query the same way as the stored sentences. This expects
# `embedder` to expose fastembed's `TextEmbedding.embed`, which returns a
# generator, so take its single item and convert it to a plain list of floats.
query_embedding = next(iter(embedder.embed([query_text]))).tolist()

# Perform search.
# `QdrantClient.search` is deprecated in favour of `query_points`, whose
# response wraps the scored hits in `.points`.
search_result = qdrant_client.query_points(
    collection_name=collection_name,
    query=query_embedding,
    limit=5  # Number of results to return
).points

# Print the results
for result in search_result:
    print(f"Score: {result.score}, Text: {result.payload['text']}")