In [None]:
! pip install sentence_transformers
! pip install PIL
! pip install numpy
! pip install transformers


In [13]:
! pip install pypdf

Collecting pypdf
  Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Downloading pypdf-5.1.0-py3-none-any.whl (297 kB)
Installing collected packages: pypdf
Successfully installed pypdf-5.1.0


In [42]:
! pip install gpt4all

Collecting gpt4all
  Downloading gpt4all-2.8.2-py3-none-win_amd64.whl.metadata (4.8 kB)
Downloading gpt4all-2.8.2-py3-none-win_amd64.whl (119.6 MB)
   ---------------------------------------- 0.0/119.6 MB ? eta -:--:--
   ---------------------------------------- 1.0/119.6 MB 6.3 MB/s eta 0:00:19
    --------------------------------------- 2.9/119.6 MB 8.0 MB/s eta 0:00:15
   -- ------------------------------------- 6.0/119.6 MB 10.9 MB/s eta 0:00:11
   -- ------------------------------------- 8.7/119.6 MB 11.2 MB/s eta 0:00:10
   ---- ----------------------------------- 14.4/119.6 MB 14.9 MB/s eta 0:00:08
   ------- -------------------------------- 21.5/119.6 MB 18.4 MB/s eta 0:00:06
   --------- ------------------------------ 29.1/119.6 MB 21.2 MB/s eta 0:00:05
   ----------- ---------------------------- 33.6/119.6 MB 22.9 MB/s eta 0:00:04
   ----------- ---------------------------- 33.6/119.6 MB 22.9 MB/s eta 0:00:04
   ----------- ---------------------------- 33.8/119.6 MB 17.0 MB/s

In [1]:
import os
import fitz  # PyMuPDF for handling PDF extraction
from sentence_transformers import SentenceTransformer
from PIL import Image
import io
import json
import numpy as np
import time
from transformers import ViTFeatureExtractor, ViTModel
import torch

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Initialize the Vision Transformer (ViT) model and feature extractor for image embeddings
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
vit_model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')

  from tqdm.autonotebook import tqdm, trange


In [2]:
from langchain_community.document_loaders import PyPDFDirectoryLoader

def read_doc(directory):
    file_loader= PyPDFDirectoryLoader(directory)
    documents = file_loader.load()
    return documents

doc = read_doc('pdfs2/')
len(doc)

723

In [8]:
! pip install tiktoken

Collecting tiktoken
  Using cached tiktoken-0.8.0-cp311-cp311-win_amd64.whl.metadata (6.8 kB)
Using cached tiktoken-0.8.0-cp311-cp311-win_amd64.whl (884 kB)
Installing collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [10]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import CharacterTextSplitter
import tiktoken

def count_tokens(text, model="gpt-3.5-turbo"):
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))

def read_and_tokenize(directory):
    file_loader = PyPDFDirectoryLoader(directory)
    documents = file_loader.load()

    # Total tokens count
    total_tokens = 0
    for doc in documents:
        total_tokens += count_tokens(doc.page_content)  # doc.page_content contains the text of the document

    return len(documents), total_tokens

doc_count, token_count = read_and_tokenize('pdfs/')
print(f"Number of documents: {doc_count}")
print(f"Total number of tokens: {token_count}")


Number of documents: 3359
Total number of tokens: 5723557


In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
def chunk_data(docs,chunk_size=400, chunk_overlap=10):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    doc = text_splitter.split_documents(docs)
    return docs

documents = chunk_data(docs=doc)
documents

[Document(metadata={'source': 'pdfs2\\Pocket Companion to Guyton and Hall 13ed.pdf', 'page': 0}, page_content=''),
 Document(metadata={'source': 'pdfs2\\Pocket Companion to Guyton and Hall 13ed.pdf', 'page': 1}, page_content='Use of the current edition of the electronic version of this book (eBook) is subject to the terms of the nontransferable, \nlimited license granted on studentconsult.inkling.com. Access to the eBook is limited to the first individual who \nredeems the PIN, located on the inside cover of this book, at studentconsult.inkling.com and may not be transferred \nto another party by resale, lending, or other means.\nAny screen.  \nAny time.  \nAnywhere.\nActivate the eBook version  \nof this title at no additional charge. \nUnlock your eBook today.\n  Visit studentconsult.inkling.com/redeem\n  Scratch off your code\n    Type code into “Enter Code” box\n  Click “Redeem”\n  Log in or Sign up\n6    Go to “My Library”\nIt’s that easy!\nStudent Consult eBooks give you the powe

In [4]:
from pinecone import Pinecone
pinecone_api_key = "pcsk_2pW6M1_TazDcA3g2rXQ9Hnr6x4BnrZpp9jzwSTQAKDCyhUWo6cWvsof1sBWvvrRnJFxubJ"
pinecone_environment = "us-east-1"  # Example: "us-east1-gcp"
index_name = "finaltest"

# Initialize Pinecone
pc = Pinecone(api_key=pinecone_api_key, environment=pinecone_environment)
index = pc.Index(index_name)

In [5]:
# Embedding technique of 
from langchain_pinecone import PineconeVectorStore
from langchain.vectorstores import Pinecone
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")

# Process documents and push to Pinecone
vectors = []
for i, doc in enumerate(documents):
    try:
        # Generate embeddings for the document chunk
        embedding = embeddings.embed_query(doc.page_content)
        
        # Create metadata for the chunk
        metadata = {
            "chunk_index": i,
            "text": doc.page_content,  # Store the chunk content for retrieval
            "source": doc.metadata.get("source", "unknown"),
        }
        
        # Append to vectors for bulk upsert
        vectors.append((f"doc_{i}", embedding, metadata))
    except Exception as e:
        print(f"Error processing document {i}: {e}")

# Push all vectors to Pinecone
index.upsert(vectors)
print(f"Successfully indexed {len(vectors)} document chunks into Pinecone.")


KeyboardInterrupt: 

In [7]:
import time
from math import ceil

# Define batch size (you can adjust this based on your data)
BATCH_SIZE = 100

# Initialize embeddings
embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")

# Initialize variables for time estimation
start_time = time.time()
vectors = []

# Process documents to generate vectors
for i, doc in enumerate(documents):
    try:
        # Generate embeddings for the document chunk
        embedding = embeddings.embed_query(doc.page_content)
        
        # Create metadata for the chunk
        metadata = {
            "chunk_index": i,
            "text": doc.page_content,  # Store the chunk content for retrieval
            "source": doc.metadata.get("source", "unknown"),
            "author": doc.metadata.get("author", "unknown"),  # Add author if available
            "page_number": doc.metadata.get("page_number", "unknown"),  # Add page number
            "book_title": doc.metadata.get("book_title", "unknown"),  # Add book title
        }
        
        # Append to vectors
        vectors.append((f"doc_{i}", embedding, metadata))
    except Exception as e:
        print(f"Error processing document {i}: {e}")

# Split the vectors into batches and upsert them
total_batches = ceil(len(vectors) / BATCH_SIZE)
for batch_index in range(total_batches):
    try:
        batch_start = batch_index * BATCH_SIZE
        batch_end = min(batch_start + BATCH_SIZE, len(vectors))
        batch_vectors = vectors[batch_start:batch_end]
        
        # Upsert the current batch
        index.upsert(batch_vectors)
        
        print(f"Batch {batch_index + 1}/{total_batches} upserted successfully.")
    except Exception as e:
        print(f"Error upserting batch {batch_index + 1}: {e}")

# Calculate total processing time
total_time = time.time() - start_time
print(f"Successfully indexed {len(vectors)} document chunks into Pinecone in {total_time / 60:.2f} minutes.")


Batch 1/8 upserted successfully.
Batch 2/8 upserted successfully.
Batch 3/8 upserted successfully.
Batch 4/8 upserted successfully.
Batch 5/8 upserted successfully.
Batch 6/8 upserted successfully.
Batch 7/8 upserted successfully.
Batch 8/8 upserted successfully.
Successfully indexed 723 document chunks into Pinecone in 4.87 minutes.


trying pymupdf4llm for parsing the pdf

In [1]:
! pip install -U pymupdf4llm



In [1]:
%pwd

'd:\\medical-rag'

In [None]:
import pymu

ModuleNotFoundError: No module named 'pymupdf4llm'

In [2]:
md_text = pymupdf4llm.to_markdown("input.pdf")
print(md_text)
import pathlib
pathlib.Path("output.md").write_bytes(md_text.encode())
md_text_pages = pymupdf4llm.to_markdown("input.pdf", pages=[1, 2])
md_text_pages


NameError: name 'pymupdf4llm' is not defined

In [None]:
! pip install llama_index
lama_reader = pymupdf4llm.LlamaMarkdownReader()
llama_docs = llama_reader.load_data("input.pdf")
print(f"LlamaIndex Compatiable Data: {len(llama_docs)}")
{llama_docs[0].text[:500]}


In [None]:
#Extracting images
md_text_images = pymupdf4llm.to_markdown(
    doc="input.pdf",
    pages=[0, 2],
    page_chunks=True,
    write_images=True,
    image_path="images",
    image_format="png",
    dpi=300
)
md_text_chunks = pymupdf4llm.to_markdown(
    doc="input.pdf",
    pages=[0, 1, 2],
    page_chunks=True
)
print(md_text_chunks[0])
#Detailed word-by-word extraction

md_text_words = pymupdf4llm.to_markdown(
    doc="input.pdf",
    pages=[0, 1, 2],
    page_chunks=True,
    write_images=True,
    image_path="images",
    image_format="png",
    dpi=300,
    extract_words=True
)
#Extracting tables neatly
md_text_tables = pymupdf4llm.to_markdown(
    doc="tables.pdf"
)
md_text_tables


In [None]:
from unstructured.partition.pdf import PDFPartition
import os

def extract_chunks_with_metadata(pdf_path, output_folder, metadata):
    """
    Extracts chunks based on big headings and saves them with metadata.

    Args:
        pdf_path (str): Path to the PDF file.
        output_folder (str): Folder to save extracted chunks.
        metadata (dict): Metadata dictionary containing title, author, etc.

    Returns:
        list[dict]: List of chunks with associated metadata.
    """
    # Extract text content from the PDF
    document = PDFPartition.from_file(pdf_path)

    # Initialize variables for chunking
    chunks = []
    current_heading = None
    current_content = []

    # Iterate through document elements
    for element in document:
        if element.type == "Title" or element.type == "Heading":  # Detect big headings
            # Save the current chunk if there is one
            if current_heading and current_content:
                chunks.append({
                    "heading": current_heading,
                    "content": " ".join(current_content),
                    "metadata": metadata
                })
                current_content = []

            # Update the heading
            current_heading = element.text

        else:
            # Append content under the current heading
            current_content.append(element.text)

    # Add the last chunk
    if current_heading and current_content:
        chunks.append({
            "heading": current_heading,
            "content": " ".join(current_content),
            "metadata": metadata
        })

    # Save chunks to output folder
    os.makedirs(output_folder, exist_ok=True)
    for i, chunk in enumerate(chunks):
        chunk_file = os.path.join(output_folder, f"chunk_{i + 1}.txt")
        with open(chunk_file, "w") as f:
            f.write(f"Heading: {chunk['heading']}\n")
            f.write(f"Metadata: {chunk['metadata']}\n")
            f.write(f"Content:\n{chunk['content']}\n")

    return chunks

# Example Usage
pdf_path = "example_medical_book.pdf"
output_folder = "chunks_output"
metadata = {
    "title": "Medical Book Example",
    "author": "Author Name",
    "page_number": None  # Page number can be dynamic if needed
}

chunks = extract_chunks_with_metadata(pdf_path, output_folder, metadata)

# Print the first chunk as an example
if chunks:
    print(f"Heading: {chunks[0]['heading']}")
    print(f"Metadata: {chunks[0]['metadata']}")
    print(f"Content: {chunks[0]['content'][:500]}...")  # Print the first 500 characters


[0.053597815334796906,
 -0.030782219022512436,
 -0.03252851590514183,
 -0.02810630202293396,
 0.022345904260873795]

In [None]:
from pinecone import Pinecone

# Initialize Pinecone
pc = Pinecone(api_key="pcsk_2pW6M1_TazDcA3g2rXQ9Hnr6x4BnrZpp9jzwSTQAKDCyhUWo6cWvsof1sBWvvrRnJFxubJ", environment="us-east-1")
index = pc.Index("3test")