In [1]:
import pdfplumber
import torch
from transformers import AutoTokenizer, AutoModel
import pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """
    Extracts text from each page of the provided PDF file.
    
    Args:
        pdf_path (str): The path to the PDF file.
        
    Returns:
        str: A string containing the extracted text from the PDF.
    """
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"  # Extract text only
    return text.strip()

# Usage
text = extract_text_from_pdf("book.pdf")

In [4]:
def clean_extracted_text(text):
    """
    Cleans the extracted text by removing extra spaces and unwanted newlines.
    
    Args:
        text (str): The raw text extracted from the PDF.
        
    Returns:
        str: The cleaned text.
    """
    # Remove multiple spaces and replace with a single space
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove unwanted newlines but keep structured ones
    text = re.sub(r'\n+', '\n', text).strip()
    
    return text

In [5]:
# Clean the extracted text
text = clean_extracted_text(text)

In [6]:
text

'ROXBURGH’S Common Skin Diseases 17th Edition Ronald Marks Emeritus Professor of Dermatology and Former Head of Department of Dermatology University of Wales College of Medicine Cardiff,UK Clinical Professor Department of Dermatology and Skin Surgery University of Miami School of Medicine Miami,USA • • Hodder Arnold A member of the Hodder Headline Group London First published in Great Britain in 2003 by Arnold,a member of the Hodder Headline Group, 338 Euston Road,London NW1 3BH http://www.arnoldpublishers.com Distributed in the United States of America by Oxford University Press Inc., 198 Madison Avenue,New York,NY10016 Oxford is a registered trademark of Oxford University Press © 2003 Arnold All rights reserved. No part of this publication may be reproduced or transmitted in any form or by any means,electronically or mechanically,including photocopying,recording or any information storage or retrieval system,without either prior permission in writing from the publisher or a licence p

In [9]:
def chunk_text(text, chunk_size=500, chunk_overlap=50):
    """
    Splits the cleaned text into smaller chunks for processing.
    
    Args:
        text (str): The cleaned text to be split.
        chunk_size (int): The maximum size of each text chunk.
        chunk_overlap (int): The number of overlapping characters between chunks.
        
    Returns:
        list: A list of text chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    return text_splitter.split_text(text)

In [10]:
text_chunks

['ROXBURGH’S Common Skin Diseases 17th Edition Ronald Marks Emeritus Professor of Dermatology and Former Head of Department of Dermatology University of Wales College of Medicine Cardiff,UK Clinical Professor Department of Dermatology and Skin Surgery University of Miami School of Medicine Miami,USA • • Hodder Arnold A member of the Hodder Headline Group London First published in Great Britain in 2003 by Arnold,a member of the Hodder Headline Group, 338 Euston Road,London NW1 3BH',
 'Headline Group, 338 Euston Road,London NW1 3BH http://www.arnoldpublishers.com Distributed in the United States of America by Oxford University Press Inc., 198 Madison Avenue,New York,NY10016 Oxford is a registered trademark of Oxford University Press © 2003 Arnold All rights reserved. No part of this publication may be reproduced or transmitted in any form or by any means,electronically or mechanically,including photocopying,recording or any information storage or retrieval system,without',
 'information 

In [11]:
def get_biobert_embeddings(text):
    """
    Generates embeddings for the given text using the BioBERT model.
    
    Args:
        text (str): The input text to generate embeddings for.
        
    Returns:
        numpy.ndarray: The generated embeddings as a NumPy array.
    """
    # Load BioBERT model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained("monologg/biobert_v1.1_pubmed")
    model = AutoModel.from_pretrained("monologg/biobert_v1.1_pubmed")

    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=512
    )
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Use mean pooling for sentence embedding
    embeddings = torch.mean(outputs.last_hidden_state, dim=1)
    return embeddings.numpy()

In [14]:
# Initialize Pinecone client
pinecone.init(api_key="your-pinecone-api-key")

index_name = "sknai"

# Check if index exists and create it if not
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        dimension=1536,  # Embedding dimension
        metric="cosine",  # Similarity metric
        cloud="aws",
        region="us-east-1"
    )

# Connect to the index
index = pinecone.Index(index_name)

In [16]:
def upload_to_pinecone(chunks, batch_size=32):
    """
    Uploads text chunks and their embeddings to Pinecone in batches.
    
    Args:
        chunks (list): A list of text chunks to upload.
        batch_size (int): The number of chunks to upload in each batch.
    """
    batch = []
    for i, chunk in tqdm(enumerate(chunks), desc="Uploading to Pinecone"):
        # Generate embedding
        embedding = get_biobert_embeddings(chunk).tolist()[0]
        
        # Create metadata
        metadata = {
            "text": chunk,
            "chunk_id": str(i)
        }
        
        # Add to batch
        batch.append((str(i), embedding, metadata))
        
        # Upload in batches
        if len(batch) >= batch_size:
            index.upsert(vectors=batch)
            batch = []
    
    # Upload any remaining items
    if len(batch) > 0:
        index.upsert(vectors=batch)

# Usage
upload_to_pinecone(text_chunks)

1406it [05:26,  4.31it/s]
