In [11]:
# Imports 
from pdf2image import convert_from_path
import pytesseract
import os 
from tqdm.autonotebook import tqdm
from markitdown import MarkItDown
from huggingface_hub import InferenceClient
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

[nltk_data] Downloading package punkt to /Users/jeff/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [12]:
# Initialize embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Functions

def extract_text_from_pdf(pdf_path):
    md = MarkItDown()
    result = md.convert(pdf_path)
    return result.text_content

def ocr_text_from_pdf(pdf_path):
    pages = convert_from_path(pdf_path, 300)
    text = ''
    for page in pages:
        text += pytesseract.image_to_string(page)
    return text

def chunk_text(text, max_words=300):
    sentences = sent_tokenize(text)
    chunks, current_chunk = [], []
    word_count = 0
    for sentence in sentences:
        words = sentence.split()
        if word_count + len(words) > max_words:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            word_count = 0
        current_chunk.append(sentence)
        word_count += len(words)
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

def retrieve_chunks(query, index, chunk_map, top_k=5):
    query_vec = embedding_model.encode([query])
    D, I = index.search(np.array(query_vec), top_k)
    return [chunk_map[i] for i in I[0]]

def sba_text_inference(client, query, index, chunk_map):
    relevant_chunks = retrieve_chunks(query, index, chunk_map)
    context = "\n\n".join(relevant_chunks)
    full_prompt = f"""
    You are an FDA reviewer and expert pharmacologist.
    Context extracted from a Summary Basis of Approval:

    {context}

    Extract the following:
    Drug generic name:  
    i. Exposure-safety analyses conducted to support dose selection  
    ii. Exposure-efficacy analyses conducted to support dose selection  
    iii. Post-marketing requirements related to dose selection  
    iv. Maximum Tolerated Dose (MTD)

    Summarize in ≤200 words.
    """
    output = client.chat.completions.create(
        messages=[{"role": "system", "content": "Be concise."},
                  {"role": "user", "content": full_prompt}],
        stream=False,
        max_tokens=2048,
    )
    return output.choices[0].message.content

In [13]:
# Main block
folder = 'SBA_PDF'
pdf_text = []
pdf_chunks = []
for name in tqdm(os.listdir(folder)):
    pdf = folder + '/' + name
    text = extract_text_from_pdf(pdf)
    if len(text.strip()) < 1: 
        text = ocr_text_from_pdf(pdf)
    pdf_text.append(text)
    chunks = chunk_text(text)
    pdf_chunks.extend(chunks)
    print(name, text[:100])

# Build FAISS index
embeddings = embedding_model.encode(pdf_chunks, convert_to_tensor=False)
dimension = embeddings[0].shape[0]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))
chunk_map = {i: chunk for i, chunk in enumerate(pdf_chunks)}

# Run RAG-enhanced inference
client = InferenceClient(model= "Qwen/QwQ-32B")
inference_outputs = []
for _ in pdf_text:
    result = sba_text_inference(client, "Extract dose selection information and MTD", index, chunk_map)
    inference_outputs.append(result)


  0%|          | 0/13 [00:00<?, ?it/s]

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/jeff/nltk_data'
    - '/opt/homebrew/Caskroom/miniforge/base/envs/cdd204/nltk_data'
    - '/opt/homebrew/Caskroom/miniforge/base/envs/cdd204/share/nltk_data'
    - '/opt/homebrew/Caskroom/miniforge/base/envs/cdd204/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
