In [89]:
# %% Browser-Compatible PDF Analysis (No Shell Commands)
import micropip
await micropip.install(["pypdf", "scikit-learn", "ipywidgets", "numpy"])

from pypdf import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import ipywidgets as widgets
from IPython.display import display
import numpy as np
from io import BytesIO

class PDFAnalyzer:
    """Index PDF text as TF-IDF word chunks and retrieve the chunks closest to a question.

    Text from every processed PDF accumulates in ``chunks``; the vectorizer and
    the neighbour index are refit on the full list each time a PDF is added.
    """

    def __init__(self):
        self.vectorizer = TfidfVectorizer(stop_words='english')
        # Cosine distance makes `1 - distance` a cosine similarity. With the
        # default Euclidean metric the distance between L2-normalised TF-IDF
        # rows can exceed 1, which produced negative "relevance" scores.
        self.nn = NearestNeighbors(n_neighbors=3, metric='cosine', algorithm='brute')
        # Text chunks from all processed PDFs, in the order they were indexed.
        self.chunks = []

    @staticmethod
    def _chunk_words(words, chunk_size):
        """Join consecutive words into space-separated chunks of at most `chunk_size` words."""
        chunks = []
        pending = []
        for word in words:
            pending.append(word)
            if len(pending) == chunk_size:
                chunks.append(' '.join(pending))
                pending = []
        if pending:
            # Trailing partial chunk.
            chunks.append(' '.join(pending))
        return chunks

    def process_pdf(self, file_content, chunk_size=1000):
        """Extract the text of a PDF, add it to the index and refit the model.

        Args:
            file_content: Raw PDF data as a bytes-like object.
            chunk_size: Maximum number of whitespace-separated words per chunk.
                Chunks may span page boundaries.

        Raises:
            ValueError: If `chunk_size` is smaller than 1, or if there is no
                text to index (neither from this PDF nor from earlier ones).
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")

        reader = PdfReader(BytesIO(file_content))
        # Lazily stream words page by page; extract_text() may yield nothing
        # for a page, hence the `or ""`.
        words = (
            word
            for page in reader.pages
            for word in (page.extract_text() or "").split()
        )
        new_chunks = self._chunk_words(words, chunk_size)

        candidate = self.chunks + new_chunks
        if not candidate:
            raise ValueError("No extractable text found in PDF")

        # Fit on the candidate list first and only then commit the new chunks,
        # so a failed fit does not leave `chunks` holding unindexed text.
        tfidf_matrix = self.vectorizer.fit_transform(candidate)
        self.nn.fit(tfidf_matrix)
        self.chunks.extend(new_chunks)

    def query(self, question):
        """Return the indexed chunks most similar to `question`.

        Args:
            question: Free-text query string.

        Returns:
            A list of ``(chunk_text, similarity)`` tuples, nearest first. It
            holds at most ``self.nn.n_neighbors`` entries and is empty when
            nothing has been indexed yet.
        """
        if not self.chunks:
            return []

        question_vec = self.vectorizer.transform([question])
        # kneighbors() rejects a neighbour count larger than the number of
        # fitted samples, so cap it for small indexes.
        k = min(self.nn.n_neighbors, len(self.chunks))
        distances, indices = self.nn.kneighbors(question_vec, n_neighbors=k)
        return [(self.chunks[i], 1 - dist) for i, dist in zip(indices[0], distances[0])]

# UI Components: one shared analyzer plus the widgets that drive it.
analyzer = PDFAnalyzer()
uploader = widgets.FileUpload(accept='.pdf', multiple=True)
process_btn = widgets.Button(description="Process PDFs")
# continuous_update=False: the `value` trait changes only when the user
# commits the text, not on every keystroke, so observers of `value` do not
# run a query per typed character.
question_input = widgets.Text(
    placeholder='Ask about documents...',
    continuous_update=False,
)
output = widgets.Output()

def handle_upload(change):
    """Index every file currently held by the upload widget and report each outcome.

    `change` is the event payload handed in by the caller; it is not inspected.
    The file list is read from `uploader.value` instead.
    """
    with output:
        output.clear_output()
        for upload in uploader.value:
            try:
                payload = upload['content']
                analyzer.process_pdf(payload)
                size_kb = len(payload) // 1024
                print(f"Processed {upload['name']} - {size_kb}KB")
            except Exception as err:
                # UI boundary: report the failure and carry on with the next file.
                print(f"Error processing {upload['name']}: {err}")

def handle_question(change):
    """Run the current question against the index and print the best-matching sections.

    `change` is the event payload handed in by the caller; it is not inspected.
    The question is read from `question_input.value`. All output is written to
    the `output` widget, replacing whatever it showed before.
    """
    with output:
        output.clear_output()
        if not analyzer.chunks:
            print("Process PDFs first!")
            return

        question = question_input.value.strip()
        if not question:
            # A blank value (e.g. the box was cleared) has nothing to search
            # for; leave the output empty instead of listing arbitrary sections.
            return

        try:
            results = analyzer.query(question)
        except Exception as e:
            # UI boundary: report the failure the same way handle_upload does.
            print(f"Error answering question: {e}")
            return

        print(f"Top {len(results)} relevant sections:")
        for i, (text, score) in enumerate(results):
            print(f"\n📚 Section {i+1} (Relevance: {score:.2%})")
            # Show at most the first 500 characters of each section.
            print(text[:500] + ("..." if len(text) > 500 else ""))
            print("━" * 50)

# Display UI
# Indexing is triggered by the "Process PDFs" button, which was previously
# displayed but not connected to any handler. on_click passes the button
# instance as the argument; handle_upload ignores its parameter and reads
# uploader.value itself. The upload widget is deliberately not observed as
# well, because that would index the same files twice.
process_btn.on_click(handle_upload)
question_input.observe(handle_question, names='value')
display(widgets.VBox([
    uploader,
    process_btn,
    question_input,
    output
]))

VBox(children=(FileUpload(value=(), accept='.pdf', description='Upload', multiple=True), Button(description='P…