In [4]:
import os
import sys
import subprocess
import json
import hashlib
import concurrent.futures
import time
from io import BytesIO

# --- 1. Dependency Setup (Auto-Install) ---
def install_dependencies(packages=None):
    """Install any third-party packages that are not importable yet.

    The pip distribution name and the importable module name often differ
    (``python-pptx`` -> ``pptx``, ``pillow`` -> ``PIL``,
    ``opencv-python-headless`` -> ``cv2``), so availability is checked against
    an explicit mapping instead of being derived from the pip name.

    Args:
        packages: Optional mapping of pip distribution name -> importable
            top-level module name. Defaults to the libraries this script uses.

    Returns:
        The list of pip package names that were installed by this call
        (empty when everything was already available).

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits with an error.
    """
    # Local import keeps this function self-contained; only the stdlib is used.
    import importlib.util

    if packages is None:
        packages = {
            "python-pptx": "pptx",
            "easyocr": "easyocr",
            "pillow": "PIL",
            "pandas": "pandas",
            "numpy": "numpy",
            "opencv-python-headless": "cv2",
        }

    print("System Check: Installing libraries...")
    installed = []
    for package, module_name in packages.items():
        try:
            # find_spec locates the module without executing it, so heavy
            # libraries are not imported just to test for their presence.
            available = importlib.util.find_spec(module_name) is not None
        except (ImportError, ValueError):
            available = False
        if available:
            continue

        print(f"   - Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package, "-q"])
        installed.append(package)

    if installed:
        # Make freshly installed packages visible to the import system.
        importlib.invalidate_caches()

    print("Dependencies ready.\n")
    return installed

# Runs at import time, before the third-party imports that follow, so that
# any missing package is installed before it is imported.
install_dependencies()

import pptx
from pptx.enum.shapes import MSO_SHAPE_TYPE
from pptx.enum.dml import MSO_FILL_TYPE
import easyocr
from PIL import Image
import numpy as np
import cv2

# --- 2. Advanced Computer Vision Pipeline ---

class ImageEnhancer:
    """
    Handles image pre-processing using OpenCV to prepare inputs for OCR.

    Each variant is produced independently, so a failure in one enhancement
    step does not prevent the remaining variants from being generated.
    """

    @staticmethod
    def _denoise(gray):
        """Return a non-local-means denoised copy of a grayscale image."""
        return cv2.fastNlMeansDenoising(gray, h=10, templateWindowSize=7, searchWindowSize=21)

    @staticmethod
    def _boost_contrast(gray):
        """Return the grayscale image after CLAHE (adaptive contrast)."""
        clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
        return clahe.apply(gray)

    @staticmethod
    def _binarize(gray):
        """Return an Otsu-thresholded image, upscaled 2x when narrower than 1000 px."""
        height, width = gray.shape
        if width < 1000:
            # Upscale first so small text survives binarization.
            gray = cv2.resize(gray, (width * 2, height * 2), interpolation=cv2.INTER_CUBIC)
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return binary

    @staticmethod
    def process_image_variants(blob):
        """
        Generate up to 3 pre-processed versions of an encoded image for OCR.

        Args:
            blob: Raw encoded image bytes (as stored in the presentation).

        Returns:
            A list of numpy arrays in the order: denoised grayscale, CLAHE
            contrast-enhanced, Otsu-binarized. Variants whose processing fails
            are skipped; an undecodable image yields an empty list.
        """
        try:
            nparr = np.frombuffer(blob, np.uint8)
            original = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
            if original is None:
                return []
            gray = cv2.cvtColor(original, cv2.COLOR_BGR2GRAY)
        except Exception as exc:
            # Best effort: an unreadable image must not abort the whole batch,
            # but the failure is reported instead of being silently dropped.
            print(f"     Image decode failed: {exc}")
            return []

        builders = (
            ("denoise", ImageEnhancer._denoise),
            ("contrast", ImageEnhancer._boost_contrast),
            ("binarize", ImageEnhancer._binarize),
        )

        variants = []
        for label, build in builders:
            try:
                variants.append(build(gray))
            except Exception as exc:
                # Keep whatever variants succeeded; report the one that did not.
                print(f"     Image variant '{label}' failed: {exc}")

        return variants

# --- 3. The OCR Engine (Parallel & Batched) ---

class PowerOCREngine:
    """
    Collects images found on slides, OCRs every unique image once and caches
    the extracted text keyed by the MD5 hash of the image bytes.

    Attributes (set in ``__init__``):
        reader: The EasyOCR reader used for recognition.
        batch_size: Recognition batch size forwarded to EasyOCR.
        blob_cache: hash -> {"text": str, "conf": float} for processed images.
        queue: One entry per image occurrence: {"hash", "blob", "slide_idx"}.
    """

    def __init__(self, use_gpu=True, batch_size=8):
        """
        Args:
            use_gpu: Passed to ``easyocr.Reader`` as ``gpu``.
            batch_size: Recognition batch size passed to EasyOCR; lower it on
                GPUs with little memory.
        """
        print("Loading Neural Network Models (EasyOCR)...")
        self.reader = easyocr.Reader(['en'], gpu=use_gpu) # Add 'fr', 'de', etc. here if needed
        self.batch_size = batch_size
        self.blob_cache = {} # Hash -> {text, conf}
        self.queue = []      # List of {hash, blob, slide_idx}

    def queue_image(self, blob, slide_idx):
        """
        Record that the image ``blob`` appears on slide ``slide_idx``.

        Every occurrence is recorded, including images whose text is already
        cached: the queue is what links an image to the slide it appeared on,
        so skipping cached images would lose their text for that slide.
        Duplicate OCR work is avoided in ``run_batch_process`` instead.
        Empty or missing blobs are ignored.
        """
        if not blob:
            return

        self.queue.append({
            "hash": hashlib.md5(blob).hexdigest(),
            "blob": blob,
            "slide_idx": slide_idx
        })

    @staticmethod
    def _summarize(result):
        """
        Collapse one EasyOCR detail result into ``(text, average_confidence)``.

        ``result`` is a list of ``(bbox, text, confidence)`` items. Returns
        ``None`` when nothing was read or the joined text is 3 characters or
        shorter (treated as noise).
        """
        if not result:
            return None

        text_found = " ".join(item[1] for item in result)
        if len(text_found) <= 3:
            return None

        # float() keeps the value JSON-serializable even if the library
        # returns a numpy scalar.
        avg_conf = float(sum(item[2] for item in result) / len(result))
        return text_found, avg_conf

    def run_batch_process(self):
        """
        OCR every queued image that is not cached yet and fill ``blob_cache``.

        Steps: build pre-processed variants of each unique image in a thread
        pool, run OCR on every variant, then keep the longest text found among
        the variants of each image. Images for which no usable text is found
        get no cache entry.
        """
        # Deduplicate by hash and skip anything a previous run already handled.
        unique_tasks = {
            item["hash"]: item["blob"]
            for item in self.queue
            if item["hash"] not in self.blob_cache
        }
        total_images = len(unique_tasks)

        if total_images == 0:
            return

        print(f"Starting Parallel OCR on {total_images} unique images...")
        print("   - Step 1: Generating Image Variants (Multi-Core CPU)...")

        # 1. Parallel pre-processing.
        # NOTE(review): threads only help if OpenCV releases the GIL during
        # these operations, as the original author assumed - confirm.
        image_variants_map = {} # hash -> [variant1, variant2, variant3]

        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_hash = {executor.submit(ImageEnhancer.process_image_variants, blob): h
                              for h, blob in unique_tasks.items()}

            for future in concurrent.futures.as_completed(future_to_hash):
                h = future_to_hash[future]
                try:
                    variants = future.result()
                    if variants:
                        image_variants_map[h] = variants
                except Exception as exc:
                    print(f"     Image error: {exc}")

        if not image_variants_map:
            return

        # 2. OCR
        print("   - Step 2: Running Neural Networks (GPU Batching)...")

        # Each variant is read on its own: readtext_batched expects all input
        # images to share one size, which does not hold for arbitrary slide
        # pictures (nor for the upscaled binarized variant).
        temp_results = {} # hash -> [(text, confidence), ...]

        for h, variants in image_variants_map.items():
            for variant in variants:
                try:
                    result = self.reader.readtext(
                        variant,
                        detail=1, # Get confidence scores
                        batch_size=self.batch_size,
                        paragraph=False # Must be False to retrieve confidence scores per word
                    )
                except Exception as exc:
                    # One unreadable variant must not discard the other results.
                    print(f"     OCR error: {exc}")
                    continue

                candidate = self._summarize(result)
                if candidate is not None:
                    temp_results.setdefault(h, []).append(candidate)

        # 3. Aggregation & Voting
        print("   - Step 3: Aggregating & Selecting Best Results...")

        for h, candidates in temp_results.items():
            # Heuristic: the longest extraction usually captured the most data.
            best_text, best_conf = max(candidates, key=lambda x: len(x[0]))

            self.blob_cache[h] = {
                "text": best_text,
                "conf": best_conf
            }

# --- 4. The PPTX Parser ---

class PresentationParser:
    """
    Extracts text, tables, notes and OCR'd image text from a .pptx file.

    Images are queued on the supplied OCR engine while the slide structure is
    walked; OCR runs once afterwards and its results are merged back into the
    per-slide content lists.
    """

    def __init__(self, file_path, ocr_engine):
        """
        Args:
            file_path: Path of the presentation to parse.
            ocr_engine: Object providing ``queue_image``, ``run_batch_process``,
                ``queue`` and ``blob_cache`` (e.g. ``PowerOCREngine``).
        """
        self.file_path = file_path
        self.ocr = ocr_engine
        self.slides_data = []

    def _extract_text_shape(self, shape):
        """
        Extract editable text and tables from a shape.

        Group shapes are descended into so that text boxes nested inside
        groups are captured as well, mirroring how images are searched.

        Returns:
            A list of ``{"type": "text"|"table", "content": ...}`` dicts.
        """
        items = []

        # Text Frame
        if shape.has_text_frame:
            lines = [p.text.strip() for p in shape.text_frame.paragraphs if p.text.strip()]
            if lines:
                items.append({"type": "text", "content": "\n".join(lines)})

        # Table
        if shape.has_table:
            rows = []
            for row in shape.table.rows:
                cell_texts = [cell.text_frame.text.strip().replace("\n", " ") for cell in row.cells]
                if any(cell_texts):
                    rows.append(cell_texts)
            if rows:
                items.append({"type": "table", "content": rows})

        # Group: only group shapes expose a `shapes` collection.
        for sub in getattr(shape, "shapes", ()):
            items.extend(self._extract_text_shape(sub))

        return items

    @staticmethod
    def _picture_fill_blob(shape):
        """Return the image bytes of a shape's picture fill, or None."""
        try:
            if shape.fill.type == MSO_FILL_TYPE.PICTURE:
                # NOTE(review): relies on the fill exposing `.picture.image`;
                # confirm this attribute exists in the installed python-pptx.
                return shape.fill.picture.image.blob
        except Exception:
            # Best effort (the original swallowed everything here); unlike a
            # bare `except`, this lets KeyboardInterrupt/SystemExit through.
            pass
        return None

    @staticmethod
    def _embedded_image_blob(shape):
        """Return the bytes of the image embedded in a shape, or None."""
        try:
            return shape.image.blob
        except (AttributeError, ValueError, KeyError):
            # AttributeError: the shape carries no image at all.
            # ValueError/KeyError: presumably a linked (not embedded) picture
            # or a dangling relationship - nothing to OCR either way.
            return None

    def _find_images_recursive(self, shape, slide_idx):
        """
        Queue every image reachable from ``shape`` for OCR.

        Covers pictures (including pictures inside placeholders and OLE
        preview images, i.e. any shape exposing an embedded ``image``),
        auto shapes with a picture fill, and the members of group shapes.
        """
        try:
            shape_type = shape.shape_type
        except NotImplementedError:
            # Some shape classes may not implement shape_type; treat as unknown.
            shape_type = None

        if shape_type == MSO_SHAPE_TYPE.GROUP:
            for sub in shape.shapes:
                self._find_images_recursive(sub, slide_idx)
            return

        if shape_type == MSO_SHAPE_TYPE.AUTO_SHAPE:
            blob = self._picture_fill_blob(shape)
        else:
            # Not limited to MSO_SHAPE_TYPE.PICTURE: a picture placed in a
            # placeholder reports shape type PLACEHOLDER but still has `.image`.
            blob = self._embedded_image_blob(shape)

        if blob:
            self.ocr.queue_image(blob, slide_idx)

    def parse(self):
        """
        Parse the presentation and return its content per slide.

        Returns:
            A list with one dict per slide: ``{"id": 1-based index,
            "content": [items], "notes": str}``; OCR results are appended as
            ``{"type": "ocr_image", "content": text, "accuracy": percent}``.
            If the file does not exist, ``{"error": "File not found"}`` is
            returned instead.
        """
        if not os.path.exists(self.file_path):
            return {"error": "File not found"}

        print(f"Parsing PPTX Structure: {self.file_path}")
        prs = pptx.Presentation(self.file_path)

        total_slides = len(prs.slides)
        print(f"   - Found {total_slides} slides.")

        # Start from a clean slate so a repeated parse() does not duplicate slides.
        self.slides_data = []
        # Only tasks queued from here on belong to this presentation; earlier
        # entries may stem from another parse sharing the same engine.
        queue_start = len(self.ocr.queue)

        # Phase 1: Structure Extraction & Image Queuing
        for i, slide in enumerate(prs.slides):
            slide_obj = {
                "id": i + 1,
                "content": [],
                "notes": ""
            }

            # Notes (the notes slide may lack a notes placeholder)
            if slide.has_notes_slide:
                notes_frame = slide.notes_slide.notes_text_frame
                if notes_frame is not None:
                    notes = notes_frame.text.strip()
                    if notes:
                        slide_obj["notes"] = notes

            # Shapes
            for shape in slide.shapes:
                # Get Text
                slide_obj["content"].extend(self._extract_text_shape(shape))
                # Get Images (Queues them, doesn't process yet)
                self._find_images_recursive(shape, i)

            self.slides_data.append(slide_obj)

        # Phase 2: Execute OCR
        self.ocr.run_batch_process()

        # Phase 3: Merge OCR Results
        print("Merging OCR data back into structure...")

        for task in self.ocr.queue[queue_start:]:
            cache_entry = self.ocr.blob_cache.get(task["hash"])
            if cache_entry is None:
                continue

            text = cache_entry["text"]
            slide_content = self.slides_data[task["slide_idx"]]['content']

            # Deduplication: Don't add if text is identical to something already on the slide
            is_duplicate = any(
                isinstance(x['content'], str) and x['content'] == text
                for x in slide_content
            )

            if not is_duplicate:
                slide_content.append({
                    "type": "ocr_image",
                    "content": text,
                    "accuracy": round(cache_entry["conf"] * 100, 2)
                })

        return self.slides_data

# --- 5. Main Execution ---

def main():
    """
    Run the full extraction on an uploaded (Colab) or local presentation.

    In Colab the user is prompted to upload a file; elsewhere 'input.pptx' in
    the working directory is used. The parsed result is written next to the
    input as JSON and the first two slides are printed as a preview.
    """
    # Pick the input file: Colab upload if available, local fallback otherwise.
    try:
        from google.colab import files as colab_files
        print("Please upload your .pptx file:")
        uploads = colab_files.upload()
        if not uploads:
            return
        pptx_path = next(iter(uploads))
    except ImportError:
        print("Not running in Colab. Using local file 'input.pptx' if available.")
        pptx_path = "input.pptx" # Change this for local testing

    if not os.path.exists(pptx_path):
        print("No file found to process.")
        return

    # Set up the OCR engine and the parser that feeds it.
    engine = PowerOCREngine(use_gpu=True)
    deck_parser = PresentationParser(pptx_path, engine)

    # Time only the parsing / OCR work.
    started = time.time()
    extracted = deck_parser.parse()
    elapsed = time.time() - started

    # Persist the complete result.
    json_path = pptx_path + "_power_extract.json"
    with open(json_path, "w", encoding="utf-8") as handle:
        json.dump(extracted, handle, indent=2, ensure_ascii=False)

    print(f"\nCOMPLETED in {elapsed:.2f} seconds.")
    print(f"Output saved to: {json_path}")

    # Show a short preview of what was extracted.
    print("\n--- JSON DATA PREVIEW (First 2 Slides) ---")
    preview = extracted[:2]
    print(json.dumps(preview, indent=2, ensure_ascii=False))

# Run the extraction only when executed as a script / notebook cell,
# not when this module is imported.
if __name__ == "__main__":
    main()

System Check: Installing libraries...
   - Installing python-pptx...
   - Installing pillow...
   - Installing opencv-python-headless...
Dependencies ready.

Please upload your .pptx file:




Saving EV-Charging-Infrastructure.pdf.pptx to EV-Charging-Infrastructure.pdf (3).pptx
Loading Neural Network Models (EasyOCR)...
Parsing PPTX Structure: EV-Charging-Infrastructure.pdf (3).pptx
   - Found 10 slides.
Merging OCR data back into structure...

COMPLETED in 0.10 seconds.
Output saved to: EV-Charging-Infrastructure.pdf (3).pptx_power_extract.json

--- JSON DATA PREVIEW (First 2 Slides) ---
[
  {
    "id": 1,
    "content": [
      {
        "type": "text",
        "content": "EV Charging Infrastructure\nBuilding India's Electric Future\nComprehensive business model for scalable EV charging network\ndeployment across Indian markets"
      }
    ],
    "notes": ""
  },
  {
    "id": 2,
    "content": [
      {
        "type": "text",
        "content": "Our Product Portfolio"
      },
      {
        "type": "text",
        "content": "Fast DC Chargers"
      },
      {
        "type": "text",
        "content": "Smart Software Platform"
      },
      {
        "type": "text",