# OCR Extraction

Use Azure Form Recognizer or Tika/pdfplumber to extract text from scanned PDFs.

In [None]:
%python
%pip install pdfplumber

In [None]:
# Restart the notebook's Python process via the Databricks utility.
# NOTE(review): presumably needed so that packages installed with %pip in an
# earlier cell are importable in the cells that follow — confirm.
dbutils.library.restartPython()

In [None]:

import pdfplumber
import re
import io
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

# Storage locations: the bronze container holding the raw files, and a
# checkpoint directory (not referenced in this cell).
bronze_path = "abfss://bronze@ragstorage4122025.dfs.core.windows.net/"
checkpoint = "dbfs:/checkpoints/contracts/bronze/"

# Load every file under the bronze container, including nested folders, as
# binary rows using Spark's binaryFile source.
bronze_df = spark.read.load(
    bronze_path,
    format="binaryFile",
    recursiveFileLookup="true",
)

def extract_text(bytes_data, filename):
    """Extract plain text from a file's raw bytes, dispatching on its extension.

    Supported extensions (matched case-insensitively):

    * ``.pdf``  - text of every page via pdfplumber, joined with newlines;
      a page that yields no text contributes an empty string.
    * ``.txt``  - bytes decoded as UTF-8; undecodable bytes are dropped and a
      leading byte-order mark is stripped.
    * ``.docx`` - text of every paragraph via python-docx, joined with
      newlines.

    Args:
        bytes_data: Raw file content (bytes-like), or None.
        filename: File name or path used only to determine the extension.

    Returns:
        The extracted text. An empty string is returned for missing input,
        unsupported extensions, and any extraction failure, so a single bad
        file never aborts the surrounding job.
    """
    # Nothing to extract from a missing payload or an unnamed file.
    if bytes_data is None or not filename:
        return ""

    try:
        name = filename.lower()
        if name.endswith(".pdf"):
            with pdfplumber.open(io.BytesIO(bytes_data)) as pdf:
                return "\n".join(page.extract_text() or "" for page in pdf.pages)
        if name.endswith(".txt"):
            # utf-8-sig behaves like utf-8 but drops a leading BOM, which
            # would otherwise leak into the text as U+FEFF.
            return bytes_data.decode("utf-8-sig", errors="ignore")
        if name.endswith(".docx"):
            # Imported lazily so the dependency is only needed when a .docx
            # file is actually encountered.
            import docx

            document = docx.Document(io.BytesIO(bytes_data))
            return "\n".join(p.text for p in document.paragraphs)
    except Exception:
        # Best-effort by design: keep returning "" on failure, but record
        # which file failed and why instead of hiding the error entirely.
        import logging

        logging.getLogger("extract_text").warning(
            "Text extraction failed for %s", filename, exc_info=True
        )
        return ""

    # Unsupported extension.
    return ""

# Expose the extractor to Spark as a UDF that yields a string column.
extract_text_udf = udf(extract_text, StringType())

# Derive `text_raw` from the `content` and `path` columns of the bronze
# DataFrame (loaded through the binaryFile source).
silver_df = bronze_df.withColumn(
    "text_raw", extract_text_udf(col("content"), col("path"))
)

# Persist the result to the silver container as a Delta table, replacing
# whatever was written there before.
silver_path = "abfss://silver@ragstorage4122025.dfs.core.windows.net/"
(
    silver_df.write
    .format("delta")
    .mode("overwrite")
    .save(silver_path)
)