# OCR Extraction

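Extract text from PDF, plain-text, and Word documents. The cell below uses pdfplumber, which reads only a PDF's embedded text layer; scanned (image-only) PDFs need a real OCR engine such as Azure Form Recognizer or Tika with OCR enabled.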

In [None]:
import pdfplumber
import re
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

# Load the bronze Delta table; its rows supply the columns read further down.
bronze_df = spark.read.load(bronze_path, format="delta")

def extract_text(bytes_data, filename):
    """Extract plain text from a document's raw bytes, dispatching on extension.

    Supported extensions (matched case-insensitively):
      * ``.pdf``  - text of every page, joined with newlines (via pdfplumber);
                    a page with no extractable text contributes an empty string.
      * ``.txt``  - the bytes decoded as UTF-8, undecodable bytes dropped.
      * ``.docx`` - text of every paragraph, joined with newlines (python-docx).

    Args:
        bytes_data: Raw file content (bytes-like), or None.
        filename: Name of the file; only its extension is inspected. May be None.

    Returns:
        The extracted text, or "" when either argument is None, the extension
        is unsupported, or extraction fails. This function never raises, so a
        single bad document cannot fail the caller; failures are logged as
        warnings instead.
    """
    # `io` must be imported before any branch uses it. Previously it was only
    # imported inside the .docx branch, which made `io` a function-local name
    # and caused the .pdf branch to hit UnboundLocalError (then swallowed by
    # the blanket except), so every PDF produced "".
    import io
    import logging

    if bytes_data is None or filename is None:
        return ""

    try:
        name = filename.lower()
        if name.endswith(".pdf"):
            with pdfplumber.open(io.BytesIO(bytes_data)) as pdf:
                return "\n".join(page.extract_text() or "" for page in pdf.pages)
        if name.endswith(".txt"):
            return bytes_data.decode("utf-8", errors="ignore")
        if name.endswith(".docx"):
            # Imported lazily so python-docx is only required for .docx input.
            import docx

            document = docx.Document(io.BytesIO(bytes_data))
            return "\n".join(p.text for p in document.paragraphs)
        return ""
    except Exception:
        # Best-effort by design: keep returning "" but leave a trace so
        # systematic failures are no longer invisible.
        logging.getLogger(__name__).warning(
            "Text extraction failed for %r", filename, exc_info=True
        )
        return ""

# Expose the extractor to Spark as a UDF that yields a string column.
extract_text_udf = udf(extract_text, returnType=StringType())

# Add "text_raw", computed per row from the "content" and "filename" columns.
silver_df = bronze_df.withColumn(
    "text_raw",
    extract_text_udf(col("content"), col("filename")),
)
