# Working with PDF Files

In [0]:
%pip install PyPDF2
dbutils.library.restartPython()

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType
from io import BytesIO
import PyPDF2

In [0]:
# Read the files under the mounted path with Spark's binaryFile source,
# then render the resulting DataFrame in the notebook.
df = (
    spark.read
    .format("binaryFile")
    .load("{Put Databricks Mounted Path here}")
)
df.display()

In [0]:
def page_number_counter(bytes_data):
    """Count the pages of a PDF supplied as raw bytes.

    Args:
        bytes_data: The binary contents of a PDF file.

    Returns:
        The number of pages as an int, or ``None`` when the data cannot be
        parsed as a PDF (any exception raised while reading is absorbed).
    """
    try:
        # Wrap the bytes in an in-memory stream so PyPDF2 can seek through it.
        reader = PyPDF2.PdfReader(BytesIO(bytes_data))
        page_total = len(reader.pages)
    except Exception:
        # Invalid or corrupted files yield a null instead of failing the row.
        return None
    return page_total

# Expose the page counter to Spark as a UDF that yields an integer column
page_count_udf = udf(page_number_counter, returnType=IntegerType())

# Example usage: compute the page count from each row's binary "content"
df_with_pages = df.withColumn(
    "page_count",
    page_count_udf(df["content"]),
)

df_with_pages.display()

# Working with Image Files

In [0]:
# Read the image files under the mounted path with Spark's binaryFile source
df = (
    spark.read
    .format("binaryFile")
    .load("{Put Databricks Mounted Path here}")
)

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, IntegerType
from io import BytesIO
from PIL import Image

In [0]:
# Define a single Python function to return both width and height
def get_image_dimensions(bytes_data):
    """Return the ``(width, height)`` of an image supplied as raw bytes.

    Args:
        bytes_data: The binary contents of an image file. ``None`` or empty
            input is treated as unreadable.

    Returns:
        A ``(width, height)`` tuple of ints, or ``None`` when the bytes are
        missing or cannot be opened as an image.
    """
    # Nothing to decode, so skip the image parser entirely.
    if not bytes_data:
        return None
    try:
        # The context manager closes the image once the size has been read,
        # instead of leaving the handle open until garbage collection.
        with Image.open(BytesIO(bytes_data)) as img:
            return (img.width, img.height)
    except Exception:
        # Same policy as page_number_counter: one unreadable file produces a
        # null result rather than aborting the whole Spark job.
        return None

# Return schema: a struct holding two nullable integer fields
dimensions_schema = (
    StructType()
    .add("width", IntegerType(), True)
    .add("height", IntegerType(), True)
)

# Register the dimension extractor as a UDF producing that struct
get_image_dimensions_udf = udf(get_image_dimensions, returnType=dimensions_schema)

# Run the UDF once per row, keeping its struct result in a temporary column
df_with_dimensions = df.withColumn(
    "dimensions",
    get_image_dimensions_udf(df["content"]),
)

# Promote the struct's fields to top-level columns, then discard the struct
dimensions_col = df_with_dimensions["dimensions"]
df_with_dimensions = (
    df_with_dimensions
    .withColumn("image_width", dimensions_col["width"])
    .withColumn("image_height", dimensions_col["height"])
    .drop("dimensions")
)

# Print the file path alongside its dimensions without truncating values
df_with_dimensions.select(
    "path",
    "image_width",
    "image_height",
).show(truncate=False)