### Reading Parquet data with an inferred schema

In [6]:
from pyspark.sql import SparkSession

# Configure the session step by step: application name, the Spark master at
# spark://spark-master:7077, and 512m of memory per executor.
session_builder = SparkSession.builder
session_builder = session_builder.appName("read-parquet-data")
session_builder = session_builder.master("spark://spark-master:7077")
session_builder = session_builder.config("spark.executor.memory", "512m")

# getOrCreate() hands back the session for this configuration.
spark = session_builder.getOrCreate()

# Keep the notebook output quiet: only ERROR-level log messages are shown.
spark.sparkContext.setLogLevel("ERROR")

In [8]:
# Load the recipes Parquet file into a DataFrame. No schema is passed in,
# so Spark takes it from the file itself.
df = spark.read.load("../data/recipes.parquet", format="parquet")

                                                                                

In [9]:
# Print the schema Spark derived for the recipes DataFrame
# (column names, data types, and nullability).
df.printSchema()

root
 |-- RecipeId: double (nullable = true)
 |-- Name: string (nullable = true)
 |-- AuthorId: integer (nullable = true)
 |-- AuthorName: string (nullable = true)
 |-- CookTime: string (nullable = true)
 |-- PrepTime: string (nullable = true)
 |-- TotalTime: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- RecipeCategory: string (nullable = true)
 |-- Keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- RecipeIngredientQuantities: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- RecipeIngredientParts: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- AggregatedRating: double (nullable = true)
 |-- ReviewCount: integer (nullable = true)
 |-- Calories: double (nullable = true)
 |-- FatContent: double (nullable = true)
 |-- SaturatedFatContent: double (nullable = true)
 |-- CholesterolContent: double (nullable = true)
 |-- SodiumContent: double (nullable = true)
 |-- Carbohydr

In [10]:
# Display the contents of the DataFrame. Called with no arguments, show() uses
# PySpark's documented defaults (first 20 rows, long values truncated).
df.show()

# Alternatives (uncomment to try):

# df.show(50)  # Display the first 50 rows
# df.show(10, truncate=False)  # Display the first 10 rows without truncating values

                                                                                

+--------+--------------------+----------+------------+--------+--------+---------+--------------------+--------------+--------------------+--------------------------+---------------------+----------------+-----------+--------+----------+-------------------+------------------+-------------+-------------------+------------+------------+--------------+--------------+-----------+--------------------+--------------------+-------------+
|RecipeId|                Name|  AuthorId|  AuthorName|CookTime|PrepTime|TotalTime|         Description|RecipeCategory|            Keywords|RecipeIngredientQuantities|RecipeIngredientParts|AggregatedRating|ReviewCount|Calories|FatContent|SaturatedFatContent|CholesterolContent|SodiumContent|CarbohydrateContent|FiberContent|SugarContent|ProteinContent|RecipeServings|RecipeYield|  RecipeInstructions|              Images|DatePublished|
+--------+--------------------+----------+------------+--------+--------+---------+--------------------+--------------+---------

### Reading partitioned data

In [11]:
# Point the reader at the top-level directory of the partitioned dataset
# rather than at a single Parquet file.
df_partitioned = spark.read.load("../data/partitioned_recipes", format="parquet")

                                                                                

In [12]:
# Print the schema of the DataFrame read from the partitioned directory.
df_partitioned.printSchema()

root
 |-- RecipeId: double (nullable = true)
 |-- Name: string (nullable = true)
 |-- AuthorId: integer (nullable = true)
 |-- AuthorName: string (nullable = true)
 |-- CookTime: string (nullable = true)
 |-- PrepTime: string (nullable = true)
 |-- TotalTime: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- RecipeCategory: string (nullable = true)
 |-- Keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- RecipeIngredientQuantities: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- RecipeIngredientParts: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- AggregatedRating: double (nullable = true)
 |-- ReviewCount: integer (nullable = true)
 |-- Calories: double (nullable = true)
 |-- FatContent: double (nullable = true)
 |-- SaturatedFatContent: double (nullable = true)
 |-- CholesterolContent: double (nullable = true)
 |-- SodiumContent: double (nullable = true)
 |-- Carbohydr

In [13]:
# Read only the partitions whose DatePublished value starts with "2020-01".
# When partition directories are passed to load() directly, Spark treats them as
# the base path and no longer exposes DatePublished as a column. Setting basePath
# to the table root keeps partition discovery anchored there, so the
# DatePublished partition column is retained in the result.
# NOTE: this rebinds df_partitioned, replacing the full-dataset DataFrame above.
df_partitioned = (spark.read.format("parquet")
                  .option("basePath", "../data/partitioned_recipes")
                  .load("../data/partitioned_recipes/DatePublished=2020-01*"))

In [14]:
# Print the schema of the DataFrame loaded from the DatePublished=2020-01* partitions.
df_partitioned.printSchema()

root
 |-- RecipeId: double (nullable = true)
 |-- Name: string (nullable = true)
 |-- AuthorId: integer (nullable = true)
 |-- AuthorName: string (nullable = true)
 |-- CookTime: string (nullable = true)
 |-- PrepTime: string (nullable = true)
 |-- TotalTime: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- RecipeCategory: string (nullable = true)
 |-- Keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- RecipeIngredientQuantities: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- RecipeIngredientParts: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- AggregatedRating: double (nullable = true)
 |-- ReviewCount: integer (nullable = true)
 |-- Calories: double (nullable = true)
 |-- FatContent: double (nullable = true)
 |-- SaturatedFatContent: double (nullable = true)
 |-- CholesterolContent: double (nullable = true)
 |-- SodiumContent: double (nullable = true)
 |-- Carbohydr

### Schema merging

In [15]:
# Read the partitioned dataset with the Parquet "mergeSchema" option switched on;
# the option is passed as a keyword argument to load() instead of via .option().
df_merged_schema = spark.read.load(
    "../data/partitioned_recipes",
    format="parquet",
    mergeSchema="true",
)

                                                                                

In [16]:
# Print the schema of the DataFrame that was read with mergeSchema enabled.
df_merged_schema.printSchema()

root
 |-- RecipeId: double (nullable = true)
 |-- Name: string (nullable = true)
 |-- AuthorId: integer (nullable = true)
 |-- AuthorName: string (nullable = true)
 |-- CookTime: string (nullable = true)
 |-- PrepTime: string (nullable = true)
 |-- TotalTime: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- RecipeCategory: string (nullable = true)
 |-- Keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- RecipeIngredientQuantities: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- RecipeIngredientParts: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- AggregatedRating: double (nullable = true)
 |-- ReviewCount: integer (nullable = true)
 |-- Calories: double (nullable = true)
 |-- FatContent: double (nullable = true)
 |-- SaturatedFatContent: double (nullable = true)
 |-- CholesterolContent: double (nullable = true)
 |-- SodiumContent: double (nullable = true)
 |-- Carbohydr

In [17]:
# Stop the Spark session now that the notebook is finished with it.
# The DataFrames created above cannot be used after this call.
spark.stop()