# Chapter 4. Spark SQL and DataFrames: Introduction to Built-in Data Sources

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
# Build (or reuse) the notebook's Spark session, requesting 3g of driver memory.
spark = (
    SparkSession.builder
    .appName("SparkSQLExampleApp")
    .config("spark.driver.memory", "3g")
    .getOrCreate()
)
spark

## Using Spark SQL in Spark Applications

In [None]:
# Relative path to the departure-delays CSV file.
FLIGHTS_DATA = "../data/departuredelays.csv"
# DDL-style schema string, assembled from (column, type) pairs.
FLIGHTS_SCHEMA = ", ".join(
    f"`{column}` {sql_type}"
    for column, sql_type in (
        ("date", "STRING"),
        ("delay", "INT"),
        ("distance", "INT"),
        ("origin", "STRING"),
        ("destination", "STRING"),
    )
)

In [None]:
# Load the flights CSV with the explicit schema (the file has a header row),
# then register the DataFrame as a temp view so it can be queried with SQL.
df = spark.read.csv(FLIGHTS_DATA, schema=FLIGHTS_SCHEMA, header=True)
df.createOrReplaceTempView("us_delay_flights_tbl")

In [None]:
# Preview the first 10 rows of the loaded flights data.
df.show(n=10)

In [None]:
# Flights longer than 1000 miles, longest first
(spark
 .sql("select * from us_delay_flights_tbl "
      "where distance > 1000 "
      "order by distance desc")
 .show(10))

In [None]:
# Flights from San Francisco (SFO) to Chicago (ORD) with at least a two-hour delay:
# count them per converted date and list the dates with the most such delays first.
(spark
 .sql("select to_date(date, 'MMddHHmm') as date_converted, "
      "count(delay) as delay_count "
      "from us_delay_flights_tbl "
      "where delay >= 120 "
      "and origin = 'SFO' "
      "and destination = 'ORD' "
      "group by date_converted "
      "order by delay_count desc")
 .show(10))

In [None]:
# Add a Flight_Delays label per flight: Very Long (> 360), Long (> 120 up to 360),
# Short (> 60 up to 120), Tolerable (> 0 up to 60), No Delays (= 0), anything else -> Early.
# Rows are sorted by origin, then by delay from largest to smallest.
(spark
 .sql("""
SELECT delay, origin, destination,
CASE
  WHEN delay > 360 THEN 'Very Long Delays'
  WHEN delay > 120 AND delay <= 360 THEN 'Long Delays'
  WHEN delay > 60 AND delay <= 120 THEN 'Short Delays'
  WHEN delay > 0 and delay <= 60 THEN 'Tolerable Delays'
  WHEN delay = 0 THEN 'No Delays'
  ELSE 'Early'
END AS Flight_Delays
FROM us_delay_flights_tbl
ORDER BY origin, delay DESC
""")
 .show(10))

In [None]:
# The SQL queries above, rewritten with the structured API.
# First: flights longer than 1000 miles, longest first.
(df
 .filter(F.col("distance") > 1000)
 .sort(F.col("distance").desc())
 .show(10))

In [None]:
# Structured-API version of the SFO -> ORD query: count flights delayed by at
# least 120 per converted date, with the largest counts first.
# The filter runs before the projection so that `origin` and `destination` are
# still present when the predicate is applied, mirroring the SQL's WHERE-before-SELECT order.
(df
 .where((F.col("delay") >= 120)
        & (F.col("origin") == "SFO")
        & (F.col("destination") == "ORD"))
 .select(F.to_date("date", "MMddHHmm").alias("date_converted"), "delay")
 .groupBy("date_converted")
 .agg(F.count("delay").alias("delay_count"))
 .orderBy(F.desc("delay_count"))
 .show(10))

In [None]:
# Structured-API version of the CASE query: label each flight with a delay
# bucket, then sort by origin and by delay from largest to smallest.
# Columns are referenced by name (F.col / F.desc("delay")), the same way
# F.desc is called in the other structured-API cells, instead of reaching back
# to attributes of the pre-select DataFrame.
(df
 .select("delay", "origin", "destination")
 .withColumn("Flight_Delays",
             F.when(F.col("delay") > 360, "Very Long Delays")
             .when((F.col("delay") > 120) & (F.col("delay") <= 360), "Long Delays")
             .when((F.col("delay") > 60) & (F.col("delay") <= 120), "Short Delays")
             .when((F.col("delay") > 0) & (F.col("delay") <= 60), "Tolerable Delays")
             .when(F.col("delay") == 0, "No Delays")
             .otherwise("Early")
            )
 .orderBy("origin", F.desc("delay"))
 .show(10))

## SQL Tables and Views

In [None]:
# Create the learn_spark_db database if it does not exist yet, then switch to it
# so the tables created below are placed in that database.
spark.sql("create database if not exists learn_spark_db")
spark.sql("use learn_spark_db")

In [None]:
# Create a managed table
# SQL alternative, kept for reference:
# spark.sql("CREATE TABLE managed_us_delay_flights_tbl (date STRING, delay INT, distance INT, origin STRING, destination STRING)")
# Structured API: no path option is given, and the write uses overwrite mode.
df.write.saveAsTable("managed_us_delay_flights_tbl", mode="overwrite")

In [None]:
# Create an unmanaged table: same write as the managed one, plus an explicit data path.
# NOTE(review): a temp view with this same name is registered earlier in the notebook;
# confirm which of the two unqualified queries on `us_delay_flights_tbl` resolve to.
df.write.saveAsTable(
    "us_delay_flights_tbl",
    mode="overwrite",
    path="./data_output/us_flights_delay",
)

In [None]:
# Create global and session-scoped temporary views containing a slice of the flights table

# Flights departing SFO -> global temp view
df_sfo = spark.sql(
    "SELECT date, delay, origin, destination "
    "FROM us_delay_flights_tbl "
    "WHERE origin = 'SFO'"
)
df_sfo.createOrReplaceGlobalTempView("us_origin_airport_SFO_global_tmp_view")

# Flights departing JFK -> session-scoped temp view
df_jfk = spark.sql(
    "SELECT date, delay, origin, destination "
    "FROM us_delay_flights_tbl "
    "WHERE origin = 'JFK'"
)
df_jfk.createOrReplaceTempView("us_origin_airport_JFK_tmp_view")

In [None]:
# Access the global temp view (addressed with the global_temp prefix),
# once through the reader API and once through SQL
(spark.read
 .table("global_temp.us_origin_airport_SFO_global_tmp_view")
 .show(n=2))
(spark
 .sql("SELECT * FROM global_temp.us_origin_airport_SFO_global_tmp_view")
 .show(n=2))

In [None]:
# Access the session-scoped temp view by its bare name,
# once through the reader API and once through SQL
(spark.read
 .table("us_origin_airport_JFK_tmp_view")
 .show(n=2))
(spark
 .sql("SELECT * FROM us_origin_airport_JFK_tmp_view")
 .show(n=2))

In [None]:
# Drop temp views
# The global view is removed with dropGlobalTempView, the session-scoped one with dropTempView.
spark.catalog.dropGlobalTempView("us_origin_airport_SFO_global_tmp_view")
spark.catalog.dropTempView("us_origin_airport_JFK_tmp_view")

In [None]:
# View metadata: databases, tables, and the columns of us_delay_flights_tbl, one listing per line
print(
    spark.catalog.listDatabases(),
    spark.catalog.listTables(),
    spark.catalog.listColumns("us_delay_flights_tbl"),
    sep="\n",
)

In [None]:
# Reading Tables into DataFrames

# Option 1: a SQL query
us_flights_df = spark.sql("select * from us_delay_flights_tbl")
us_flights_df.show(n=2)

# Option 2: look the table up by name
us_flights_df2 = spark.table("us_delay_flights_tbl")
us_flights_df2.show(n=2)

## Data Sources for DataFrames and SQL Tables

In [None]:
# Reading Parquet files into a DataFrame
# The directory is the one the unmanaged table was written to above.
path = "./data_output/us_flights_delay/"
df2 = spark.read.parquet(path)
df2.show(n=2)

In [None]:
# Reading Parquet files into a Spark SQL table:
# define a temporary view directly over the Parquet directory
spark.sql(
    "create or replace temporary view unmanaged2_us_delay_flights_tbl "
    "using parquet "
    "options (path './data_output/us_flights_delay/')"
)

In [None]:
# Query the Parquet-backed temporary view and display the result.
(spark
 .sql("SELECT * FROM unmanaged2_us_delay_flights_tbl")
 .show())

In [None]:
# Stop the Spark session once the notebook is finished.
spark.stop()