### In this section, you will learn how to create temporary views and tables from the existing built-in data sources. Whether you’re using the DataFrame API or SQL, the queries produce identical outcomes.

## This notebook shows how to use SQL on a US flights dataset.

In [1]:
# findspark is initialised before the pyspark imports below.
# NOTE(review): presumably this is what makes the local Spark installation
# importable in this environment — confirm it is still required.
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Wildcard imports: names used later in this notebook such as StringType,
# col and desc come from these two modules.
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Build (or reuse) the SparkSession used by the first half of the notebook.
spark = (SparkSession
    .builder
    .appName("flight_analysis")
    .getOrCreate())

### Define a UDF to convert the date format into a readable format.

Note: the date is a string with the year missing, so it is difficult to query it with SQL date functions such as year().

In [4]:
def to_date_format_udf(d_str):
  """Reformat a raw flight date string into a more readable form.

  The input is split purely by position: characters 0-1, 2-3, 4-5 and the
  remainder, e.g. "02190925" -> "02/19  09:25" (two spaces separate the
  first two fields from the last two).

  Args:
    d_str: the date string from the data, or None for a NULL column value.

  Returns:
    The reformatted string, or None when d_str is None.
  """
  # Spark passes SQL NULLs to Python UDFs as None; propagate them instead of
  # failing the whole query with a TypeError.
  if d_str is None:
    return None
  # Strings can be sliced directly, so there is no need to explode the value
  # into a list of characters and join the pieces back together.
  return d_str[0:2] + "/" + d_str[2:4] + "  " + d_str[4:6] + ":" + d_str[6:]

to_date_format_udf("02190925")  # raw value as stored in the data (no year) -> "MM/dd  HH:mm"

'02/19  09:25'

In [8]:
# Register the Python function under the same name so it can be called from
# SQL expressions (selectExpr / spark.sql); its declared return type is StringType.
spark.udf.register("to_date_format_udf", to_date_format_udf, StringType())

<function __main__.to_date_format_udf(d_str)>

In [9]:
# Load the US departure-delay CSV with an explicit schema; the file's first
# line is treated as a header.
df = spark.read.load(
    path=r"C:\Users\syed3\Downloads\LearningSparkV2-master\LearningSparkV2-master\databricks-datasets\learning-spark-v2\flights\departuredelays.csv",
    format="csv",
    schema="date STRING, delay INT, distance INT, origin STRING, destination STRING",
    header="true",
)

# Show the DataFrame summary (column names and types)
display(df)

DataFrame[date: string, delay: int, distance: int, origin: string, destination: string]

In [11]:
# Call the registered UDF from a SQL expression and preview the first 10 results
(df
    .selectExpr("to_date_format_udf(date) as data_format")
    .show(n=10, truncate=False))

+------------+
|data_format |
+------------+
|01/01  12:45|
|01/02  06:00|
|01/02  12:45|
|01/02  06:05|
|01/03  12:45|
|01/03  06:05|
|01/04  12:43|
|01/04  06:05|
|01/05  12:45|
|01/05  06:05|
+------------+
only showing top 10 rows



In [12]:
# Expose the DataFrame as a temporary view named us_delay_flights_tbl so the
# cells below can query it with spark.sql(...)
df.createOrReplaceTempView("us_delay_flights_tbl")

In [21]:
# Preview the first 10 rows of the temporary view without truncating values
(spark
    .sql('select * from us_delay_flights_tbl ')
    .show(n=10, truncate=False))

+--------+-----+--------+------+-----------+
|date    |delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01011245|6    |602     |ABE   |ATL        |
|01020600|-8   |369     |ABE   |DTW        |
|01021245|-2   |602     |ABE   |ATL        |
|01020605|-4   |602     |ABE   |ATL        |
|01031245|-4   |602     |ABE   |ATL        |
|01030605|0    |602     |ABE   |ATL        |
|01041243|10   |602     |ABE   |ATL        |
|01040605|28   |602     |ABE   |ATL        |
|01051245|88   |602     |ABE   |ATL        |
|01050605|9    |602     |ABE   |ATL        |
+--------+-----+--------+------+-----------+
only showing top 10 rows



In [15]:
# Add a readable date_fm column computed by the registered UDF.
# NOTE(review): `*` already contains `date`, so selecting `date` again makes
# the column appear twice in the result — presumably to show the raw value
# next to date_fm; drop the extra `date` if the result is reused downstream.

spark.sql("SELECT *, date, to_date_format_udf(date) AS date_fm FROM us_delay_flights_tbl").show(
    10, truncate=False)

+--------+-----+--------+------+-----------+--------+------------+
|date    |delay|distance|origin|destination|date    |date_fm     |
+--------+-----+--------+------+-----------+--------+------------+
|01011245|6    |602     |ABE   |ATL        |01011245|01/01  12:45|
|01020600|-8   |369     |ABE   |DTW        |01020600|01/02  06:00|
|01021245|-2   |602     |ABE   |ATL        |01021245|01/02  12:45|
|01020605|-4   |602     |ABE   |ATL        |01020605|01/02  06:05|
|01031245|-4   |602     |ABE   |ATL        |01031245|01/03  12:45|
|01030605|0    |602     |ABE   |ATL        |01030605|01/03  06:05|
|01041243|10   |602     |ABE   |ATL        |01041243|01/04  12:43|
|01040605|28   |602     |ABE   |ATL        |01040605|01/04  06:05|
|01051245|88   |602     |ABE   |ATL        |01051245|01/05  12:45|
|01050605|9    |602     |ABE   |ATL        |01050605|01/05  06:05|
+--------+-----+--------+------+-----------+--------+------------+
only showing top 10 rows



In [23]:
# Flights whose origin-to-destination distance exceeds 1000, longest first
(spark
    .sql("SELECT distance, origin, destination FROM us_delay_flights_tbl WHERE distance > 1000 ORDER BY distance DESC")
    .show(n=10, truncate=False))

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|4330    |HNL   |JFK        |
|4330    |HNL   |JFK        |
|4330    |HNL   |JFK        |
|4330    |HNL   |JFK        |
|4330    |HNL   |JFK        |
|4330    |HNL   |JFK        |
|4330    |HNL   |JFK        |
|4330    |HNL   |JFK        |
|4330    |HNL   |JFK        |
|4330    |HNL   |JFK        |
+--------+------+-----------+
only showing top 10 rows



In [26]:
# The same query written with the DataFrame API; a string-based variant is:
#df.select("distance", "origin", "destination").where("distance > 1000").orderBy("distance", ascending=False).show(10)
(df
    .select("distance", "origin", "destination")
    .filter(col("distance") > 1000)
    .orderBy(col("distance").desc())
    .show(n=10, truncate=False))

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|4330    |HNL   |JFK        |
|4330    |HNL   |JFK        |
|4330    |HNL   |JFK        |
|4330    |HNL   |JFK        |
|4330    |HNL   |JFK        |
|4330    |HNL   |JFK        |
|4330    |HNL   |JFK        |
|4330    |HNL   |JFK        |
|4330    |HNL   |JFK        |
|4330    |HNL   |JFK        |
+--------+------+-----------+
only showing top 10 rows



In [27]:
# Find all flights from San Francisco (SFO) to Chicago (ORD) that were delayed
# by more than 2 hours (delay > 120), largest delay first
spark.sql("""
SELECT date, delay, origin, destination 
FROM us_delay_flights_tbl 
WHERE delay > 120 AND ORIGIN = 'SFO' AND DESTINATION = 'ORD' 
ORDER by delay DESC
""").show(10, truncate=False)

+--------+-----+------+-----------+
|date    |delay|origin|destination|
+--------+-----+------+-----------+
|02190925|1638 |SFO   |ORD        |
|01031755|396  |SFO   |ORD        |
|01022330|326  |SFO   |ORD        |
|01051205|320  |SFO   |ORD        |
|01190925|297  |SFO   |ORD        |
|02171115|296  |SFO   |ORD        |
|01071040|279  |SFO   |ORD        |
|01051550|274  |SFO   |ORD        |
|03120730|266  |SFO   |ORD        |
|01261104|258  |SFO   |ORD        |
+--------+-----+------+-----------+
only showing top 10 rows



In [28]:
# A more complicated SQL query: label every flight by the size of its delay,
# regardless of destination.
# CASE branches are evaluated top-down and the first match wins, so each branch
# only needs its lower bound. This also closes the gaps of the previous
# two-sided ranges, where delays of exactly 360, 120 or 60 matched no branch
# and were mislabelled 'No Delays'. Zero, negative and NULL delays still fall
# through to 'No Delays', and the 'Long Delays' label no longer carries a
# trailing space.
spark.sql("""SELECT delay, origin, destination,
              CASE
                  WHEN delay > 360 THEN 'Very Long Delays'
                  WHEN delay > 120 THEN 'Long Delays'
                  WHEN delay > 60 THEN 'Short Delays'
                  WHEN delay > 0 THEN 'Tolerable Delays'
                  ELSE 'No Delays'
               END AS Flight_Delays
               FROM us_delay_flights_tbl
               ORDER BY origin, delay DESC""").show(10, truncate=False)

+-----+------+-----------+-------------+
|delay|origin|destination|Flight_Delays|
+-----+------+-----------+-------------+
|333  |ABE   |ATL        |Long Delays  |
|305  |ABE   |ATL        |Long Delays  |
|275  |ABE   |ATL        |Long Delays  |
|257  |ABE   |ATL        |Long Delays  |
|247  |ABE   |ATL        |Long Delays  |
|247  |ABE   |DTW        |Long Delays  |
|219  |ABE   |ORD        |Long Delays  |
|211  |ABE   |ATL        |Long Delays  |
|197  |ABE   |DTW        |Long Delays  |
|192  |ABE   |ORD        |Long Delays  |
+-----+------+-----------+-------------+
only showing top 10 rows



In [13]:
# Stop the current session; the next cell builds a new one with Hive support
spark.stop()

In [19]:
# Build a session with Hive support so the Hive metastore is accessible from a
# Jupyter notebook or any IDE
spark = (SparkSession
    .builder
    .appName("my_app")
    .enableHiveSupport()
    .getOrCreate())

In [22]:
# Start from a clean slate: remove the database if it is left over from an
# earlier run (CASCADE also removes the tables it contains)
spark.sql('DROP DATABASE IF EXISTS learn_spark_db CASCADE ')

DataFrame[]

In [25]:
# Create the database and make it the current one, so the unqualified table
# names used below resolve inside learn_spark_db
spark.sql("CREATE DATABASE if not exists learn_spark_db")
spark.sql("USE learn_spark_db")

DataFrame[]

In [5]:
# Drop the table first so the CREATE TABLE in the next cell can be re-run
spark.sql('DROP TABLE IF EXISTS managed_us_delay_flights_tbl')

DataFrame[]

In [8]:
# Create an empty table with SQL DDL; the columns mirror the CSV schema used
# elsewhere in this notebook. No LOCATION is given in the statement.
spark.sql(
"CREATE TABLE managed_us_delay_flights_tbl (date STRING, delay INT,distance INT, origin STRING, destination STRING)")

DataFrame[]

In [26]:
# Create a table with the DataFrame API instead of SQL DDL
# Path to our US flight delays CSV file
csv_file = r"C:\Users\syed3\Downloads\LearningSparkV2-master\LearningSparkV2-master\databricks-datasets\learning-spark-v2\flights\departuredelays.csv"
# Schema as defined in the preceding example
schema="date STRING, delay INT, distance INT, origin STRING, destination STRING"
# The file starts with a header line (the first read of this file in the
# notebook also sets header=true); without header=True that line would be
# loaded into the table as a data row.
flights_df = spark.read.csv(csv_file, schema=schema, header=True)
flights_df.write.saveAsTable("us_delay_flights_tbl")

In [27]:
# Flights departing from San Francisco (SFO) and from New York (JFK); these two
# DataFrames are the sources of the views created in the next cell
df_sfo = spark.sql(
    "SELECT date, delay, origin, destination FROM us_delay_flights_tbl WHERE origin = 'SFO'"
)
df_jfk = spark.sql(
    "SELECT date, delay, origin, destination FROM us_delay_flights_tbl WHERE origin = 'JFK'"
)

In [28]:
# Create one global temporary view (SFO flights) and one regular temporary
# view (JFK flights)
df_sfo.createOrReplaceGlobalTempView("us_origin_airport_SFO_global_tmp_view")
df_jfk.createOrReplaceTempView("us_origin_airport_JFK_tmp_view")


In [29]:
# Read the JFK temporary view back by name and preview 10 untruncated rows
(spark.read
    .table("us_origin_airport_JFK_tmp_view")
    .show(n=10, truncate=False))

+--------+-----+------+-----------+
|date    |delay|origin|destination|
+--------+-----+------+-----------+
|03010900|-11  |JFK   |LAX        |
|03011200|-3   |JFK   |LAX        |
|03010655|-3   |JFK   |LAX        |
|03011030|47   |JFK   |LAX        |
|03011900|50   |JFK   |LAX        |
|03010800|10   |JFK   |LAX        |
|03011700|1    |JFK   |LAS        |
|03010800|-4   |JFK   |SFO        |
|03011540|-3   |JFK   |DFW        |
|03011710|3    |JFK   |SAN        |
+--------+-----+------+-----------+
only showing top 10 rows



In [33]:
# Spark creates global temporary views in a global temporary database called
# global_temp, so the view name must be qualified with that prefix
spark.sql("SELECT * FROM global_temp.us_origin_airport_SFO_global_tmp_view").show(10)

+--------+-----+------+-----------+
|    date|delay|origin|destination|
+--------+-----+------+-----------+
|01011250|   55|   SFO|        JFK|
|01012230|    0|   SFO|        JFK|
|01010705|   -7|   SFO|        JFK|
|01010620|   -3|   SFO|        MIA|
|01010915|   -3|   SFO|        LAX|
|01011005|   -8|   SFO|        DFW|
|01011800|    0|   SFO|        ORD|
|01011740|   -7|   SFO|        LAX|
|01012015|   -7|   SFO|        LAX|
|01012110|   -1|   SFO|        MIA|
+--------+-----+------+-----------+
only showing top 10 rows



In [39]:
# Drop both views with SQL. The global temporary view lives in the global_temp
# database, so it must be qualified; unqualified, DROP VIEW IF EXISTS finds no
# such view and silently does nothing.
spark.sql('DROP VIEW IF EXISTS global_temp.us_origin_airport_SFO_global_tmp_view')
spark.sql('DROP VIEW IF EXISTS us_origin_airport_JFK_tmp_view')

DataFrame[]

In [41]:
# The same clean-up through the catalog API: one call for the global temporary
# view and one for the regular temporary view
spark.catalog.dropGlobalTempView("us_origin_airport_SFO_global_tmp_view")
spark.catalog.dropTempView("us_origin_airport_JFK_tmp_view")

In [46]:
# View the metadata of databases, tables and columns through the catalog.
# Only the database listing is executed here; the two calls below are
# alternatives to uncomment as needed.
spark.catalog.listDatabases()
# spark.catalog.listTables()
# spark.catalog.listColumns("us_delay_flights_tbl")

[Database(name='default', description='Default Hive database', locationUri='file:/C:/Users/syed3/Downloads/Python/Python/Chapter04/spark-warehouse'),
 Database(name='learn_spark_db', description='', locationUri='file:/C:/Users/syed3/Downloads/Python/Python/Chapter04/spark-warehouse/learn_spark_db.db')]