#SparkSQL

Working with PySpark's built-in SQL compliant functionality to investigate flights-delay data at scale


*   Understand the limitations of SparkSQL
*   Experiment with createOrReplaceGlobalView
*   Work on same executions through pyspark methods



In [None]:
#Download Data from source
Data Source Repo - https://kloudbitbucket.s3.amazonaws.com/krunal_ds/departuredelays.csv

In [1]:
#Import necessary spark components
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=8120513f922b1381569868f171bb5c2e3bb4040a2bc34c3147319d6ff2dc4bdf
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [3]:
from pyspark.sql import SparkSession


In [5]:
#Create a SparkSession
lti_spark = SparkSession.builder.appName('giri3').getOrCreate()
lti_spark

In [9]:
#Import Data into Spark Native Dataframe
df = lti_spark.read.option('Header','True').csv('/departuredelays.csv')
df.show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01011245|    6|     602|   ABE|        ATL|
|01020600|   -8|     369|   ABE|        DTW|
|01021245|   -2|     602|   ABE|        ATL|
|01020605|   -4|     602|   ABE|        ATL|
|01031245|   -4|     602|   ABE|        ATL|
|01030605|    0|     602|   ABE|        ATL|
|01041243|   10|     602|   ABE|        ATL|
|01040605|   28|     602|   ABE|        ATL|
|01051245|   88|     602|   ABE|        ATL|
|01050605|    9|     602|   ABE|        ATL|
|01061215|   -6|     602|   ABE|        ATL|
|01061725|   69|     602|   ABE|        ATL|
|01061230|    0|     369|   ABE|        DTW|
|01060625|   -3|     602|   ABE|        ATL|
|01070600|    0|     369|   ABE|        DTW|
|01071725|    0|     602|   ABE|        ATL|
|01071230|    0|     369|   ABE|        DTW|
|01070625|    0|     602|   ABE|        ATL|
|01071219|    0|     569|   ABE|        ORD|
|01080600|

In [10]:
#Convert into an SQL complaint format
d = df
d.createOrReplaceTempView('delays')


In [13]:
#Display 100 rows of data with SQL query
lti_spark.sql("select * from delays ").show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01011245|    6|     602|   ABE|        ATL|
|01020600|   -8|     369|   ABE|        DTW|
|01021245|   -2|     602|   ABE|        ATL|
|01020605|   -4|     602|   ABE|        ATL|
|01031245|   -4|     602|   ABE|        ATL|
|01030605|    0|     602|   ABE|        ATL|
|01041243|   10|     602|   ABE|        ATL|
|01040605|   28|     602|   ABE|        ATL|
|01051245|   88|     602|   ABE|        ATL|
|01050605|    9|     602|   ABE|        ATL|
|01061215|   -6|     602|   ABE|        ATL|
|01061725|   69|     602|   ABE|        ATL|
|01061230|    0|     369|   ABE|        DTW|
|01060625|   -3|     602|   ABE|        ATL|
|01070600|    0|     369|   ABE|        DTW|
|01071725|    0|     602|   ABE|        ATL|
|01071230|    0|     369|   ABE|        DTW|
|01070625|    0|     602|   ABE|        ATL|
|01071219|    0|     569|   ABE|        ORD|
|01080600|

In [16]:
#Get a list of all Origin and Destination airports
lti_spark.sql("select origin, destination from delays").show()

+------+-----------+
|origin|destination|
+------+-----------+
|   ABE|        ATL|
|   ABE|        DTW|
|   ABE|        ATL|
|   ABE|        ATL|
|   ABE|        ATL|
|   ABE|        ATL|
|   ABE|        ATL|
|   ABE|        ATL|
|   ABE|        ATL|
|   ABE|        ATL|
|   ABE|        ATL|
|   ABE|        ATL|
|   ABE|        DTW|
|   ABE|        ATL|
|   ABE|        DTW|
|   ABE|        ATL|
|   ABE|        DTW|
|   ABE|        ATL|
|   ABE|        ORD|
|   ABE|        DTW|
+------+-----------+
only showing top 20 rows



In [20]:
#Find the top 5 longest distance travel routes
lti_spark.sql("select * from delays order by distance limit 5").show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|03010610|   -7|     100|   BDL|        EWR|
|01071321|    0|     100|   BDL|        EWR|
|03041010|   -9|     100|   BDL|        EWR|
|01081015|   42|     100|   BDL|        EWR|
|03020610|    4|     100|   BDL|        EWR|
+--------+-----+--------+------+-----------+



In [21]:
df.show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01011245|    6|     602|   ABE|        ATL|
|01020600|   -8|     369|   ABE|        DTW|
|01021245|   -2|     602|   ABE|        ATL|
|01020605|   -4|     602|   ABE|        ATL|
|01031245|   -4|     602|   ABE|        ATL|
|01030605|    0|     602|   ABE|        ATL|
|01041243|   10|     602|   ABE|        ATL|
|01040605|   28|     602|   ABE|        ATL|
|01051245|   88|     602|   ABE|        ATL|
|01050605|    9|     602|   ABE|        ATL|
|01061215|   -6|     602|   ABE|        ATL|
|01061725|   69|     602|   ABE|        ATL|
|01061230|    0|     369|   ABE|        DTW|
|01060625|   -3|     602|   ABE|        ATL|
|01070600|    0|     369|   ABE|        DTW|
|01071725|    0|     602|   ABE|        ATL|
|01071230|    0|     369|   ABE|        DTW|
|01070625|    0|     602|   ABE|        ATL|
|01071219|    0|     569|   ABE|        ORD|
|01080600|

In [43]:
#Find top 3 origin airport with maximum flight delay occurances
lti_spark.sql("select origin, count(delay) from delays where delay>0 group by origin order by count(delay) desc limit 3").show()

+------+------------+
|origin|count(delay)|
+------+------------+
|   ATL|       41828|
|   ORD|       33812|
|   DEN|       30760|
+------+------------+



In [44]:
#Find top 3 destination airport with maximum flight delay occurances
lti_spark.sql("select destination, count(delay) from delays where delay>0 group by destination order by count(delay) desc limit 3").show()

+-----------+------------+
|destination|count(delay)|
+-----------+------------+
|        ATL|       31973|
|        ORD|       24425|
|        DEN|       24001|
+-----------+------------+



In [42]:
#Find the route with maximum delay occurances
lti_spark.sql("select origin, destination, count(delay) from delays where delay>0 group by origin,destination order by count(delay) desc").limit(1).show()

+------+-----------+------------+
|origin|destination|count(delay)|
+------+-----------+------------+
|   LAX|        SFO|        1540|
+------+-----------+------------+



In [45]:
#Find the top three routes with maximum time-delay
lti_spark.sql("select origin, destination from delays order by delay desc").limit(3).show()

+------+-----------+
|origin|destination|
+------+-----------+
|   SMF|        SLC|
|   SJC|        ORD|
|   MOT|        DEN|
+------+-----------+



In [48]:
#Find the distance for top three max time-delay routes
lti_spark.sql("select * from delays order by delay desc").limit(3).show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01090600|  995|     462|   SMF|        SLC|
|03191420|  994|    1590|   SJC|        ORD|
|01200645|  993|     525|   MOT|        DEN|
+--------+-----+--------+------+-----------+

