In [1]:
# Activate Spark in our Colab notebook.
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example: 'spark-3.2.2'
spark_version = 'spark-3.2.2'
# spark_version = 'spark-3.<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.2.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.2.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3.2"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:6 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Get:12 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Get:13 http://security.ubuntu.com/ubuntu bionic-security/restricted amd64 Packages [1,006 k

In [2]:
# Import packages
from pyspark.sql import SparkSession
import time

# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

In [3]:
# Read in data from S3 Bucket
from pyspark import SparkFiles
url = "https://2u-data-curriculum-team.s3.amazonaws.com/nflx-data-science-adv/week-5/DelayedFlights.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("DelayedFlights.csv"), sep=",", header=True)
df.show()


+---+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
|_c0|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|
+---+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
|  0|2008|    1|         3|        4| 2003.0|      1955| 2211.0|      2225|       

In [4]:
#Create temp view named "delays"
df.createOrReplaceTempView('delays')
# Start the runtime
start_time = time.time()
# Using spark.sql write a query that gives you the total distance and the count of every unique Origin, Dest combination
spark.sql("""select Origin, Dest ,sum(Distance), count(*) from delays group by 1,2""").show()
# Print out the runtime.
print("--- %s seconds ---" % (time.time() - start_time))

+------+----+-----------------------------+--------+
|Origin|Dest|sum(CAST(Distance AS DOUBLE))|count(1)|
+------+----+-----------------------------+--------+
|   LAS| LIT|                     147630.0|     114|
|   PHL| MCO|                    1841679.0|    2139|
|   SMF| BUR|                     297140.0|     830|
|   SNA| PHX|                     400868.0|    1186|
|   MCI| IAH|                     267488.0|     416|
|   BFL| SAN|                       5590.0|      26|
|   ROC| CLE|                      78400.0|     320|
|   SPI| ORD|                      55158.0|     317|
|   ATL| GSP|                     107406.0|     702|
|   SFO| PMD|                      67940.0|     215|
|   LAX| OXR|                      10241.0|     209|
|   ORD| PDX|                    1721610.0|     990|
|   PBI| DCA|                     170543.0|     199|
|   FSD| ATL|                      56286.0|      59|
|   MLI| MCO|                      50764.0|      49|
|   MSP| AVL|                      44772.0|   

In [5]:
# Write out the data in parquet format
df.write.parquet('parquet_delay_basic',mode='overwrite')

In [6]:
# Read in our new parquet formatted data
p_df=spark.read.parquet('parquet_delay_basic')

In [7]:
# Convert the dataframe to a view.
p_df.createOrReplaceTempView('p_delays')

In [8]:
# Start the runtime
start_time = time.time()

# Run the same query here

spark.sql("""select Origin, Dest ,sum(Distance), count(*) from p_delays group by 1,2""").show()
# Print out the runtime
print("--- %s seconds ---" % (time.time() - start_time))

+------+----+-----------------------------+--------+
|Origin|Dest|sum(CAST(Distance AS DOUBLE))|count(1)|
+------+----+-----------------------------+--------+
|   LAS| LIT|                     147630.0|     114|
|   PHL| MCO|                    1841679.0|    2139|
|   SMF| BUR|                     297140.0|     830|
|   SNA| PHX|                     400868.0|    1186|
|   MCI| IAH|                     267488.0|     416|
|   BFL| SAN|                       5590.0|      26|
|   ROC| CLE|                      78400.0|     320|
|   SPI| ORD|                      55158.0|     317|
|   ATL| GSP|                     107406.0|     702|
|   SFO| PMD|                      67940.0|     215|
|   LAX| OXR|                      10241.0|     209|
|   ORD| PDX|                    1721610.0|     990|
|   PBI| DCA|                     170543.0|     199|
|   FSD| ATL|                      56286.0|      59|
|   MLI| MCO|                      50764.0|      49|
|   MSP| AVL|                      44772.0|   

In [9]:
# Write out your parquet data, partitioning on the Origin column
df.write.partitionBy("Origin").mode("overwrite").parquet("delayed_partitioned")

In [10]:
# Read in our new parquet formatted data
p_df_p=spark.read.parquet('delayed_partitioned')

In [11]:
# Convert the dataframe to a view.
p_df_p.createOrReplaceTempView('p_delays_p')

In [12]:
# Start the runtime
start_time = time.time()

# Run your query against your partitioned data one more time.
spark.sql("""select Origin, Dest ,sum(Distance), count(*) from p_delays_p group by 1,2""").show()
# Print out the runtime
print("--- %s seconds ---" % (time.time() - start_time))

+------+----+-----------------------------+--------+
|Origin|Dest|sum(CAST(Distance AS DOUBLE))|count(1)|
+------+----+-----------------------------+--------+
|   ATL| GSP|                     107406.0|     702|
|   ORD| PDX|                    1721610.0|     990|
|   LAX| OXR|                      10241.0|     209|
|   LAS| LIT|                     147630.0|     114|
|   EWR| STT|                     117648.0|      72|
|   SFO| PMD|                      67940.0|     215|
|   MSP| AVL|                      44772.0|      52|
|   PHL| MCO|                    1841679.0|    2139|
|   CLE| SJU|                      16551.0|       9|
|   MCI| MKE|                        436.0|       1|
|   MCI| IAH|                     267488.0|     416|
|   SMF| BUR|                     297140.0|     830|
|   SNA| PHX|                     400868.0|    1186|
|   PBI| DCA|                     170543.0|     199|
|   ROC| CLE|                      78400.0|     320|
|   MLI| MCO|                      50764.0|   

In [13]:
# Start  the runtime
start_time = time.time()
# Filter the data on something that selects your partition choice.
spark.sql("""Select distinct Origin, TailNum from p_delays where TailNum='N712SW' """).show()
# Print out the runtime.
print("--- %s seconds ---" % (time.time() - start_time))

+------+-------+
|Origin|TailNum|
+------+-------+
|   MSY| N712SW|
|   RNO| N712SW|
|   RDU| N712SW|
|   MDW| N712SW|
|   IAD| N712SW|
|   ABQ| N712SW|
|   STL| N712SW|
|   MHT| N712SW|
|   BUR| N712SW|
|   PIT| N712SW|
|   MCO| N712SW|
|   JAX| N712SW|
|   HRL| N712SW|
|   TPA| N712SW|
|   CMH| N712SW|
|   BNA| N712SW|
|   IND| N712SW|
|   SNA| N712SW|
|   SMF| N712SW|
|   JAN| N712SW|
+------+-------+
only showing top 20 rows

--- 2.1195805072784424 seconds ---


In [14]:
# Start  the runtime
start_time = time.time()
# Filter the data on something that has nothing to do with your partition choice.
spark.sql("""Select distinct Dest, TailNum from p_delays_p where TailNum='N712SW' """).show()
# Print out the runtime.
print("--- %s seconds ---" % (time.time() - start_time))

+----+-------+
|Dest|TailNum|
+----+-------+
| MSY| N712SW|
| RNO| N712SW|
| RDU| N712SW|
| MDW| N712SW|
| IAD| N712SW|
| ABQ| N712SW|
| STL| N712SW|
| MHT| N712SW|
| BUR| N712SW|
| PIT| N712SW|
| MCO| N712SW|
| JAX| N712SW|
| HRL| N712SW|
| TPA| N712SW|
| CMH| N712SW|
| BNA| N712SW|
| IND| N712SW|
| SNA| N712SW|
| SMF| N712SW|
| JAN| N712SW|
+----+-------+
only showing top 20 rows

--- 3.8938913345336914 seconds ---
