In [1]:
# Activate Spark in our Colab notebook.
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example: 'spark-3.2.2'
spark_version = 'spark-3.2.2'
# spark_version = 'spark-3.<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.2.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.2.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3.2"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (91.18                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Ign:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:8 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:10 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Get:11 

In [2]:
# Standard-library timer plus the Spark SQL entry point
import time
from pyspark.sql import SparkSession

# Build (or fetch the already-running) session registered under the app name "SparkSQL"
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

In [3]:
# Pull the delayed-flights CSV from the S3 bucket and load it into a DataFrame
from pyspark import SparkFiles

url = "https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-netflix/DelayedFlights.csv"
spark.sparkContext.addFile(url)

# Comma-separated file whose first row holds the column names
df = spark.read.csv(
    SparkFiles.get("DelayedFlights.csv"),
    sep=",",
    header=True,
)
df.show()

+---+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
|_c0|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|
+---+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
|  0|2008|    1|         3|        4| 2003.0|      1955| 2211.0|      2225|       

In [4]:
# Expose the CSV-backed DataFrame to Spark SQL under the name "delays"
df.createOrReplaceTempView("delays")

In [5]:
# Run a SQL query that groups the CSV-backed data on UniqueCarrier.
# The timer wraps the whole call, so it covers loading the data as well as
# running the query. We are only interested in the time it takes to run,
# so run this cell twice and read the second timing.
# perf_counter() is used because it is a monotonic, high-resolution clock
# intended for measuring intervals; time.time() can jump if the system clock changes.
start_time = time.perf_counter()

spark.sql("""select UniqueCarrier,sum(CRSElapsedTime), count(*) from delays group by 1""").show()

print("--- %s seconds ---" % (time.perf_counter() - start_time))

+-------------+-----------------------------------+--------+
|UniqueCarrier|sum(CAST(CRSElapsedTime AS DOUBLE))|count(1)|
+-------------+-----------------------------------+--------+
|           UA|                        2.4361525E7|  141426|
|           AA|                        3.2826567E7|  191865|
|           NW|                        1.1290359E7|   79108|
|           EV|                          8059597.0|   81877|
|           B6|                        1.0088034E7|   55315|
|           DL|                        1.8642864E7|  114238|
|           OO|                        1.2431166E7|  132433|
|           F9|                          4092210.0|   28269|
|           YV|                          6117806.0|   67063|
|           US|                        1.5893179E7|   98425|
|           AQ|                            99698.0|     750|
|           MQ|                        1.3302061E7|  141920|
|           OH|                          5970707.0|   52657|
|           HA|         

In [6]:
# Persist the DataFrame as parquet, replacing any previous 'parquet_delayed' output
df.write.mode("overwrite").parquet("parquet_delayed")

In [7]:
# Load the parquet copy written above back into a DataFrame
p_df = spark.read.parquet("parquet_delayed")

In [8]:
# The parquet-backed DataFrame offers the same methods as the CSV-backed one,
# so it can be registered as a SQL view in exactly the same way.
p_df.createOrReplaceTempView("p_delays")

In [9]:
# Time a per-carrier row count against the (non-partitioned) parquet view.
# perf_counter() is a monotonic clock meant for interval timing, unlike time.time().
start_time = time.perf_counter()

spark.sql("""select UniqueCarrier, count(*) from p_delays group by 1""").show()

print("--- %s seconds ---" % (time.perf_counter() - start_time))

+-------------+--------+
|UniqueCarrier|count(1)|
+-------------+--------+
|           UA|  141426|
|           AA|  191865|
|           NW|   79108|
|           EV|   81877|
|           B6|   55315|
|           DL|  114238|
|           OO|  132433|
|           F9|   28269|
|           YV|   67063|
|           US|   98425|
|           AQ|     750|
|           MQ|  141920|
|           OH|   52657|
|           HA|    7490|
|           XE|  103663|
|           AS|   39293|
|           FL|   71284|
|           CO|  100195|
|           WN|  377602|
|           9E|   51885|
+-------------+--------+

--- 3.2846519947052 seconds ---


In [10]:
# Here is another sample: the same sum/count aggregation that was run against
# the CSV view, now against the parquet view.
# perf_counter() is a monotonic clock meant for interval timing, unlike time.time().
start_time = time.perf_counter()

spark.sql("""select UniqueCarrier,sum(CRSElapsedTime), count(*) from p_delays group by 1""").show()

print("--- %s seconds ---" % (time.perf_counter() - start_time))

+-------------+-----------------------------------+--------+
|UniqueCarrier|sum(CAST(CRSElapsedTime AS DOUBLE))|count(1)|
+-------------+-----------------------------------+--------+
|           UA|                        2.4361525E7|  141426|
|           AA|                        3.2826567E7|  191865|
|           NW|                        1.1290359E7|   79108|
|           EV|                          8059597.0|   81877|
|           B6|                        1.0088034E7|   55315|
|           DL|                        1.8642864E7|  114238|
|           OO|                        1.2431166E7|  132433|
|           F9|                          4092210.0|   28269|
|           YV|                          6117806.0|   67063|
|           US|                        1.5893179E7|   98425|
|           AQ|                            99698.0|     750|
|           MQ|                        1.3302061E7|  141920|
|           OH|                          5970707.0|   52657|
|           HA|         

In [11]:
# Write the data as parquet again, this time partitioned on UniqueCarrier,
# replacing any previous 'delayed_partitioned' output
(
    df.write
    .partitionBy("UniqueCarrier")
    .mode("overwrite")
    .parquet("delayed_partitioned")
)

In [13]:
# Load the carrier-partitioned parquet output into its own DataFrame
p_df_p = spark.read.parquet("delayed_partitioned")

In [14]:
# Register the partitioned DataFrame as the SQL view "p_delays_p"
p_df_p.createOrReplaceTempView("p_delays_p")

In [15]:
# Query the partitioned data, grouping on the partition key (UniqueCarrier).
# perf_counter() is a monotonic clock meant for interval timing, unlike time.time().
start_time = time.perf_counter()

spark.sql("""select UniqueCarrier, count(*) from p_delays_p group by 1""").show()

print("--- %s seconds ---" % (time.perf_counter() - start_time))

+-------------+--------+
|UniqueCarrier|count(1)|
+-------------+--------+
|           UA|  141426|
|           AA|  191865|
|           NW|   79108|
|           EV|   81877|
|           B6|   55315|
|           DL|  114238|
|           OO|  132433|
|           F9|   28269|
|           YV|   67063|
|           US|   98425|
|           AQ|     750|
|           MQ|  141920|
|           OH|   52657|
|           HA|    7490|
|           XE|  103663|
|           AS|   39293|
|           CO|  100195|
|           FL|   71284|
|           WN|  377602|
|           9E|   51885|
+-------------+--------+

--- 2.242203712463379 seconds ---


In [16]:
# Group by the partition key and aggregate another column (CRSElapsedTime).
# perf_counter() is a monotonic clock meant for interval timing, unlike time.time().
start_time = time.perf_counter()

spark.sql("""select UniqueCarrier,sum(CRSElapsedTime) from p_delays_p group by 1""").show()

print("--- %s seconds ---" % (time.perf_counter() - start_time))

+-------------+-----------------------------------+
|UniqueCarrier|sum(CAST(CRSElapsedTime AS DOUBLE))|
+-------------+-----------------------------------+
|           UA|                        2.4361525E7|
|           AA|                        3.2826567E7|
|           NW|                        1.1290359E7|
|           EV|                          8059597.0|
|           B6|                        1.0088034E7|
|           DL|                        1.8642864E7|
|           OO|                        1.2431166E7|
|           F9|                          4092210.0|
|           YV|                          6117806.0|
|           US|                        1.5893179E7|
|           AQ|                            99698.0|
|           MQ|                        1.3302061E7|
|           OH|                          5970707.0|
|           HA|                           970794.0|
|           XE|                        1.2300491E7|
|           AS|                          6005931.0|
|           

In [17]:
# Another query, this time filtering on the partition key (UniqueCarrier='US')
# against the partitioned view.
# perf_counter() is a monotonic clock meant for interval timing, unlike time.time().
start_time = time.perf_counter()
spark.sql("""Select UniqueCarrier, sum(DepDelay) as total_delayed from p_delays_p where UniqueCarrier='US' group by 1""").show()
print("--- %s seconds ---" % (time.perf_counter() - start_time))

+-------------+-------------+
|UniqueCarrier|total_delayed|
+-------------+-------------+
|           US|    3819499.0|
+-------------+-------------+

--- 1.5817921161651611 seconds ---


In [18]:
# Same query as above, run against the non-partitioned parquet view for comparison.
# perf_counter() is a monotonic clock meant for interval timing, unlike time.time().
start_time = time.perf_counter()
spark.sql("""Select UniqueCarrier, sum(DepDelay) as total_delayed from p_delays where UniqueCarrier='US' group by 1""").show()
print("--- %s seconds ---" % (time.perf_counter() - start_time))

+-------------+-------------+
|UniqueCarrier|total_delayed|
+-------------+-------------+
|           US|    3819499.0|
+-------------+-------------+

--- 1.9894812107086182 seconds ---


In [19]:
# Here is a query that does not filter on the partition key at all
# (run against the non-partitioned parquet data).
# perf_counter() is a monotonic clock meant for interval timing, unlike time.time().
start_time = time.perf_counter()
spark.sql("""Select distinct UniqueCarrier, TailNum from p_delays where TailNum='N712SW' """).show()
print("--- %s seconds ---" % (time.perf_counter() - start_time))

+-------------+-------+
|UniqueCarrier|TailNum|
+-------------+-------+
|           WN| N712SW|
+-------------+-------+

--- 1.6174707412719727 seconds ---


In [20]:
# Here is a query that does not filter on the partition key at all
# (run against the partitioned parquet data).
# perf_counter() is a monotonic clock meant for interval timing, unlike time.time().
start_time = time.perf_counter()
spark.sql("""Select distinct UniqueCarrier, TailNum from p_delays_p where TailNum='N712SW' """).show()
print("--- %s seconds ---" % (time.perf_counter() - start_time))

+-------------+-------+
|UniqueCarrier|TailNum|
+-------------+-------+
|           WN| N712SW|
+-------------+-------+

--- 1.7516300678253174 seconds ---
