In [1]:
# Activate Spark in our Colab notebook.
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example: 'spark-3.2.2'
spark_version = 'spark-3.2.2'
# spark_version = 'spark-3.<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.2.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.2.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3.2"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [Connecting to archive.ubuntu.com (185.125.190.39)] [1 InRelease 14.2 kB/88.                                                                               Hit:2 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
0% [Waiting for headers] [1 InRelease 88.7 kB/88.7 kB 100%] [Connected to cloud                                                                               Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease
                                                                               Get:4 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
                                                                               Hit:5 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
                                                                               Hit:6 https://developer.download.nvidia.com/compute/cuda/r

In [2]:
# Packages used by the rest of the notebook:
# `time` lets us time our queries, SparkSession is the handle used to run them.
import time

from pyspark.sql import SparkSession

# Build the SparkSession for this notebook under the application name "SparkSQL".
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

In [3]:
# Fetch the violations CSV from the S3 bucket and load it into a DataFrame.
from pyspark import SparkFiles
url = "https://2u-data-curriculum-team.s3.amazonaws.com/nflx-data-science-adv/week-5/NYC_Building_Violations.csv"
spark.sparkContext.addFile(url)
# SparkFiles.get gives the local path of the file registered with addFile.
local_csv = SparkFiles.get("NYC_Building_Violations.csv")
# The first row of the file holds the column names; fields are comma separated.
df = spark.read.options(sep=",", header=True).csv(local_csv)
df.show()

+----------------+----+-------+-----+-----+----------+-------------------+----------------+------------+--------------------+----------------+--------------------+-------------+--------------------+----------+--------------------+--------------------+--------------------+
|ISN_DOB_BIS_VIOL|BORO|    BIN|BLOCK|  LOT|ISSUE_DATE|VIOLATION_TYPE_CODE|VIOLATION_NUMBER|HOUSE_NUMBER|              STREET|DISPOSITION_DATE|DISPOSITION_COMMENTS|DEVICE_NUMBER|         DESCRIPTION|ECB_NUMBER|              NUMBER|  VIOLATION_CATEGORY|      VIOLATION_TYPE|
+----------------+----+-------+-----+-----+----------+-------------------+----------------+------------+--------------------+----------------+--------------------+-------------+--------------------+----------+--------------------+--------------------+--------------------+
|         2286033|   1|1009713|00577|00019|  20180507|                  E|     9027/627971|          34|        WEST 14TH ST|        20220509|PPN203 AOC SUB 05...|      1P13420|    

In [4]:
# Build the summary statistics for every column, then print them.
summary_df = df.summary()
summary_df.show()

+-------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+----------------+------------------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+
|summary|  ISN_DOB_BIS_VIOL|              BORO|               BIN|             BLOCK|               LOT|         ISSUE_DATE|VIOLATION_TYPE_CODE|VIOLATION_NUMBER|      HOUSE_NUMBER|            STREET|    DISPOSITION_DATE|DISPOSITION_COMMENTS|       DEVICE_NUMBER|         DESCRIPTION|          ECB_NUMBER|            NUMBER|  VIOLATION_CATEGORY|      VIOLATION_TYPE|
+-------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+----------------+------------------+------------------+--------------------+--------------------+--------------------+------

In [6]:
 # Let's create a view with our DataFrame and run SQL that will sum up the boroughs by the type of violation.
# The elapsed wall-clock time of the query is printed in seconds.
# Timings are only comparable on a second run, once the one-off "load time" has been paid.

df.createOrReplaceTempView('violations')

start_time = time.time()
csv_totals = spark.sql("""select VIOLATION_TYPE, sum(BORO) from violations group by 1""")
csv_totals.show()
elapsed = time.time() - start_time

print("--- %s seconds ---" % elapsed)

+--------------------+-------------------------+
|      VIOLATION_TYPE|sum(CAST(BORO AS DOUBLE))|
+--------------------+-------------------------+
|LL10/80-LOCAL LAW...|                   3609.0|
|LL11/98-LOCAL LAW...|                   9285.0|
|HVIOS-NYCHA ELEV ...|                    969.0|
|P-PLUMBING       ...|                  29480.0|
|ACH1-(NYCHA) - EL...|                   4949.0|
|LANDMRK-LANDMARK ...|                   5599.0|
|LL5-LOCAL LAW 5/7...|                   1363.0|
|IMD-IMMEDIATE EME...|                     13.0|
|B-BOILER         ...|                  17042.0|
|FISP-FACADE SAFET...|                   6889.0|
|EGNCY-EMERGENCY  ...|                  12607.0|
|ES-ELECTRIC SIGNS...|                  18378.0|
|                null|                    148.0|
|L1198-LOCAL LAW 1...|                  10656.0|
|HBLVIO-HIGH PRESS...|                  14628.0|
|BENCH-FAILURE TO ...|                 110285.0|
|RWNRF-RETAINING W...|                   4007.0|
|FISPNRF-NO REPORT..

In [7]:
# Save the DataFrame in parquet format.
# The call looks almost the same as the one used to write a csv.
# 'overwrite' tells Spark to replace any data already stored at this path.
df.write.mode('overwrite').parquet('parquet_violations')



*   Click the folder icon on the left of the notebook to expose the folders and files stored in your Colab environment. Notice that a new folder is present with the same name as your parquet output (`parquet_violations`).
*   Inside it you will find `part-*.parquet` files and a `_SUCCESS` file.
*   The `_SUCCESS` file is created when Spark creates a Parquet folder.
*   The `part-*` files are binary files that store your compressed data in columnar format.





In [8]:
# Load the parquet data written earlier back into a new DataFrame.
p_df = spark.read.format('parquet').load('parquet_violations')

In [9]:
# A DataFrame read from parquet offers the same methods as the row-based one,
# so it can be registered as a SQL view in exactly the same way.
p_df.createOrReplaceTempView(name='p_violations')

In [10]:
# Repeat the earlier aggregation, this time against the parquet-backed view.
# (Note: with small datasets the two timings may come out very close.)
# As before, run the cell twice so the "load time" does not distort the comparison.

start_time = time.time()
parquet_totals = spark.sql("""select VIOLATION_TYPE, sum(BORO) from p_violations group by 1""")
parquet_totals.show()
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

+--------------------+-------------------------+
|      VIOLATION_TYPE|sum(CAST(BORO AS DOUBLE))|
+--------------------+-------------------------+
|LL10/80-LOCAL LAW...|                   3609.0|
|LL11/98-LOCAL LAW...|                   9285.0|
|HVIOS-NYCHA ELEV ...|                    969.0|
|P-PLUMBING       ...|                  29480.0|
|ACH1-(NYCHA) - EL...|                   4949.0|
|LANDMRK-LANDMARK ...|                   5599.0|
|LL5-LOCAL LAW 5/7...|                   1363.0|
|IMD-IMMEDIATE EME...|                     13.0|
|B-BOILER         ...|                  17042.0|
|FISP-FACADE SAFET...|                   6889.0|
|EGNCY-EMERGENCY  ...|                  12607.0|
|ES-ELECTRIC SIGNS...|                  18378.0|
|                null|                    148.0|
|L1198-LOCAL LAW 1...|                  10656.0|
|HBLVIO-HIGH PRESS...|                  14628.0|
|BENCH-FAILURE TO ...|                 110285.0|
|RWNRF-RETAINING W...|                   4007.0|
|FISPNRF-NO REPORT..

In [11]:
# Writing out a csv file from Spark will also create a folder with "part" files.
# These files are not binary or compressed and in reality are just normal csv files broken into partitions.
# You can see the folder 'out_violations.csv' in the file explorer to the left of the notebook.
# mode='overwrite' replaces the folder if it already exists; with the default save mode
# Spark raises an error when the path is present, so the cell could not be run a second time.
df.write.csv('out_violations.csv', mode='overwrite')