In [23]:
import findspark
findspark.init()

import pyspark  
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, TimestampType
import os
import pyspark.sql.functions as F
import json
import glob

credentials = json.load(open("../credentials/credentials.json"))
spark = SparkSession\
    .builder\
    .master("spark://localhost:7077")\
    .appName('HelloWorld')\
    .config("spark.driver.memory", "6G") \
    .config("spark.executor.memory", "6G") \
    .config("spark.driver.maxResultSize", "6G") \
    .config("fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")\
    .config(f'fs.azure.account.key.{credentials["storage_account_name"]}.blob.core.windows.net',credentials["storage_account_key"])\
    .getOrCreate()


### Spark Version

In [47]:
spark
# print("Spark Version: ", spark.version)

### Loading and downloading HVFHW data 2021 file.
Download is managed by prefect orion client.

~~`prefect orion start`~~

~~`prefect agent start 'default'` ~~

~~Deployment `etl-local-flow` is run with parameters `{"year":[2021],"month":[6]},"color":["fhvhv"]}`~~

**NOTE: The data from the CSV coming from Github repository is different from the website**

In [24]:
df_fvhv = spark.read.csv("../resources/datasets/fhvhv/2021/06",header=True, inferSchema=True,)
df_fvhv.printSchema()


root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: string (nullable = true)
 |-- dropoff_datetime: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



### Changing schema

In [25]:
df_fvhv_schema = \
    StructType(
        [
            StructField("dispatching_base_num",StringType(),True),
            StructField("pickup_datetime",TimestampType(),True),
            StructField("dropoff_datetime",TimestampType(),True),
            StructField("PULocationID",IntegerType(),True),
            StructField("DOLocationID",IntegerType(),True),
            StructField("SR_Flag",StringType(),True),
            StructField("Affiliated_base_number",StringType(),True)
        ])
df_fvhv = spark.read.csv("../resources/datasets/fhvhv/2021/06",header=True, schema=df_fvhv_schema)
df_fvhv.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



In [26]:
df_fvhv.select(["pickup_datetime","dropoff_datetime"]).show(5)

+-------------------+-------------------+
|    pickup_datetime|   dropoff_datetime|
+-------------------+-------------------+
|2021-06-01 00:02:41|2021-06-01 00:07:46|
|2021-06-01 00:16:16|2021-06-01 00:21:14|
|2021-06-01 00:27:01|2021-06-01 00:42:11|
|2021-06-01 00:46:08|2021-06-01 00:53:45|
|2021-06-01 00:45:42|2021-06-01 01:03:33|
+-------------------+-------------------+
only showing top 5 rows



### Question 2

Repartition into 12 partitions and save to parquet.

In [28]:
file_loc = "../resources/datasets/fhvhv_2021_06_partitioned"

df_fvhv.repartition(12).write.option("header", "true").parquet(f"{file_loc}")

parquet_files = glob.glob(os.path.join(file_loc, "*.parquet"))

total_size = 0
for file_path in parquet_files:
    total_size += os.path.getsize(file_path)
print(f"Average file size is: {total_size/len(parquet_files)} bytes")

Average file size is: 23066452.333333332 bytes


### Question 3
How many taxi trips were there on June 15?</br>
Consider only trips that started on June 15.</br>

In [34]:
taxi_trips_count = df_fvhv.where("DAY(pickup_datetime) == 15").count()
print(f"Number of taxi trips on 15th June 2021: {taxi_trips_count}")

Number of taxi trips on 15th June 2021: 452470


### Question 4
Now calculate the duration for each trip.</br>
How long was the longest trip in Hours?</br>

In [44]:
df_fvhv.createOrReplaceTempView("df_fvhv")
trip_durations = spark.sql("""
    SELECT 
        CAST(dropoff_datetime - pickup_datetime AS INTERVAL HOUR) AS trip_duration_hours
    FROM
        df_fvhv
    ORDER BY
        trip_duration_hours DESC
"""
)
trip_durations.show(5)



+-------------------+
|trip_duration_hours|
+-------------------+
| INTERVAL '66' HOUR|
| INTERVAL '25' HOUR|
| INTERVAL '19' HOUR|
| INTERVAL '18' HOUR|
| INTERVAL '16' HOUR|
+-------------------+
only showing top 5 rows



In [None]:

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)

### Question 5
Load the zone lookup data into a temp view in Spark

In [49]:
taxi_zone_lookpup = spark.read.csv("../resources/datasets/taxi+_zone_lookup.csv",header=True, inferSchema=True)
taxi_zone_lookpup.show(5)
taxi_zone_lookpup.createOrReplaceTempView("taxi_zone_lookpup")

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
+----------+-------------+--------------------+------------+
only showing top 5 rows



In [55]:
df_fvhv_full = spark.sql("""
    SELECT
        df_fvhv.dispatching_base_num,
        df_fvhv.pickup_datetime,
        df_fvhv.dropoff_datetime,
        pickup_zone.Borough AS pickup_borough,
        pickup_zone.Zone AS pickup_zone,
        dropoff_zone.Borough AS dropoff_borough,
        dropoff_zone.Zone AS dropoff_zone,
        df_fvhv.SR_Flag,
        df_fvhv.Affiliated_base_number
    FROM df_fvhv
    INNER JOIN taxi_zone_lookpup AS pickup_zone
        ON pickup_zone.LocationID = df_fvhv.PULocationID
    INNER JOIN taxi_zone_lookpup AS dropoff_zone
        ON dropoff_zone.LocationID = df_fvhv.DOLocationID
""")   

df_fvhv_full.printSchema()
df_fvhv_full.createOrReplaceTempView("df_fvhv_full")

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- pickup_borough: string (nullable = true)
 |-- pickup_zone: string (nullable = true)
 |-- dropoff_borough: string (nullable = true)
 |-- dropoff_zone: string (nullable = true)
 |-- SR_Flag: string (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



Using the zone lookup data and the fhvhv June 2021 data, what is the name of the most frequent pickup location zone?

In [57]:
spark.sql("""
    SELECT 
        pickup_zone, 
        COUNT(pickup_zone)
    FROM 
        df_fvhv_full
    GROUP BY pickup_zone
    ORDER BY COUNT(pickup_zone) DESC
""").show(5)



+-------------------+------------------+
|        pickup_zone|count(pickup_zone)|
+-------------------+------------------+
|Crown Heights North|            231279|
|       East Village|            221244|
|        JFK Airport|            188867|
|     Bushwick South|            187929|
|      East New York|            186780|
+-------------------+------------------+
only showing top 5 rows

