In [99]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import types
from pyspark.sql import functions as F 
from pyspark.sql.functions import col
import pandas as pd

### Question 1:

In [4]:
pyspark.__version__

'3.3.1'

### Question 2:

Create a spark session with all available cores.

In [5]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('test') \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/26 22:10:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Get June 2021 FHVHV data.

In [11]:
!wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhvhv/fhvhv_tripdata_2021-06.csv.gz

--2023-02-26 21:54:43--  https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhvhv/fhvhv_tripdata_2021-06.csv.gz
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/4564ad9e-a6da-4923-ad6f-35ff02446a51?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20230226%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230226T215443Z&X-Amz-Expires=300&X-Amz-Signature=e81e5f419666278011072bda474ab36079a16fc90ffb237dbf8f54b95c898c17&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=513814948&response-content-disposition=attachment%3B%20filename%3Dfhvhv_tripdata_2021-06.csv.gz&response-content-type=application%2Foctet-stream [following]
--2023-02-26 21:54:43--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/4564ad9e

Unzip data.

In [12]:
!gzip -d fhvhv_tripdata_2021-06.csv.gz

Check record count.

In [6]:
!wc -l fhvhv_tripdata_2021-06.csv

14961893 fhvhv_tripdata_2021-06.csv


Set data frame.

In [22]:
df = spark.read \
    .option("header", "true") \
    .csv('fhvhv_tripdata_2021-06.csv')

In [53]:
df_cleaned = df \
    .withColumn('pickup_date', F.to_date(df.pickup_datetime)) \
    .withColumn('dropoff_date', F.to_date(df.dropoff_datetime)) \
    .withColumn('pickup_datetime', F.to_timestamp(df.pickup_datetime)) \
    .withColumn('dropoff_datetime', F.to_timestamp(df.dropoff_datetime))

In [54]:
df_cleaned.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- SR_Flag: string (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)
 |-- pickup_date: date (nullable = true)
 |-- dropoff_date: date (nullable = true)



In [55]:
df_cleaned.head(5)

[Row(dispatching_base_num='B02764', pickup_datetime=datetime.datetime(2021, 6, 1, 0, 2, 41), dropoff_datetime=datetime.datetime(2021, 6, 1, 0, 7, 46), PULocationID='174', DOLocationID='18', SR_Flag='N', Affiliated_base_number='B02764', pickup_date=datetime.date(2021, 6, 1), dropoff_date=datetime.date(2021, 6, 1)),
 Row(dispatching_base_num='B02764', pickup_datetime=datetime.datetime(2021, 6, 1, 0, 16, 16), dropoff_datetime=datetime.datetime(2021, 6, 1, 0, 21, 14), PULocationID='32', DOLocationID='254', SR_Flag='N', Affiliated_base_number='B02764', pickup_date=datetime.date(2021, 6, 1), dropoff_date=datetime.date(2021, 6, 1)),
 Row(dispatching_base_num='B02764', pickup_datetime=datetime.datetime(2021, 6, 1, 0, 27, 1), dropoff_datetime=datetime.datetime(2021, 6, 1, 0, 42, 11), PULocationID='240', DOLocationID='127', SR_Flag='N', Affiliated_base_number='B02764', pickup_date=datetime.date(2021, 6, 1), dropoff_date=datetime.date(2021, 6, 1)),
 Row(dispatching_base_num='B02764', pickup_datet

Repartition to 24 partitions.

In [56]:
df_cleaned.repartition(24)

DataFrame[dispatching_base_num: string, pickup_datetime: timestamp, dropoff_datetime: timestamp, PULocationID: string, DOLocationID: string, SR_Flag: string, Affiliated_base_number: string, pickup_date: date, dropoff_date: date]

Save to parquet.

In [57]:
df_cleaned.write.parquet('fhvhv/2021/06/', mode='overwrite')

                                                                                

### Question 3

Find number of trips that started on June 15th:

In [58]:
df_fhvhv = spark.read.parquet('fhvhv/2021/06/')

In [59]:
df_fhvhv.columns

['dispatching_base_num',
 'pickup_datetime',
 'dropoff_datetime',
 'PULocationID',
 'DOLocationID',
 'SR_Flag',
 'Affiliated_base_number',
 'pickup_date',
 'dropoff_date']

In [60]:
df_fhvhv.head(5)

[Row(dispatching_base_num='B02877', pickup_datetime=datetime.datetime(2021, 6, 13, 21, 48, 44), dropoff_datetime=datetime.datetime(2021, 6, 13, 22, 24, 7), PULocationID='234', DOLocationID='216', SR_Flag='N', Affiliated_base_number='B02877', pickup_date=datetime.date(2021, 6, 13), dropoff_date=datetime.date(2021, 6, 13)),
 Row(dispatching_base_num='B02510', pickup_datetime=datetime.datetime(2021, 6, 13, 21, 0, 43), dropoff_datetime=datetime.datetime(2021, 6, 13, 21, 25, 48), PULocationID='109', DOLocationID='206', SR_Flag='N', Affiliated_base_number=None, pickup_date=datetime.date(2021, 6, 13), dropoff_date=datetime.date(2021, 6, 13)),
 Row(dispatching_base_num='B02510', pickup_datetime=datetime.datetime(2021, 6, 13, 21, 28, 39), dropoff_datetime=datetime.datetime(2021, 6, 13, 21, 36, 13), PULocationID='206', DOLocationID='23', SR_Flag='N', Affiliated_base_number=None, pickup_date=datetime.date(2021, 6, 13), dropoff_date=datetime.date(2021, 6, 13)),
 Row(dispatching_base_num='B02510', 

In [61]:
df_fhvhv.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- SR_Flag: string (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)
 |-- pickup_date: date (nullable = true)
 |-- dropoff_date: date (nullable = true)



In [62]:
df_fhvhv.registerTempTable('fhvhv_2021_06')



In [63]:
spark.sql("""
SELECT  count(*)
FROM    fhvhv_2021_06 
WHERE   pickup_date = '2021-06-15'
;
""").show()

+--------+
|count(1)|
+--------+
|  452470|
+--------+



### Question 4

Which day had the longest trip starting on it?

In [73]:
spark.sql("""
SELECT  pickup_date,
        (unix_timestamp(dropoff_datetime)-unix_timestamp(pickup_datetime))/60
FROM    fhvhv_2021_06 
GROUP   BY 1,2
ORDER   BY 2 DESC
;
""").show()



+-----------+---------------------------------------------------------------------------------------------------------------------+
|pickup_date|((unix_timestamp(dropoff_datetime, yyyy-MM-dd HH:mm:ss) - unix_timestamp(pickup_datetime, yyyy-MM-dd HH:mm:ss)) / 60)|
+-----------+---------------------------------------------------------------------------------------------------------------------+
| 2021-06-25|                                                                                                    4012.733333333333|
| 2021-06-22|                                                                                                   1532.9833333333333|
| 2021-06-27|                                                                                                              1198.85|
| 2021-06-26|                                                                                                   1091.8333333333333|
| 2021-06-23|                                                               

                                                                                

### Question 5

Find the most common dispatching_base_num

In [76]:
spark.sql("""
SELECT  distinct(dispatching_base_num),
        count(dispatching_base_num)
FROM    fhvhv_2021_06 
GROUP   BY 1
ORDER   BY 2 DESC
;
""").show()



+--------------------+---------------------------+
|dispatching_base_num|count(dispatching_base_num)|
+--------------------+---------------------------+
|              B02510|                    4126325|
|              B02764|                    1355081|
|              B02872|                    1153791|
|              B02875|                     934990|
|              B02765|                     697992|
|              B02869|                     542710|
|              B02866|                     467799|
|              B02887|                     410821|
|              B02682|                     409345|
|              B02871|                     374807|
|              B02864|                     359982|
|              B02878|                     356478|
|              B02617|                     327372|
|              B02883|                     316249|
|              B02884|                     309708|
|              B02882|                     295276|
|              B02876|         

                                                                                

### Question 6

Find the most common pickup-dropoff pair.

In [77]:
df_zones = spark.read.parquet('zones/')

In [78]:
df_zones.columns

['LocationID', 'Borough', 'Zone', 'service_zone']

In [79]:
df_zones.head(5)

[Row(LocationID='1', Borough='EWR', Zone='Newark Airport', service_zone='EWR'),
 Row(LocationID='2', Borough='Queens', Zone='Jamaica Bay', service_zone='Boro Zone'),
 Row(LocationID='3', Borough='Bronx', Zone='Allerton/Pelham Gardens', service_zone='Boro Zone'),
 Row(LocationID='4', Borough='Manhattan', Zone='Alphabet City', service_zone='Yellow Zone'),
 Row(LocationID='5', Borough='Staten Island', Zone='Arden Heights', service_zone='Boro Zone')]

In [80]:
df_zones.printSchema()

root
 |-- LocationID: string (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- service_zone: string (nullable = true)



In [119]:
df_zones_pu = df_zones
df_zones_do = df_zones

In [121]:
df_result = df_fhvhv.alias("fhv") \
    .join(df_zones_pu.withColumnRenamed('Zone', 'PU_Zone').alias("pu"), col("fhv.PULocationID") == col("pu.LocationID")) \
    .join(df_zones_do.withColumnRenamed('Zone', 'DO_Zone').alias("do"), col("fhv.DOLocationID") == col("do.LocationID"))

In [117]:
df_result.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- SR_Flag: string (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)
 |-- pickup_date: date (nullable = true)
 |-- dropoff_date: date (nullable = true)
 |-- LocationID: string (nullable = true)
 |-- Borough: string (nullable = true)
 |-- PU_Zone: string (nullable = true)
 |-- service_zone: string (nullable = true)
 |-- LocationID: string (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- service_zone: string (nullable = true)



In [122]:
df_result.registerTempTable('fhvhv_2021_06_result')

In [132]:
spark.sql("""
SELECT  DISTINCT(CONCAT(PU_Zone, "/", DO_Zone)) AS PU_DO_Zone,
        COUNT(dispatching_base_num) AS Count
FROM    fhvhv_2021_06_result 
GROUP   BY 1
ORDER   BY 2 DESC
;
""").show(5,truncate=False)



+---------------------------------------+-----+
|PU_DO_Zone                             |Count|
+---------------------------------------+-----+
|East New York/East New York            |47926|
|JFK Airport/NA                         |31321|
|Canarsie/Canarsie                      |28230|
|Crown Heights North/Crown Heights North|25216|
|Borough Park/Borough Park              |24778|
+---------------------------------------+-----+
only showing top 5 rows



                                                                                