In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) a Spark session that runs locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("fhvhv")
    .getOrCreate()
)

In [49]:
# Display the version of Spark backing this session.
spark.version

'3.2.1'

In [4]:
# Load every raw FHVHV parquet file matched by data/raw/fhvhv/*/* into one DataFrame.
# Parquet is self-describing, so no reader options (such as the CSV-only
# "header" flag) are needed for this read.
df = spark.read.parquet("data/raw/fhvhv/*/*")

# Print the schema that Spark picked up from the parquet metadata.
df.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- originating_base_num: string (nullable = true)
 |-- request_datetime: timestamp (nullable = true)
 |-- on_scene_datetime: timestamp (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- trip_miles: double (nullable = true)
 |-- trip_time: long (nullable = true)
 |-- base_passenger_fare: double (nullable = true)
 |-- tolls: double (nullable = true)
 |-- bcf: double (nullable = true)
 |-- sales_tax: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- tips: double (nullable = true)
 |-- driver_pay: double (nullable = true)
 |-- shared_request_flag: string (nullable = true)
 |-- shared_match_flag: string (nullable = true)
 |-- access_a_ride_flag: string (nullable = true)

In [5]:
# Rebind df to a version repartitioned into 24 partitions before it is written out.
# NOTE(review): presumably chosen so the write yields ~24 similarly sized part files - confirm.
df = df.repartition(24)

In [6]:
# Persist the repartitioned data as parquet under data/pq/fhvhv/2021/02/.
# mode="overwrite" replaces any previous output, so re-running this cell
# (e.g. after Restart & Run All) does not fail because the path already exists,
# which is what the default save mode would do.
# NOTE(review): the raw glob above may match more than one month while the target
# folder is hard-coded to 2021/02 - confirm only February 2021 data is present.
df.write.parquet("data/pq/fhvhv/2021/02/", mode="overwrite")

In [7]:
# Read the partitioned parquet output back in; the glob spans every
# two-level sub-directory under data/pq/fhvhv.
df_fhvhv = spark.read.format("parquet").load("data/pq/fhvhv/*/*/")

In [9]:
# List the column names of the re-read dataset.
df_fhvhv.columns

['hvfhs_license_num',
 'dispatching_base_num',
 'originating_base_num',
 'request_datetime',
 'on_scene_datetime',
 'pickup_datetime',
 'dropoff_datetime',
 'PULocationID',
 'DOLocationID',
 'trip_miles',
 'trip_time',
 'base_passenger_fare',
 'tolls',
 'bcf',
 'sales_tax',
 'congestion_surcharge',
 'airport_fee',
 'tips',
 'driver_pay',
 'shared_request_flag',
 'shared_match_flag',
 'access_a_ride_flag',
 'wav_request_flag',
 'wav_match_flag']

In [8]:
# Register the DataFrame as the temporary view "fhvhv_data" so the SQL cells
# below can query it by name (an existing view with that name is replaced).
df_fhvhv.createOrReplaceTempView("fhvhv_data")

In [32]:
# Preview ten trips, putting the recorded trip_time next to the
# dropoff - pickup difference computed in SQL (trip_duration).
trip_duration_preview_sql = """
SELECT
    dispatching_base_num,
    dropoff_datetime,
    pickup_datetime,
    trip_time,
    (to_timestamp(dropoff_datetime) - to_timestamp(pickup_datetime)) AS trip_duration
FROM 
    fhvhv_data
LIMIT 10
"""
spark.sql(trip_duration_preview_sql).show()

+--------------------+-------------------+-------------------+---------+--------------------+
|dispatching_base_num|   dropoff_datetime|    pickup_datetime|trip_time|       trip_duration|
+--------------------+-------------------+-------------------+---------+--------------------+
|              B02882|2021-02-15 00:07:37|2021-02-14 23:41:50|     1547|INTERVAL '0 00:25...|
|              B02871|2021-02-20 18:42:07|2021-02-20 18:20:32|     1295|INTERVAL '0 00:21...|
|              B02882|2021-02-27 21:58:41|2021-02-27 21:44:02|      879|INTERVAL '0 00:14...|
|              B02887|2021-02-06 15:18:50|2021-02-06 15:10:36|      494|INTERVAL '0 00:08...|
|              B02864|2021-02-27 08:54:41|2021-02-27 08:39:02|      939|INTERVAL '0 00:15...|
|              B02875|2021-02-05 09:05:10|2021-02-05 08:50:44|      866|INTERVAL '0 00:14...|
|              B02765|2021-02-24 21:14:18|2021-02-24 20:56:55|     1043|INTERVAL '0 00:17...|
|              B02764|2021-02-28 12:45:26|2021-02-28 12:33:3

In [18]:
# Count the trips whose pickup timestamp falls on 2021-02-15.
sql_query = """
SELECT 
    COUNT(*) AS feb_15_trips
FROM
    fhvhv_data
WHERE 
    TO_DATE(pickup_datetime) == '2021-02-15'
"""
feb_15_trips_df = spark.sql(sql_query)
feb_15_trips_df.show()

+------------+
|feb_15_trips|
+------------+
|      392133|
+------------+



In [39]:
from datetime import datetime

# Cross-check in plain Python: recompute the elapsed seconds for one sampled trip
# (dropoff 2021-02-15 00:07:37, pickup 2021-02-14 23:41:50) so it can be compared
# with the trip_time value reported for that row.
dropoff_datetime, pickup_datetime = (
    datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S")
    for stamp in ("2021-02-15 00:07:37", "2021-02-14 23:41:50")
)
diff = dropoff_datetime - pickup_datetime
print(round(diff.total_seconds()))

1547


In [41]:
# Ten trips with the largest computed trip_duration; "ORDER BY 4" sorts on the
# fourth selected column, i.e. trip_duration.
longest_trips_sql = """
SELECT
    dropoff_datetime,
    pickup_datetime,
    trip_time,
    (to_timestamp(dropoff_datetime) - to_timestamp(pickup_datetime)) AS trip_duration
FROM 
    fhvhv_data
ORDER BY 4 DESC
LIMIT 10
"""
spark.sql(longest_trips_sql).show()

+-------------------+-------------------+---------+--------------------+
|   dropoff_datetime|    pickup_datetime|trip_time|       trip_duration|
+-------------------+-------------------+---------+--------------------+
|2021-02-12 13:39:44|2021-02-11 16:40:44|    75540|INTERVAL '0 20:59...|
|2021-02-18 10:48:34|2021-02-17 18:54:53|    57220|INTERVAL '0 15:53...|
|2021-02-21 03:22:14|2021-02-20 15:08:15|    44038|INTERVAL '0 12:13...|
|2021-02-04 10:41:58|2021-02-03 23:24:25|    40653|INTERVAL '0 11:17...|
|2021-02-20 12:44:01|2021-02-20 02:17:44|    37577|INTERVAL '0 10:26...|
|2021-02-26 05:57:05|2021-02-25 20:13:35|    35010|INTERVAL '0 09:43...|
|2021-02-20 14:16:19|2021-02-20 04:36:13|    34806|INTERVAL '0 09:40...|
|2021-02-19 04:01:11|2021-02-18 18:24:19|    34612|INTERVAL '0 09:36...|
|2021-02-18 14:07:15|2021-02-18 04:31:20|    34555|INTERVAL '0 09:35...|
|2021-02-11 09:21:08|2021-02-10 23:51:39|        0|INTERVAL '0 09:29...|
+-------------------+-------------------+---------+--------------------+

In [47]:
# For each pickup date, take the longest trip, measured as the difference between
# the dropoff and pickup timestamps cast to LONG and divided by 60, then show the
# ten dates with the largest value.
max_duration_per_day_sql = """
SELECT
    to_date(pickup_datetime) AS pickup_date,
    MAX((CAST(dropoff_datetime AS LONG) - CAST(pickup_datetime AS LONG)) / 60) AS duration
FROM 
    fhvhv_data
GROUP BY
    1
ORDER BY
    2 DESC
LIMIT 10;
"""
spark.sql(max_duration_per_day_sql).show()

+-----------+-----------------+
|pickup_date|         duration|
+-----------+-----------------+
| 2021-02-11|           1259.0|
| 2021-02-17|953.6833333333333|
| 2021-02-20|733.9833333333333|
| 2021-02-03|           677.55|
| 2021-02-25|            583.5|
| 2021-02-18|576.8666666666667|
| 2021-02-10|569.4833333333333|
| 2021-02-21|           537.05|
| 2021-02-09|534.7833333333333|
| 2021-02-06|524.1166666666667|
+-----------+-----------------+



In [42]:
# Load the zone lookup table stored as parquet under zones/.
df_zones = spark.read.format("parquet").load("zones/")

In [45]:
# Register the zone lookup as the temporary view "zones" so it can be joined
# against in SQL (an existing view with that name is replaced).
df_zones.createOrReplaceTempView('zones')

In [46]:
# Rank pickup/dropoff zone pairs by trip count: each trip is joined to the zones
# view twice (once per location id), the two zone names are concatenated into a
# single label, and the ten most frequent labels are shown.
top_zone_pairs_sql = """
SELECT
    CONCAT(pul.Zone, ' / ', dol.Zone) AS pu_do_pair,
    COUNT(1)
FROM 
    fhvhv_data fhv LEFT JOIN zones pul ON fhv.PULocationID = pul.LocationID
                      LEFT JOIN zones dol ON fhv.DOLocationID = dol.LocationID
GROUP BY 
    1
ORDER BY
    2 DESC
LIMIT 10;
"""
spark.sql(top_zone_pairs_sql).show()

+--------------------+--------+
|          pu_do_pair|count(1)|
+--------------------+--------+
|East New York / E...|   45041|
|Borough Park / Bo...|   37329|
| Canarsie / Canarsie|   28026|
|Crown Heights Nor...|   25976|
|Bay Ridge / Bay R...|   17934|
|Jackson Heights /...|   14688|
|   Astoria / Astoria|   14688|
|Central Harlem No...|   14481|
|Bushwick South / ...|   14424|
|Flatbush/Ditmas P...|   13976|
+--------------------+--------+

