# **Project Goal**

* Identify and segment NYC taxi zones into groups with similar demand, revenue and trip characteristics to understand mobility patterns.

**Purpose of this Notebook**
* This is a validation notebook to inspe

In [None]:
import pandas as pd
import geopandas as gdf
import os ,sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [12]:
path_taxi = '../../raw_data/tlc_trip_data/2023/yellow_tripdata_2023-01.parquet'
# path_taxi = "../data/raw/tlc_trip_data/2023/yellow_tripdata_2023-01.parquet"
path_zones_lookup = "../../raw_data/tlc_trip_data/taxi_zone_lookup.csv"
path_zone_shapefile = "../../raw_data/tlc_trip_data/taxi_zones/taxi_zones.shp"

In [4]:
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [7]:
spark = SparkSession.builder \
    .appName("TaxiZone-Data Loading") \
    .getOrCreate()

**Loading 2023 Yellow Taxi Trip Data**

In [13]:
df_taxi = spark.read.parquet(path_taxi).dropna()
df_taxi.show(5)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       2| 2023-01-01 00:32:10|  2023-01-01 00:40:36|            1.0|         0.97|       1.0|                 N|         161|         141|           2|        9.3|  1.0|    0.5|       0.

In [6]:
df_taxi.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



**Loading Taxi Zone Lookup Data**

In [7]:
df_lookup = spark.read.csv(path_zones_lookup, inferSchema=True)
df_lookup.show(5)

+----------+---------+--------------------+------------+
|       _c0|      _c1|                 _c2|         _c3|
+----------+---------+--------------------+------------+
|LocationID|  Borough|                Zone|service_zone|
|         1|      EWR|      Newark Airport|         EWR|
|         2|   Queens|         Jamaica Bay|   Boro Zone|
|         3|    Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|Manhattan|       Alphabet City| Yellow Zone|
+----------+---------+--------------------+------------+
only showing top 5 rows



In [8]:
rename_dict = {"_c0" : "LocationID","_c1" : "borough","_c2" : "zone", "_c3" : "service_zone"}
for x, y in rename_dict.items():
    df_lookup = df_lookup.withColumnRenamed(x, y)

df_lookup.show(5)

+----------+---------+--------------------+------------+
|LocationID|  borough|                zone|service_zone|
+----------+---------+--------------------+------------+
|LocationID|  Borough|                Zone|service_zone|
|         1|      EWR|      Newark Airport|         EWR|
|         2|   Queens|         Jamaica Bay|   Boro Zone|
|         3|    Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|Manhattan|       Alphabet City| Yellow Zone|
+----------+---------+--------------------+------------+
only showing top 5 rows



In [9]:
df_lookup.printSchema()

root
 |-- LocationID: string (nullable = true)
 |-- borough: string (nullable = true)
 |-- zone: string (nullable = true)
 |-- service_zone: string (nullable = true)



**Loading Taxi Zone Shapefile Data**

In [10]:
df_zone = gdf.read_file(path_zone_shapefile).dropna()
df_zone.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 263 entries, 0 to 262
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   OBJECTID    263 non-null    int32   
 1   Shape_Leng  263 non-null    float64 
 2   Shape_Area  263 non-null    float64 
 3   zone        263 non-null    object  
 4   LocationID  263 non-null    int32   
 5   borough     263 non-null    object  
 6   geometry    263 non-null    geometry
dtypes: float64(2), geometry(1), int32(2), object(2)
memory usage: 12.5+ KB


In [11]:
df_zone.drop(columns='geometry', inplace=True)
df_zone.head()

Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough
0,1,0.116357,0.000782,Newark Airport,1,EWR
1,2,0.43347,0.004866,Jamaica Bay,2,Queens
2,3,0.084341,0.000314,Allerton/Pelham Gardens,3,Bronx
3,4,0.043567,0.000112,Alphabet City,4,Manhattan
4,5,0.092146,0.000498,Arden Heights,5,Staten Island


In [12]:
spark_zones = spark.createDataFrame(df_zone)
spark_zones.printSchema()

root
 |-- OBJECTID: long (nullable = true)
 |-- Shape_Leng: double (nullable = true)
 |-- Shape_Area: double (nullable = true)
 |-- zone: string (nullable = true)
 |-- LocationID: long (nullable = true)
 |-- borough: string (nullable = true)



In [13]:
spark_zones.show(5)

+--------+---------------+----------------+--------------------+----------+-------------+
|OBJECTID|     Shape_Leng|      Shape_Area|                zone|LocationID|      borough|
+--------+---------------+----------------+--------------------+----------+-------------+
|       1| 0.116357453189|  7.823067885E-4|      Newark Airport|         1|          EWR|
|       2|  0.43346966679|0.00486634037837|         Jamaica Bay|         2|       Queens|
|       3|0.0843411059012|3.14414156821E-4|Allerton/Pelham G...|         3|        Bronx|
|       4|0.0435665270921|1.11871946192E-4|       Alphabet City|         4|    Manhattan|
|       5|0.0921464898574|4.97957489363E-4|       Arden Heights|         5|Staten Island|
+--------+---------------+----------------+--------------------+----------+-------------+
only showing top 5 rows



**Merging DataFrames spark_zones and df_lookup**

In [14]:
zones_info = spark_zones.join(df_lookup, on=["LocationID", "zone", "borough"], how="left").distinct().dropna()

In [15]:
zones_info.show(5)

+----------+--------------------+-------------+--------+---------------+----------------+------------+
|LocationID|                zone|      borough|OBJECTID|     Shape_Leng|      Shape_Area|service_zone|
+----------+--------------------+-------------+--------+---------------+----------------+------------+
|         8|        Astoria Park|       Queens|       8|0.0275906911574| 2.6587716279E-5|   Boro Zone|
|        16|             Bayside|       Queens|      16| 0.141291873771|8.71889446182E-4|   Boro Zone|
|         2|         Jamaica Bay|       Queens|       2|  0.43346966679|0.00486634037837|   Boro Zone|
|        15|Bay Terrace/Fort ...|       Queens|      15|  0.14433622262|9.25219395547E-4|   Boro Zone|
|         6|Arrochar/Fort Wad...|Staten Island|       6| 0.150490542523|6.06460984581E-4|   Boro Zone|
+----------+--------------------+-------------+--------+---------------+----------------+------------+
only showing top 5 rows



In [16]:
zones_info.printSchema()

root
 |-- LocationID: long (nullable = true)
 |-- zone: string (nullable = true)
 |-- borough: string (nullable = true)
 |-- OBJECTID: long (nullable = true)
 |-- Shape_Leng: double (nullable = true)
 |-- Shape_Area: double (nullable = true)
 |-- service_zone: string (nullable = true)



**Merging zones_info with df_taxi**

In [18]:
df_joined = df_taxi.join(
    zones_info.select(
        col('LocationID').alias('PULocationID'),
        col('zone'),
        col('borough')
    ),
    on="PULocationID",
    how="left"
).dropna()
df_joined.printSchema()

root
 |-- PULocationID: long (nullable = true)
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- zone: string (nullable = true)
 |-- borough: string (nullable = true)



In [19]:
df_joined.show(5)

+------------+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+--------------+--------+
|PULocationID|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|          zone| borough|
+------------+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+--------------+--------+
|          26|       1| 2023-01-01 17:32:54|  2023-01-01 18:02:09|            1.0|          8.1|      99.0|        

In [20]:
core_df = df_joined.select(
    "VendorID",
    col("tpep_pickup_datetime").alias("pickup_time"),
    col("tpep_dropoff_datetime").alias("dropoff_time"),
    col("passenger_count").alias("P_count"),
    col("trip_distance"),
    col("fare_amount").alias("amt"),
    "PULocationID",
    "zone",
    "borough"
)
core_df.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- pickup_time: timestamp_ntz (nullable = true)
 |-- dropoff_time: timestamp_ntz (nullable = true)
 |-- P_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- amt: double (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- zone: string (nullable = true)
 |-- borough: string (nullable = true)



In [21]:
core_df.show(5)

+--------+-------------------+-------------------+-------+-------------+----+------------+--------------+--------+
|VendorID|        pickup_time|       dropoff_time|P_count|trip_distance| amt|PULocationID|          zone| borough|
+--------+-------------------+-------------------+-------+-------------+----+------------+--------------+--------+
|       1|2023-01-01 17:32:54|2023-01-01 18:02:09|    1.0|          8.1|28.2|          26|  Borough Park|Brooklyn|
|       1|2023-01-02 12:12:02|2023-01-02 12:45:55|    1.0|         11.4|38.5|          26|  Borough Park|Brooklyn|
|       2|2023-01-02 13:23:49|2023-01-02 13:58:48|    2.0|         5.12|33.8|          26|  Borough Park|Brooklyn|
|       1|2023-01-02 14:57:10|2023-01-02 15:22:27|    1.0|          0.0|30.5|          26|  Borough Park|Brooklyn|
|       1|2023-01-03 07:40:30|2023-01-03 08:28:42|    1.0|         16.3|54.5|          29|Brighton Beach|Brooklyn|
+--------+-------------------+-------------------+-------+-------------+----+---