In [2]:
import pandas as pd
from pathlib import Path

In [3]:
ROOT = Path("..")
clean_dir = ROOT / "data" / "processed" / "yellow_cleaned_parquet"
zones = pd.read_csv(ROOT / "data" / "raw" / "taxi_zone_lookup.csv")

In [4]:
df = pd.read_parquet(clean_dir)

In [5]:
# Join pickup zone names
df = df.merge(zones, how="left", left_on="PULocationID", right_on="LocationID")
df.rename(columns={"Borough":"PU_Borough","Zone":"PU_Zone","service_zone":"PU_service_zone"}, inplace=True)
df.drop(columns=["LocationID"], inplace=True)

df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,Airport_fee,flag_unknown_payment_type,flag_long_duration,flag_extreme_distance,flag_high_passenger_count,pickup_year,pickup_month,PU_Borough,PU_Zone,PU_service_zone
0,2,2002-12-31 22:59:39,2002-12-31 23:05:41,1.0,0.63,1.0,N,170,170,3,...,0.0,False,False,False,False,2002,12,Manhattan,Murray Hill,Yellow Zone
1,2,2002-12-31 22:59:39,2002-12-31 23:05:41,1.0,0.63,1.0,N,170,170,3,...,0.0,False,False,False,False,2002,12,Manhattan,Murray Hill,Yellow Zone
2,2,2002-12-31 22:59:39,2002-12-31 23:05:41,1.0,0.63,1.0,N,170,170,3,...,0.0,False,False,False,False,2002,12,Manhattan,Murray Hill,Yellow Zone
3,2,2009-01-01 23:58:40,2009-01-02 00:01:40,1.0,0.46,1.0,N,137,264,2,...,0.0,False,False,False,False,2009,1,Manhattan,Kips Bay,Yellow Zone
4,2,2009-01-01 23:30:39,2009-01-02 00:01:39,1.0,10.99,1.0,N,237,264,2,...,0.0,False,False,False,False,2009,1,Manhattan,Upper East Side South,Yellow Zone


In [6]:
df["pickup_hour"] = df["tpep_pickup_datetime"].dt.hour
df["pickup_dow"] = df["tpep_pickup_datetime"].dt.day_name()
df["pickup_date"] = df["tpep_pickup_datetime"].dt.date

In [7]:
agg_hour = (
    df.groupby("pickup_hour")
      .agg(trips=("VendorID","size"),
           avg_fare=("fare_amount","mean"),
           avg_distance=("trip_distance","mean"))
      .reset_index()
      .sort_values("pickup_hour")
)
agg_hour.head()

Unnamed: 0,pickup_hour,trips,avg_fare,avg_distance
0,0,300979,19.692419,3.854704
1,1,201948,17.733322,3.267925
2,2,139892,16.623149,3.04057
3,3,91788,18.530557,3.516674
4,4,61136,23.436246,4.870803


In [22]:
agg_dow = (
    df.groupby("pickup_dow")
      .agg(trips=("VendorID","size"),
           avg_fare=("fare_amount","mean"),
           avg_distance=("trip_distance","mean"))
      .reset_index().sort_values("trips", ascending=False)
)
# Order days of week
dow_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
agg_dow["pickup_dow"] = pd.Categorical(agg_dow["pickup_dow"], categories=dow_order, ordered=True)
agg_dow = agg_dow.sort_values("pickup_dow")
agg_dow

Unnamed: 0,pickup_dow,trips,avg_fare,avg_distance
1,Monday,1573196,19.711422,3.775207
5,Tuesday,1796631,18.615656,4.246368
6,Wednesday,1923408,18.453934,3.609234
4,Thursday,1665097,18.603859,3.540862
0,Friday,1585684,18.195373,3.676451
2,Saturday,1629000,17.248652,3.387855
3,Sunday,1306964,18.713754,3.90294


In [18]:
agg_zone = (
    df.groupby(["PU_Borough","PU_Zone"])
      .agg(trips=("VendorID","size"),
           avg_total=("total_amount","mean"),
           avg_tip=("tip_amount","mean"),
           avg_distance=("trip_distance","mean"))
      .reset_index()
      .sort_values("trips", ascending=False)
)
agg_zone.head(15)

Unnamed: 0,PU_Borough,PU_Zone,trips,avg_total,avg_tip,avg_distance
143,Manhattan,Midtown Center,560624,23.954379,3.126143,2.599783
162,Manhattan,Upper East Side South,560511,19.782295,2.621996,1.715703
202,Queens,JFK Airport,553948,80.860175,9.190332,15.942517
161,Manhattan,Upper East Side North,535896,20.258314,2.676041,1.870868
144,Manhattan,Midtown East,417404,23.329076,3.062477,2.258899
156,Manhattan,Times Sq/Theatre District,411872,26.866743,3.353132,2.971887
149,Manhattan,Penn Station/Madison Sq West,408636,24.105197,3.129165,2.303059
135,Manhattan,Lincoln Square East,407200,21.331356,2.827742,2.117785
209,Queens,LaGuardia Airport,350843,66.365069,8.792835,9.711326
164,Manhattan,Upper West Side South,345892,21.215504,2.826878,2.293264


In [23]:
pay_unknown_by_hour = (
    df.groupby("pickup_hour")
      .agg(trips=("VendorID","size"),
           unknown_payments=("flag_unknown_payment_type","sum"))
      .reset_index()
)
pay_unknown_by_hour["unknown_payment_rate"] = pay_unknown_by_hour["unknown_payments"] / pay_unknown_by_hour["trips"]
pay_unknown_by_hour.head()

Unnamed: 0,pickup_hour,trips,unknown_payments,unknown_payment_rate
0,0,300979,21368,0.070995
1,1,201948,18632,0.092261
2,2,139892,10596,0.075744
3,3,91788,8636,0.094086
4,4,61136,9464,0.154802


In [24]:
marts_dir = ROOT / "data" / "processed" / "marts"
marts_dir.mkdir(parents=True, exist_ok=True)

agg_hour.to_parquet(marts_dir / "agg_hour.parquet", index=False)
agg_dow.to_parquet(marts_dir / "agg_dow.parquet", index=False)
agg_zone.to_parquet(marts_dir / "agg_zone.parquet", index=False)
pay_unknown_by_hour.to_parquet(marts_dir / "pay_unknown_by_hour.parquet", index=False)

In [25]:
agg_hour.shape, agg_dow.shape, agg_zone.shape, pay_unknown_by_hour.shape

((24, 4), (7, 4), (255, 6), (24, 4))

In [26]:
test = pd.read_parquet(marts_dir / "agg_hour.parquet")
test.shape

(24, 4)

In [27]:
df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'Airport_fee',
       'flag_unknown_payment_type', 'flag_long_duration',
       'flag_extreme_distance', 'flag_high_passenger_count', 'pickup_year',
       'pickup_month', 'PU_Borough', 'PU_Zone', 'PU_service_zone',
       'pickup_hour', 'pickup_dow', 'pickup_date'],
      dtype='object')