In [1]:
import pandas as pd
import numpy as np

from pathlib import Path

# Input locations are built relative to the parent of the current working
# directory, so the notebook carries no absolute, machine-specific paths.
ROOT = Path("..")
raw_parquet = ROOT / "data" / "raw" / "yellow_tripdata_2024-01.parquet"
zone_csv = ROOT / "data" / "raw" / "taxi_zone_lookup.csv"

# Load the trip records (parquet) and the zone lookup table (CSV).
df = pd.read_parquet(raw_parquet)
zones = pd.read_csv(zone_csv)

# Render each preview as its own rich output. A bare `df.head(), zones.head()`
# builds a tuple, which Jupyter prints as one wrapped plain-text repr.
# `display` is provided by the IPython/Jupyter kernel without an import.
display(df.head(), zones.head())


(   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
 0         2  2024-01-01 00:57:55   2024-01-01 01:17:43              1.0   
 1         1  2024-01-01 00:03:00   2024-01-01 00:09:36              1.0   
 2         1  2024-01-01 00:17:06   2024-01-01 00:35:01              1.0   
 3         1  2024-01-01 00:36:38   2024-01-01 00:44:56              1.0   
 4         1  2024-01-01 00:46:51   2024-01-01 00:52:57              1.0   
 
    trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
 0           1.72         1.0                  N           186            79   
 1           1.80         1.0                  N           140           236   
 2           4.70         1.0                  N           236            79   
 3           1.40         1.0                  N            79           211   
 4           0.80         1.0                  N           211           148   
 
    payment_type  fare_amount  extra  mta_tax  tip_amount  t

In [2]:
# Report the frame's dimensions (row count, column count), then pandas'
# own per-column summary.
print("Rows, Cols:", (len(df.index), len(df.columns)))
df.info()


Rows, Cols: (2964624, 19)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2964624 entries, 0 to 2964623
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int32         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int32         
 8   DOLocationID           int32         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 

In [3]:
# Fraction of missing values per column, largest first.
missing = (
    df.isna()
    .mean()
    .sort_values(ascending=False)
)
# Show only the columns that actually contain missing values (at most 20).
missing.loc[missing.gt(0)].head(20)

store_and_fwd_flag      0.047278
RatecodeID              0.047278
passenger_count         0.047278
Airport_fee             0.047278
congestion_surcharge    0.047278
dtype: float64

In [4]:
# Summary statistics for every column, transposed so that each original
# column becomes one row; capped at the first 30 rows.
(
    df.describe(include="all")
    .transpose()
    .head(30)
)


Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
VendorID,2964624.0,,,,1.754204,1.0,2.0,2.0,2.0,6.0,0.43259
tpep_pickup_datetime,2964624.0,,,,2024-01-17 00:46:36.431092,2002-12-31 22:59:39,2024-01-09 15:59:19.750000,2024-01-17 10:45:37.500000,2024-01-24 18:23:52.250000,2024-02-01 00:01:15,
tpep_dropoff_datetime,2964624.0,,,,2024-01-17 01:02:13.208130,2002-12-31 23:05:41,2024-01-09 16:16:23,2024-01-17 11:03:51.500000,2024-01-24 18:40:29,2024-02-02 13:56:52,
passenger_count,2824462.0,,,,1.339281,0.0,1.0,1.0,1.0,9.0,0.850282
trip_distance,2964624.0,,,,3.652169,0.0,1.0,1.68,3.11,312722.3,225.462572
RatecodeID,2824462.0,,,,2.069359,1.0,1.0,1.0,1.0,99.0,9.823219
store_and_fwd_flag,2824462.0,2.0,N,2813126.0,,,,,,,
PULocationID,2964624.0,,,,166.017884,1.0,132.0,162.0,234.0,265.0,63.623914
DOLocationID,2964624.0,,,,165.116712,1.0,114.0,162.0,234.0,265.0,69.31535
payment_type,2964624.0,,,,1.161271,0.0,1.0,1.0,1.0,4.0,0.580869


In [5]:
# Collect the pickup/dropoff timestamp columns by name (TLC uses these names
# for yellow). A column qualifies when its lower-cased name contains any of
# the substrings below; this is a name match, not a dtype check.
dt_cols = [
    name
    for name in df.columns
    if any(token in name.lower() for token in ("datetime", "pickup", "dropoff"))
]
dt_cols


['tpep_pickup_datetime', 'tpep_dropoff_datetime']