In [1]:
import pandas as pd

In [13]:
# Read the parquet file
file_path = 'filepath/fhvhv_tripdata_2024-06.parquet'
df = pd.read_parquet(file_path)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20123226 entries, 0 to 20123225
Data columns (total 24 columns):
 #   Column                Dtype         
---  ------                -----         
 0   hvfhs_license_num     object        
 1   dispatching_base_num  object        
 2   originating_base_num  object        
 3   request_datetime      datetime64[ns]
 4   on_scene_datetime     datetime64[ns]
 5   pickup_datetime       datetime64[ns]
 6   dropoff_datetime      datetime64[ns]
 7   PULocationID          int32         
 8   DOLocationID          int32         
 9   trip_miles            float64       
 10  trip_time             int64         
 11  base_passenger_fare   float64       
 12  tolls                 float64       
 13  bcf                   float64       
 14  sales_tax             float64       
 15  congestion_surcharge  float64       
 16  airport_fee           float64       
 17  tips                  float64       
 18  driver_pay            float64       
 19

In [15]:
# List of columns to drop
columns_to_drop = [
    'hvfhs_license_num', 'dispatching_base_num', 'originating_base_num', 
    'request_datetime', 'on_scene_datetime', 'base_passenger_fare', 'tolls', 
    'bcf', 'sales_tax', 'congestion_surcharge', 'airport_fee', 'tips', 
    'driver_pay', 'shared_request_flag', 'shared_match_flag', 'access_a_ride_flag', 
    'wav_request_flag', 'wav_match_flag', 'dropoff_datetime', 'DOLocationID'
]

df = df.drop(columns=columns_to_drop)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20123226 entries, 0 to 20123225
Data columns (total 4 columns):
 #   Column           Dtype         
---  ------           -----         
 0   pickup_datetime  datetime64[ns]
 1   PULocationID     int32         
 2   trip_miles       float64       
 3   trip_time        int64         
dtypes: datetime64[ns](1), float64(1), int32(1), int64(1)
memory usage: 537.3 MB


In [17]:
# Removing rows with no pickup or dropoff times
df.dropna(subset=['pickup_datetime'], inplace=True)

In [18]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20123226 entries, 0 to 20123225
Data columns (total 4 columns):
 #   Column           Dtype         
---  ------           -----         
 0   pickup_datetime  datetime64[ns]
 1   PULocationID     int32         
 2   trip_miles       float64       
 3   trip_time        int64         
dtypes: datetime64[ns](1), float64(1), int32(1), int64(1)
memory usage: 537.3 MB


Unnamed: 0,PULocationID,trip_miles,trip_time
count,20123230.0,20123230.0,20123230.0
mean,138.0636,5.172981,1240.024
std,74.93934,5.990272,903.1166
min,2.0,0.0,0.0
25%,75.0,1.62,619.0
50%,138.0,3.1,1001.0
75%,209.0,6.49,1590.0
max,265.0,363.55,36156.0


In [19]:
# Notes from summary table: trips with zero miles and/or minutes
#                           outliers with very large distances and long times

# Filtering outliers from the data
cutoff_miles = df['trip_miles'].quantile(.995)
cutoff_time = df['trip_time'].quantile(.995)

df = df[(df['trip_miles'] > 0.05) &
        (df['trip_time'] >= 60) &
        (df['trip_miles'] <= cutoff_miles) &
        (df['trip_time'] <= cutoff_time)
       ] 

In [20]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19949628 entries, 0 to 20123225
Data columns (total 4 columns):
 #   Column           Dtype         
---  ------           -----         
 0   pickup_datetime  datetime64[ns]
 1   PULocationID     int32         
 2   trip_miles       float64       
 3   trip_time        int64         
dtypes: datetime64[ns](1), float64(1), int32(1), int64(1)
memory usage: 684.9 MB


Unnamed: 0,PULocationID,trip_miles,trip_time
count,19949630.0,19949630.0,19949630.0
mean,138.0168,4.906277,1205.94
std,75.04665,4.942874,812.0541
min,2.0,0.051,60.0
25%,74.0,1.61,616.0
50%,138.0,3.07,994.0
75%,209.0,6.35,1569.0
max,265.0,34.295,5149.0


In [21]:
# Notes from outlier removed summary table: significant decrease in std for time and distance
#                                           outliers had large effect on the variance

In [14]:
# Saving cleaned data
cleaned_file_path = 'filepath/cleaned_fhvhv_tripdata_2024-06.parquet'
df.to_parquet(cleaned_file_path)

In [23]:
# Saving sample of cleaned data for github
df_sample = df.sample(frac=0.02, random_state=42)
df_sample_filepath = 'filepath/sample_cleaned_fhvhv_tripdata_2024-06.parquet'
df_sample.to_parquet(df_sample_filepath)