In [5]:
import pandas as pd
import os
import geopandas as gpd

# --- Input locations (Kaggle dataset mounts) ---------------------------------
INPUT_DIR = '/kaggle/input/nyc-taxi-dataset/Dataset/2022/high_volume_for_hire_vehicle'
TAXI_ZONES_PATH = '/kaggle/input/nyc-taxi-zones/NYC Taxi Zones.geojson'

# --- Column groups -----------------------------------------------------------
# Monetary components that process_batch() adds up into the single 'fee' column.
FEE_COLS = [
    'base_passenger_fare',
    'tolls',
    'bcf',
    'sales_tax',
    'congestion_surcharge',
    'airport_fee',
]

# Columns removed from every batch. The fee components are spliced in from
# FEE_COLS so the two lists cannot drift apart; the resulting order is the same
# as if every name were spelled out here.
DROP_COLS = [
    'dispatching_base_num',
    'originating_base_num',
    'on_scene_datetime',
    'tips',
    'shared_request_flag',
    'shared_match_flag',
    'access_a_ride_flag',
    'wav_request_flag',
    'wav_match_flag',
    *FEE_COLS,
    'hvfhs_license_num',
    'driver_pay',
]

# --- License code -> company name --------------------------------------------
# Used to translate 'hvfhs_license_num' values into the 'hvfhs' column.
LICENSE_MAP = dict(
    HV0002="Juno",
    HV0003="Uber",
    HV0004="Via",
    HV0005="Lyft",
)

def process_batch(df, valid_locations):
    """Reduce one raw trip batch to the trips that start and end in allowed zones.

    Rows are kept only when both ``PULocationID`` and ``DOLocationID`` appear in
    ``valid_locations``. For the kept rows two columns are appended:

    * ``fee``   - row-wise sum of the columns listed in ``FEE_COLS`` (missing
      values are skipped by pandas' default ``skipna`` behaviour).
    * ``hvfhs`` - ``hvfhs_license_num`` translated through ``LICENSE_MAP`` and
      stored as a categorical; codes absent from the map are kept unchanged.

    Finally every column named in ``DROP_COLS`` is removed.

    The input frame is never modified: filtering happens first on a private
    copy, so the derived columns are computed only for rows that survive and
    the caller's ``df`` keeps all of its original columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw batch containing at least ``PULocationID``, ``DOLocationID`` and
        every column named in ``FEE_COLS`` and ``DROP_COLS``.
    valid_locations : list-like
        Location IDs accepted for both pickup and drop-off.

    Returns
    -------
    pandas.DataFrame
        New frame with the surviving rows (original index labels preserved),
        the remaining original columns in their original order, followed by
        ``fee`` and ``hvfhs``.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    # Filter before deriving anything: the sum and the string replacement are
    # the expensive steps, and rows outside the zones are discarded anyway.
    in_zone = (
        df["PULocationID"].isin(valid_locations)
        & df["DOLocationID"].isin(valid_locations)
    )
    trips = df.loc[in_zone].copy()  # private copy -> safe to add columns

    # Single fee figure per trip.
    trips['fee'] = trips[FEE_COLS].sum(axis=1)

    # Human-readable company name instead of the license code.
    # NOTE: categories are inferred from the values present in this batch.
    trips['hvfhs'] = pd.Categorical(trips['hvfhs_license_num'].replace(LICENSE_MAP))

    # Non in-place drop returns a new frame and leaves nothing half-modified.
    return trips.drop(columns=DROP_COLS)

# Load the NYC Taxi Zones GeoJSON and keep only the Manhattan zones; their
# location IDs are the allow-list applied to both pickup and drop-off.
gdf = gpd.read_file(TAXI_ZONES_PATH)
gdf = gdf[gdf['borough'] == 'Manhattan']
manhattan_locations = gdf['location_id'].astype(int).tolist()

# Single source of truth for where the result is written (and reported).
OUTPUT_PATH = '/kaggle/working/combined_manhattan.parquet'

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the combined frame reproducible from run to run.
parquet_files = sorted(
    name for name in os.listdir(INPUT_DIR) if name.endswith('.parquet')
)
if not parquet_files:
    # Fail with a clear message instead of an AttributeError on None further down.
    raise FileNotFoundError(f"No .parquet files found in {INPUT_DIR}")

# Collect the processed batches and concatenate once at the end. Calling
# pd.concat inside the loop re-copies everything accumulated so far on every
# iteration.
batches = []
for filename in parquet_files:
    file_path = os.path.join(INPUT_DIR, filename)
    # Read one file and immediately reduce it to Manhattan-only trips.
    df = process_batch(pd.read_parquet(file_path), manhattan_locations)
    batches.append(df)

# ignore_index=True gives a fresh 0..n-1 index regardless of how many files
# were read (previously a single-file run kept the filtered index).
combined_df = pd.concat(batches, ignore_index=True)
del batches  # the list is no longer needed; drop the extra references

# Display the first few rows of the combined DataFrame
print(combined_df.head())

# Persist the combined DataFrame as Parquet
combined_df.to_parquet(OUTPUT_PATH, index=False)
print(f"Combined DataFrame saved to {OUTPUT_PATH}")


     request_datetime     pickup_datetime    dropoff_datetime  PULocationID  \
0 2023-01-01 00:18:06 2023-01-01 00:19:38 2023-01-01 00:48:07            48   
1 2023-01-01 00:48:42 2023-01-01 00:58:39 2023-01-01 01:33:08           246   
2 2023-01-01 00:06:54 2023-01-01 00:10:29 2023-01-01 00:18:22            90   
3 2023-01-01 00:15:22 2023-01-01 00:22:10 2023-01-01 00:33:14           125   
4 2023-01-01 00:26:02 2023-01-01 00:39:09 2023-01-01 01:03:50            68   

   DOLocationID  trip_miles  trip_time    fee hvfhs  
0            68        0.94       1709  31.78  Uber  
1           163        2.78       2069  70.03  Uber  
2           231        1.89        473  18.99  Uber  
3           246        2.65        664  17.29  Uber  
4           231        3.26       1481  36.74  Uber  
Combined DataFrame saved to /kaggle/working/combined_manhattan.parquet


In [6]:
# Show a clickable download link in the cell output.
# The path is relative, so it is resolved against the notebook's working
# directory - presumably /kaggle/working, where the preceding cell saved the
# file; confirm if the notebook is run elsewhere.
from IPython.display import FileLink

download_link = FileLink(r'combined_manhattan.parquet')
download_link