In [1]:
pip install pandas pyarrow

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import os
import glob

def process_and_split_taxi_data(input_path, output_dir="D:/NYC_Project/Split_Weekly_Files"):
    """
    Split one Yellow or Green taxi Parquet file into weekly Parquet files.

    Steps:
    - Determines the taxi type from the file name ('yellow' or 'green').
    - Adds a 'taxi_type' column and renames the type-specific pickup/dropoff
      columns to 'pickup_datetime' / 'dropoff_datetime'.
    - Drops the 'Airport_fee' column when it is present.
    - Writes one '<taxi_type>_<label>.parquet' file per interval into
      ``output_dir``; intervals without rows are reported and skipped.

    Parameters
    ----------
    input_path : str
        Path of the source Parquet file. Its base name must contain
        'yellow' or 'green' (case-insensitive).
    output_dir : str, optional
        Directory that receives the weekly files. It is created when missing.

    Returns
    -------
    None
        Results are written to disk and progress is printed. Problems
        (unknown taxi type, unreadable file, missing pickup column) are
        printed and the file is skipped instead of raising.
    """
    filename = os.path.basename(input_path).lower()

    # 1. Dynamic configuration based on taxi type
    if 'yellow' in filename:
        taxi_type = "yellow"
        mapping = {
            'tpep_pickup_datetime': 'pickup_datetime',
            'tpep_dropoff_datetime': 'dropoff_datetime'
        }
    elif 'green' in filename:
        taxi_type = "green"
        mapping = {
            'lpep_pickup_datetime': 'pickup_datetime',
            'lpep_dropoff_datetime': 'dropoff_datetime'
        }
    else:
        print(f"Skipping {input_path}: Could not determine taxi type from filename.")
        return

    print(f"\n--- Starting: {taxi_type.upper()} ---")

    # 2. Load Parquet into memory
    try:
        df = pd.read_parquet(input_path)
    except Exception as e:
        print(f"Error reading {input_path}: {e}")
        return

    # 3. Add metadata and unify schema
    df['taxi_type'] = taxi_type
    df = df.rename(columns=mapping)
    if 'pickup_datetime' not in df.columns:
        # Without this guard the lookup below would raise an unhandled KeyError.
        print(f"Skipping {input_path}: no pickup datetime column found.")
        return
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

    # Drop the column once for the whole frame. drop() must NOT be combined
    # with inplace=True and an assignment: in that mode it returns None.
    df = df.drop(columns='Airport_fee', errors='ignore')

    # 4. Define the intervals (October 2025 demo data).
    # Both dates are inclusive calendar days; W4 also absorbs the tail of the
    # month (22nd-31st), so it is longer than seven days.
    weeks = [
        ('2025-10-01', '2025-10-07', 'W1'),
        ('2025-10-08', '2025-10-14', 'W2'),
        ('2025-10-15', '2025-10-21', 'W3'),
        ('2025-10-22', '2025-10-31', 'W4')
    ]

    # Create the output directory (no error when it already exists)
    os.makedirs(output_dir, exist_ok=True)

    # 5. Filter, split, and save
    pickup = df['pickup_datetime']
    one_day = pd.Timedelta(days=1)
    for start, end, label in weeks:
        # Half-open interval [start, end + 1 day): unlike "<= 23:59:59" it
        # also keeps timestamps with fractional seconds in the day's last second.
        lower = pd.Timestamp(start)
        upper = pd.Timestamp(end) + one_day
        week_df = df.loc[(pickup >= lower) & (pickup < upper)]

        if not week_df.empty:
            output_filename = os.path.join(output_dir, f"{taxi_type}_{label}.parquet")
            week_df.to_parquet(output_filename, index=False)
            print(f"Successfully Created: {output_filename} ({len(week_df)} rows)")
        else:
            print(f"No data found for {label} in {taxi_type}")

if __name__ == "__main__":
    # Folder holding the raw Parquet files. The path uses forward slashes,
    # so the raw-string prefix has no effect on its value.
    folder_path = r"D:/NYC_Project/Raw_Files"

    # Collect every .parquet file located directly inside that folder.
    search_pattern = os.path.join(folder_path, "*.parquet")
    files_to_process = glob.glob(search_pattern)

    if files_to_process:
        print(f"Found {len(files_to_process)} files. Starting batch process...")

        # Handle the files sequentially, one call per file.
        for file in files_to_process:
            process_and_split_taxi_data(file)

        print("\nAll files have been processed and split successfully.")
    else:
        print(f"No parquet files found in {folder_path}. Please check the path.")

Found 2 files. Starting batch process...

--- Starting: GREEN ---
Successfully Created: D:/NYC_Project/Split_Weekly_Files\green_W1.parquet (10872 rows)
Successfully Created: D:/NYC_Project/Split_Weekly_Files\green_W2.parquet (10771 rows)
Successfully Created: D:/NYC_Project/Split_Weekly_Files\green_W3.parquet (10991 rows)
Successfully Created: D:/NYC_Project/Split_Weekly_Files\green_W4.parquet (16761 rows)

--- Starting: YELLOW ---
Successfully Created: D:/NYC_Project/Split_Weekly_Files\yellow_W1.parquet (939599 rows)
Successfully Created: D:/NYC_Project/Split_Weekly_Files\yellow_W2.parquet (1003683 rows)
Successfully Created: D:/NYC_Project/Split_Weekly_Files\yellow_W3.parquet (1005646 rows)
Successfully Created: D:/NYC_Project/Split_Weekly_Files\yellow_W4.parquet (1479758 rows)

All files have been processed and split successfully.
