## Import Necessary Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd

import logging
logging.getLogger("distributed.worker.memory").setLevel(logging.CRITICAL)
from dask.distributed import Client, LocalCluster
cluster = LocalCluster(
    n_workers=4,
    threads_per_worker=2,
    memory_limit='4GB'
)
client = Client(cluster)


import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

## Loading Data

In [2]:
import os
print(os.getcwd())

/media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/1_Data_Preprocess/a_EDA


In [3]:
ddf = dd.read_parquet("../../3_Data/raw/yellow_tripdata_2025-01_(january).parquet", npartitions=1)
ddf.compute().head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee
0,1,2025-01-01 00:18:38,2025-01-01 00:26:59,1.0,1.6,1.0,N,229,237,1,10.0,3.5,0.5,3.0,0.0,1.0,18.0,2.5,0.0,0.0
1,1,2025-01-01 00:32:40,2025-01-01 00:35:13,1.0,0.5,1.0,N,236,237,1,5.1,3.5,0.5,2.02,0.0,1.0,12.12,2.5,0.0,0.0
2,1,2025-01-01 00:44:04,2025-01-01 00:46:01,1.0,0.6,1.0,N,141,141,1,5.1,3.5,0.5,2.0,0.0,1.0,12.1,2.5,0.0,0.0
3,2,2025-01-01 00:14:27,2025-01-01 00:20:01,3.0,0.52,1.0,N,244,244,2,7.2,1.0,0.5,0.0,0.0,1.0,9.7,0.0,0.0,0.0
4,2,2025-01-01 00:21:34,2025-01-01 00:25:06,3.0,0.66,1.0,N,244,116,2,5.8,1.0,0.5,0.0,0.0,1.0,8.3,0.0,0.0,0.0


In [4]:
ddf.compute().head().T

Unnamed: 0,0,1,2,3,4
VendorID,1,1,1,2,2
tpep_pickup_datetime,2025-01-01 00:18:38,2025-01-01 00:32:40,2025-01-01 00:44:04,2025-01-01 00:14:27,2025-01-01 00:21:34
tpep_dropoff_datetime,2025-01-01 00:26:59,2025-01-01 00:35:13,2025-01-01 00:46:01,2025-01-01 00:20:01,2025-01-01 00:25:06
passenger_count,1.0,1.0,1.0,3.0,3.0
trip_distance,1.6,0.5,0.6,0.52,0.66
RatecodeID,1.0,1.0,1.0,1.0,1.0
store_and_fwd_flag,N,N,N,N,N
PULocationID,229,236,141,244,244
DOLocationID,237,237,141,244,116
payment_type,1,1,1,2,2


In [5]:
# let's set "tpep_pickup_datetime" as index

In [6]:
# Convert to datetime if not already
ddf['tpep_pickup_datetime'] = dd.to_datetime(ddf['tpep_pickup_datetime'])

# Set as index
ddf = ddf.set_index('tpep_pickup_datetime', sorted=True)

# Persist (optional, helps for faster future operations)
ddf = ddf.persist()

ddf.compute().head().T

tpep_pickup_datetime,2025-01-01 00:18:38,2025-01-01 00:32:40,2025-01-01 00:44:04,2025-01-01 00:14:27,2025-01-01 00:21:34
VendorID,1,1,1,2,2
tpep_dropoff_datetime,2025-01-01 00:26:59,2025-01-01 00:35:13,2025-01-01 00:46:01,2025-01-01 00:20:01,2025-01-01 00:25:06
passenger_count,1.0,1.0,1.0,3.0,3.0
trip_distance,1.6,0.5,0.6,0.52,0.66
RatecodeID,1.0,1.0,1.0,1.0,1.0
store_and_fwd_flag,N,N,N,N,N
PULocationID,229,236,141,244,244
DOLocationID,237,237,141,244,116
payment_type,1,1,1,2,2
fare_amount,10.0,5.1,5.1,7.2,5.8


In [7]:
ddf.compute().tail()

Unnamed: 0_level_0,VendorID,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2025-01-31 23:01:48,2,2025-01-31 23:16:29,,3.35,,,79,237,0,15.85,0.0,0.5,0.0,0.0,1.0,20.6,,,0.75
2025-01-31 23:50:29,2,2025-02-01 00:17:27,,8.73,,,161,116,0,28.14,0.0,0.5,0.0,0.0,1.0,32.89,,,0.75
2025-01-31 23:26:59,2,2025-01-31 23:43:01,,2.64,,,144,246,0,14.91,0.0,0.5,0.0,0.0,1.0,19.66,,,0.75
2025-01-31 23:14:34,2,2025-01-31 23:34:52,,3.16,,,142,107,0,17.55,0.0,0.5,0.0,0.0,1.0,22.3,,,0.75
2025-01-31 23:56:42,2,2025-02-01 00:07:27,,2.29,,,237,238,0,12.09,0.0,0.5,0.0,0.0,1.0,16.09,,,0.0


In [8]:
ddf.compute().info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3475226 entries, 2025-01-01 00:18:38 to 2025-01-31 23:56:42
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int32         
 1   tpep_dropoff_datetime  datetime64[us]
 2   passenger_count        float64       
 3   trip_distance          float64       
 4   RatecodeID             float64       
 5   store_and_fwd_flag     string        
 6   PULocationID           int32         
 7   DOLocationID           int32         
 8   payment_type           int64         
 9   fare_amount            float64       
 10  extra                  float64       
 11  mta_tax                float64       
 12  tip_amount             float64       
 13  tolls_amount           float64       
 14  improvement_surcharge  float64       
 15  total_amount           float64       
 16  congestion_surcharge   float64       
 17  Airport_fee            float64  

In [9]:
# There is dtypes: datetime64[us](1), float64(13), int32(3), int64(1), string(1)
# 3475226 rows, 19 columns

In [10]:
ddf.compute().describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
VendorID,3475226.0,1.785428,1.0,2.0,2.0,2.0,7.0,0.426328
tpep_dropoff_datetime,3475226.0,2025-01-17 11:17:56.997901,2024-12-18 07:52:40,2025-01-10 08:15:29.500000,2025-01-17 15:59:34,2025-01-24 19:48:31,2025-02-01 23:44:11,
passenger_count,2935077.0,1.297859,0.0,1.0,1.0,1.0,9.0,0.75075
trip_distance,3475226.0,5.855126,0.0,0.98,1.67,3.1,276423.57,564.6016
RatecodeID,2935077.0,2.482535,1.0,1.0,1.0,1.0,99.0,11.632772
PULocationID,3475226.0,165.191576,1.0,132.0,162.0,234.0,265.0,64.529483
DOLocationID,3475226.0,164.125177,1.0,113.0,162.0,234.0,265.0,69.401686
payment_type,3475226.0,1.036623,0.0,1.0,1.0,1.0,5.0,0.701333
fare_amount,3475226.0,17.081803,-900.0,8.6,12.11,19.5,863372.12,463.472918
extra,3475226.0,1.317737,-7.5,0.0,0.0,2.5,15.0,1.861509


In [11]:
ddf.compute().columns

Index(['VendorID', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance',
       'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID',
       'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount',
       'tolls_amount', 'improvement_surcharge', 'total_amount',
       'congestion_surcharge', 'Airport_fee', 'cbd_congestion_fee'],
      dtype='object')

In [12]:
# We just need 'VendorID','tpep_pickup_datetime','passenger_count' this three columns for this project

In [13]:
# Check dataset shape and columns
print("Rows:", len(ddf))
print("Columns:", len(ddf.columns))
print("Columns_Name:", list(ddf.columns))

Rows: 3475226
Columns: 19
Columns_Name: ['VendorID', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'Airport_fee', 'cbd_congestion_fee']


In [14]:
resample = ddf.resample('h').agg({'passenger_count': 'sum', 'fare_amount': 'median', 'VendorID': 'count'})
resample.compute()

Unnamed: 0_level_0,passenger_count,fare_amount,VendorID
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-12-31 20:00:00,6.0,23.300,3
2024-12-31 21:00:00,6.0,15.600,3
2024-12-31 22:00:00,0.0,,0
2024-12-31 23:00:00,28.0,14.900,15
2025-01-01 00:00:00,9132.0,13.285,7344
...,...,...,...
2025-01-31 20:00:00,7386.0,12.100,7298
2025-01-31 21:00:00,8632.0,12.100,7201
2025-01-31 22:00:00,8491.0,12.800,8569
2025-01-31 23:00:00,6917.0,12.460,8360


In [15]:
resample.rename(columns={'passenger_count': 'passenger_demand','VendorID': 'taxi_demand'}).compute()

Unnamed: 0_level_0,passenger_demand,fare_amount,taxi_demand
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-12-31 20:00:00,6.0,23.300,3
2024-12-31 21:00:00,6.0,15.600,3
2024-12-31 22:00:00,0.0,,0
2024-12-31 23:00:00,28.0,14.900,15
2025-01-01 00:00:00,9132.0,13.285,7344
...,...,...,...
2025-01-31 20:00:00,7386.0,12.100,7298
2025-01-31 21:00:00,8632.0,12.100,7201
2025-01-31 22:00:00,8491.0,12.800,8569
2025-01-31 23:00:00,6917.0,12.460,8360


In [16]:
#Trip duration (in minutes)
df = ddf.compute()
df['trip_duration_min'] = (
    pd.to_datetime(df['tpep_dropoff_datetime']) -
    df.index.to_series()
).dt.total_seconds() / 60

df[['trip_distance', 'trip_duration_min', 'total_amount']].head()


Unnamed: 0_level_0,trip_distance,trip_duration_min,total_amount
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-01-01 00:18:38,1.6,8.35,18.0
2025-01-01 00:32:40,0.5,2.55,12.12
2025-01-01 00:44:04,0.6,1.95,12.1
2025-01-01 00:14:27,0.52,5.566667,9.7
2025-01-01 00:21:34,0.66,3.533333,8.3


In [17]:
# Add common time features
df['hour'] = df.index.hour
df['day'] = df.index.day
df['weekday'] = df.index.weekday       # Monday=0, Sunday=6
df['week_of_year'] = df.index.isocalendar().week
df['month'] = df.index.month

# Check the first few rows
df[['hour', 'day', 'weekday', 'week_of_year', 'month']].head()


Unnamed: 0_level_0,hour,day,weekday,week_of_year,month
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-01-01 00:18:38,0,1,2,1,1
2025-01-01 00:32:40,0,1,2,1,1
2025-01-01 00:44:04,0,1,2,1,1
2025-01-01 00:14:27,0,1,2,1,1
2025-01-01 00:21:34,0,1,2,1,1


In [18]:
def add_time_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df['hour'] = df.index.hour
    df['day'] = df.index.day
    df['month'] = df.index.month
    df['dayofweek'] = df.index.dayofweek
    df['week_of_year'] = df.index.isocalendar().week
    return df


In [19]:
df = add_time_features(df)
df[['hour','day','dayofweek','week_of_year','month']].head()

Unnamed: 0_level_0,hour,day,dayofweek,week_of_year,month
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-01-01 00:18:38,0,1,2,1,1
2025-01-01 00:32:40,0,1,2,1,1
2025-01-01 00:44:04,0,1,2,1,1
2025-01-01 00:14:27,0,1,2,1,1
2025-01-01 00:21:34,0,1,2,1,1


In [20]:
# Ensure Pandas DF with DatetimeIndex
resample = resample.compute()  # convert Dask DF to Pandas
resample.index = pd.to_datetime(resample.index)  # make sure index is datetime

# Fill missing hourly rows
resample = resample.asfreq('h', fill_value=0)

resample.head()


Unnamed: 0_level_0,passenger_count,fare_amount,VendorID
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-12-31 20:00:00,6.0,23.3,3
2024-12-31 21:00:00,6.0,15.6,3
2024-12-31 22:00:00,0.0,,0
2024-12-31 23:00:00,28.0,14.9,15
2025-01-01 00:00:00,9132.0,13.285,7344


# Optimize For Memory

In [21]:
! ls -lrt /media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/*.parquet | wc -l

9


In [22]:
! ls -lrt /media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/*.parquet 

-rwxrwxrwx 1 sheikh sheikh 59158238 Nov  1 19:49 '/media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/yellow_tripdata_2025-01_(january).parquet'
-rwxrwxrwx 1 sheikh sheikh 60343086 Nov  1 19:50 '/media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/yellow_tripdata_2025-02_(february).parquet'
-rwxrwxrwx 1 sheikh sheikh 69964745 Nov  1 19:51 '/media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/yellow_tripdata_2025-03_(march).parquet'
-rwxrwxrwx 1 sheikh sheikh 67352824 Nov  1 19:52 '/media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/yellow_tripdata_2025-04_(april).parquet'
-rwxrwxrwx 1 sheikh sheikh 77837865 Nov  1 19:52 '/media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/yellow_tripdata_2025-05_(may).parquet'
-rwxrwxrwx 1 sheikh sheikh 73542954 Nov  1 19:52 '/media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/yellow_tripd

In [23]:
cluster = LocalCluster(
    name="yellow-taxi-cluster",
    n_workers=2,              # good for 4-core system
    threads_per_worker=2,     # adjust depending on your CPU
    memory_limit="4GB",       # prevent overuse; you can tweak it
    dashboard_address=":8787" # open dashboard at localhost:8787
)

client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:41333/status,

0,1
Dashboard: http://127.0.0.1:41333/status,Workers: 2
Total threads: 4,Total memory: 7.45 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:44587,Workers: 0
Dashboard: http://127.0.0.1:41333/status,Total threads: 0
Started: Just now,Total memory: 0 B

0,1
Comm: tcp://127.0.0.1:38011,Total threads: 2
Dashboard: http://127.0.0.1:43269/status,Memory: 3.73 GiB
Nanny: tcp://127.0.0.1:33873,
Local directory: /tmp/dask-scratch-space/worker-du1k4ddh,Local directory: /tmp/dask-scratch-space/worker-du1k4ddh

0,1
Comm: tcp://127.0.0.1:34225,Total threads: 2
Dashboard: http://127.0.0.1:33553/status,Memory: 3.73 GiB
Nanny: tcp://127.0.0.1:46755,
Local directory: /tmp/dask-scratch-space/worker-jef7a35r,Local directory: /tmp/dask-scratch-space/worker-jef7a35r


In [24]:
############## Steps ####################

In [25]:
def optimize_to_fit_memory(ddf):
    """
    Downcast numeric columns to smaller dtypes to save memory.
    """
    # Define which columns to convert
    new_types = {
        'int32': ['passenger_count'],
        'int16': ['VendorID']
    }

    for dtype, cols in new_types.items():
        for col in cols:
            if col in ddf.columns:
                ddf[col] = ddf[col].astype(dtype)
    return ddf

In [26]:
# Load parquet file
ddf = dd.read_parquet(
    "/media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/yellow_tripdata_2025-01_(january).parquet",
    engine="pyarrow"
)

# Select only needed columns
ddf = ddf.loc[
    ddf.tpep_pickup_datetime > pd.Timestamp('2024-12-31 23:00:00'),
    ['VendorID', 'tpep_pickup_datetime', 'passenger_count']
]

# Ensure datetime column type
ddf['tpep_pickup_datetime'] = dd.to_datetime(ddf['tpep_pickup_datetime'])

# Sort by datetime and set as index
ddf = ddf.set_index('tpep_pickup_datetime', sorted=True)

# Fill missing timestamps (optional but useful)
ddf = ddf.ffill()

# Define known divisions explicitly
ddf = ddf.repartition(freq='1D')   # one partition per day (safe for resample)

# Persist in memory (optional but recommended for large data)
ddf = ddf.persist()

# Now safely resample
resampled = ddf.resample('1h').agg({
    'passenger_count': 'sum',
    'VendorID': 'count'
})

# Compute result (now works safely)
resampled = resampled.compute()

# Optional: Reset index and save
resampled = resampled.reset_index()
resampled = resampled.astype({'passenger_count': 'int32', 'VendorID': 'int16'})
resampled.to_parquet(
    "/media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/processed/2025_jan_hourly.parquet",
    engine="pyarrow",
    index=True
)


In [27]:
files = [
    '/media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/yellow_tripdata_2025-01_(january).parquet',
    '/media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/yellow_tripdata_2025-02_(february).parquet',
    '/media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/yellow_tripdata_2025-03_(march).parquet',
    '/media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/yellow_tripdata_2025-04_(april).parquet',
    '/media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/yellow_tripdata_2025-05_(may).parquet',
    '/media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/yellow_tripdata_2025-06_(june).parquet',
    '/media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/yellow_tripdata_2025-07_(july).parquet',
    '/media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/yellow_tripdata_2025-08_(august).parquet',
    '/media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/yellow_tripdata_2025-09_(september).parquet'
]

In [28]:
# Memory optimization function
def optimize_memory(df: pd.DataFrame) -> pd.DataFrame:
    """
    Downcast numeric columns to save memory.
    """
    if 'passenger_count' in df.columns:
        df['passenger_count'] = df['passenger_count'].astype('int32')
    if 'VendorID' in df.columns:
        df['VendorID'] = df['VendorID'].astype('int16')
    return df

optimized_parts = []

for file in files:
    print(f"Processing: {file}")

    # Load necessary columns only
    ddf = dd.read_parquet(file, engine="pyarrow")[['VendorID', 'tpep_pickup_datetime', 'passenger_count']]

    # Compute to get Pandas DF
    df = ddf.compute()

    # Filter for dates if needed
    df = df.loc[df.tpep_pickup_datetime > pd.Timestamp('2024-12-31 23:00:00')]

    # Ensure datetime type
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])

    # Set index for resampling
    df = df.set_index('tpep_pickup_datetime')

    # Resample hourly
    resampled = df.resample('1h').agg({
        'passenger_count': 'sum',
        'VendorID': 'count'
    })

    # Reset index
    resampled = resampled.reset_index()

    # Optimize memory
    resampled = optimize_memory(resampled)

    # Append to list
    optimized_parts.append(resampled)

# Concatenate all months
final_df = pd.concat(optimized_parts, ignore_index=True)

# Save final hourly dataset
final_df.to_parquet(
    '/media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/processed/2025_hourly_all.parquet',
    engine='pyarrow',
    index=False
)

print("All months processed and saved successfully!")




Processing: /media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/yellow_tripdata_2025-01_(january).parquet
Processing: /media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/yellow_tripdata_2025-02_(february).parquet
Processing: /media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/yellow_tripdata_2025-03_(march).parquet
Processing: /media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/yellow_tripdata_2025-04_(april).parquet
Processing: /media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/yellow_tripdata_2025-05_(may).parquet
Processing: /media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/yellow_tripdata_2025-06_(june).parquet
Processing: /media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/raw/yellow_tripdata_2025-07_(july).parquet
Processing: /media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-ana

In [29]:
final_df.shape

(6573, 3)

In [30]:
# Rename desired columns name for better understanding
final_df = final_df.rename(columns={'passenger_count': 'passenger_demand', 'VendorID': 'taxi_demand'})
final_df.head()

Unnamed: 0,tpep_pickup_datetime,passenger_demand,taxi_demand
0,2024-12-31 23:00:00,28,15
1,2025-01-01 00:00:00,9132,7344
2,2025-01-01 01:00:00,8996,8468
3,2025-01-01 02:00:00,7364,7257
4,2025-01-01 03:00:00,4904,4915


In [31]:
final_df.tail()

Unnamed: 0,tpep_pickup_datetime,passenger_demand,taxi_demand
6568,2025-09-30 20:00:00,8882,9539
6569,2025-09-30 21:00:00,9048,9965
6570,2025-09-30 22:00:00,7026,8001
6571,2025-09-30 23:00:00,3984,4587
6572,2025-10-01 00:00:00,5,1


In [32]:
final_df.isnull().sum()

tpep_pickup_datetime    0
passenger_demand        0
taxi_demand             0
dtype: int64

In [33]:
final_df.drop([0,6572], inplace=True)

In [34]:
final_df.head()

Unnamed: 0,tpep_pickup_datetime,passenger_demand,taxi_demand
1,2025-01-01 00:00:00,9132,7344
2,2025-01-01 01:00:00,8996,8468
3,2025-01-01 02:00:00,7364,7257
4,2025-01-01 03:00:00,4904,4915
5,2025-01-01 04:00:00,3015,2918


# Save The Data

In [35]:
final_df.to_csv(
    '/media/sheikh/F262ADC762AD90C1/backup/ML/yellow-taxi-demand-analysis/3_Data/processed/2025_hourly_all_clean.csv',
    index=False,
)