In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd

In [2]:
# paths for the three monthly trip files
df_jan_path = "yellow_tripdata_2016-01.csv"
df_feb_path = "yellow_tripdata_2016-02.csv"
df_mar_path = "yellow_tripdata_2016-03.csv"

# read options shared by every month, defined once so the three loads cannot drift apart
trip_read_options = dict(
    assume_missing=True,
    usecols=[
        'trip_distance',
        'tpep_pickup_datetime',
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
        'fare_amount',
    ],
    parse_dates=["tpep_pickup_datetime"],
)

# load the dataframes
df_jan = dd.read_csv(df_jan_path, **trip_read_options)

df_feb = dd.read_csv(df_feb_path, **trip_read_options)

df_mar = dd.read_csv(df_mar_path, **trip_read_options)

In [3]:
# Stack the three monthly frames row-wise into a single Dask dataframe
monthly_frames = [df_jan, df_feb, df_mar]
df_final = dd.concat(monthly_frames, axis=0)

In [None]:
# Combined January-March 2016 dataframe; displaying the Dask frame shows only its schema and partition count
df_final

Unnamed: 0_level_0,tpep_pickup_datetime,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,fare_amount
npartitions=82,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,datetime64[ns],float64,float64,float64,float64,float64,float64
,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...
,...,...,...,...,...,...,...


In [5]:
# Bounding box applied to both pickup and drop-off coordinates
min_latitude, max_latitude = 40.60, 40.85
min_longitude, max_longitude = -74.05, -73.70

# Accepted fare_amount range
min_fare_amount_val, max_fare_amount_val = 0.50, 81.0

# Accepted trip_distance range
min_trip_distance_val, max_trip_distance_val = 0.25, 24.43

In [6]:
# Keep only trips whose pickup and drop-off coordinates all lie inside the bounding box (bounds inclusive)
pickup_lat_ok = df_final["pickup_latitude"].between(min_latitude, max_latitude, inclusive="both")
pickup_lon_ok = df_final["pickup_longitude"].between(min_longitude, max_longitude, inclusive="both")
dropoff_lat_ok = df_final["dropoff_latitude"].between(min_latitude, max_latitude, inclusive="both")
dropoff_lon_ok = df_final["dropoff_longitude"].between(min_longitude, max_longitude, inclusive="both")

df_final = df_final.loc[pickup_lat_ok & pickup_lon_ok & dropoff_lat_ok & dropoff_lon_ok, :]

In [7]:
# Keep only trips whose fare and distance fall inside the configured ranges (bounds inclusive)
fare_ok = df_final["fare_amount"].between(min_fare_amount_val, max_fare_amount_val, inclusive="both")
distance_ok = df_final["trip_distance"].between(min_trip_distance_val, max_trip_distance_val, inclusive="both")

df_final = df_final.loc[fare_ok & distance_ok]

In [8]:
# Remove the columns that were only used by the range filters
filter_only_columns = ['trip_distance', 'dropoff_longitude', 'dropoff_latitude', 'fare_amount']
df_final = df_final.drop(columns=filter_only_columns)

In [9]:
# Filtered Dask frame, now reduced to the pickup timestamp and pickup coordinates
df_final

Unnamed: 0_level_0,tpep_pickup_datetime,pickup_longitude,pickup_latitude
npartitions=82,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,datetime64[ns],float64,float64
,...,...,...
...,...,...,...
,...,...,...
,...,...,...


In [10]:
# Execute the Dask computation and collect the result as a single in-memory pandas DataFrame.
# From this cell on, df_final is a pandas object rather than a Dask collection.
df_final = df_final.compute()

In [11]:
# Save the filtered pickups to CSV (index omitted) so later cells can re-read the file in chunks
save_path = "processing_data.csv"
df_final.to_csv(save_path, index=False)

In [12]:
# Importing necessary libraries
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler

In [14]:
# Open the saved pickups as a chunked reader (100,000 rows per chunk, coordinate columns only)
df_reader = pd.read_csv(
    save_path,
    usecols=["pickup_latitude","pickup_longitude"],
    chunksize=100000,
)

In [15]:
# Fit the standard scaler incrementally so the whole file never has to be in memory at once
scaler = StandardScaler()
for location_chunk in df_reader:
    # update the running mean/variance with this chunk
    scaler.partial_fit(location_chunk)
scaler

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [17]:
# Re-open the chunked reader: the previous one was exhausted by the scaler-fitting loop
df_reader = pd.read_csv(
    save_path,
    usecols=["pickup_latitude","pickup_longitude"],
    chunksize=100000,
)

In [18]:
# Fit MiniBatchKMeans incrementally, one standardised chunk at a time.
# NOTE(review): with partial_fit the centres are presumably initialised only once,
# so n_init may have no effect here - confirm against the scikit-learn docs.
mini_batch = MiniBatchKMeans(n_clusters=30, n_init=10, random_state=42)

for location_chunk in df_reader:
    # standardise the chunk with the already-fitted scaler, then update the centres
    mini_batch.partial_fit(scaler.transform(location_chunk))

mini_batch

0,1,2
,n_clusters,30
,init,'k-means++'
,max_iter,100
,batch_size,1024
,verbose,0
,compute_labels,True
,random_state,42
,tol,0.0
,max_no_improvement,10
,init_size,


In [19]:
# First five cluster centres mapped back from standardised units to the original coordinate scale
scaler.inverse_transform(mini_batch.cluster_centers_)[:5]

array([[-73.94975046,  40.80392392],
       [-73.97685385,  40.74726528],
       [-73.92094427,  40.69670159],
       [-73.78474354,  40.64663525],
       [-74.00391496,  40.7193993 ]])

In [21]:
# Select the coordinate columns to cluster.
# Columns are picked by name rather than by position so that re-running this cell after
# `region` has been added to df_final does not silently feed an extra column to the scaler.
# The order (longitude, latitude) matches the column order of the CSV chunks the scaler was fitted on.
location_subset = df_final[["pickup_longitude", "pickup_latitude"]]
location_subset.head(5)

Unnamed: 0,pickup_longitude,pickup_latitude
0,-73.990372,40.734695
1,-73.980782,40.729912
2,-73.98455,40.679565
3,-73.993469,40.71899
4,-73.960625,40.78133


In [22]:
# Standardise the pickup coordinates with the scaler fitted on the CSV chunks
scaled_location_subset = scaler.transform(location_subset)

In [23]:
# Predict a cluster label for every pickup; the shape confirms one label per row
cluster_predictions = mini_batch.predict(scaled_location_subset)
cluster_predictions.shape

(33234199,)

In [24]:
# Store the cluster label of each pickup as its `region`.
# NOTE: this adds a column to df_final in place.
df_final['region'] = cluster_predictions
df_final.head(5)

Unnamed: 0,tpep_pickup_datetime,pickup_longitude,pickup_latitude,region
0,2016-01-01,-73.990372,40.734695,7
1,2016-01-01,-73.980782,40.729912,26
2,2016-01-01,-73.98455,40.679565,9
3,2016-01-01,-73.993469,40.71899,10
4,2016-01-01,-73.960625,40.78133,8


In [25]:
# Drop the latitude and longitude columns, keeping only the pickup timestamp and region
time_series_data = df_final.drop(columns=["pickup_latitude","pickup_longitude"])
time_series_data.head()

Unnamed: 0,tpep_pickup_datetime,region
0,2016-01-01,7
1,2016-01-01,26
2,2016-01-01,9
3,2016-01-01,10
4,2016-01-01,8


In [26]:
# Save the time series data (index omitted).
# NOTE(review): this rebinds `save_path`, which the chunked-reader cells use to locate
# "processing_data.csv"; re-running those cells after this one would open the wrong file.
# Consider a dedicated variable name for this path.
save_path = "time_series.csv"
time_series_data.to_csv(save_path, index=False)

In [27]:
# Index the frame by pickup timestamp so it can be resampled over time
time_series_data = time_series_data.set_index('tpep_pickup_datetime')
time_series_data.head()

Unnamed: 0_level_0,region
tpep_pickup_datetime,Unnamed: 1_level_1
2016-01-01,7
2016-01-01,26
2016-01-01,9
2016-01-01,10
2016-01-01,8


In [28]:
# Group the datetime-indexed pickups by region so that each region can be resampled separately
region_grp = time_series_data.groupby("region")
region_grp

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002F9116A1E80>

In [29]:
# Count missing values in each column
time_series_data.isna().sum()

region    0
dtype: int64

In [30]:
# Count the pickups of every region in consecutive 15-minute bins
pickups_per_region = region_grp['region']
resampled_data = pickups_per_region.resample("15min").count()
resampled_data

region  tpep_pickup_datetime
0       2016-01-01 00:00:00      58
        2016-01-01 00:15:00     120
        2016-01-01 00:30:00     149
        2016-01-01 00:45:00     160
        2016-01-01 01:00:00     187
                               ... 
29      2016-03-31 22:45:00      14
        2016-03-31 23:00:00      17
        2016-03-31 23:15:00      18
        2016-03-31 23:30:00      13
        2016-03-31 23:45:00      14
Name: region, Length: 262080, dtype: int64

> ### Key points about `resample()` :
> - Works on datetime-indexed DataFrames/Series
> - Similar to groupby() but for time periods
> - Common frequencies: 'D' (daily), 'h' (hourly; the uppercase 'H' alias is deprecated in recent pandas versions), '15min' (15 minutes)
> - Must be followed by an aggregation method like .count() , .sum() , .mean() <br>

> *When you see "2016-01-01 00:00:00" as the first record in your resampled data, it represents the time bin from 00:00:00 (inclusive) to 00:15:00 (exclusive); each bin is labelled by its start time.*

In [None]:
# Name the count series so it becomes the `total_pickups` column once converted to a dataframe
resampled_data = resampled_data.rename("total_pickups")

In [32]:
# Move the `region` index level into a regular column, leaving the timestamp as the only index level
resampled_data = resampled_data.reset_index(level=0)
resampled_data.head()

Unnamed: 0_level_0,region,total_pickups
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-01-01 00:00:00,0,58
2016-01-01 00:15:00,0,120
2016-01-01 00:30:00,0,149
2016-01-01 00:45:00,0,160
2016-01-01 01:00:00,0,187


In [34]:
# Number of 15-minute bins that recorded no pickups
resampled_data['total_pickups'].eq(0).sum()

np.int64(3668)

In [None]:
# Substitute a positive value for zero-pickup bins, then confirm that no zeros remain
epsilon_val = 10
resampled_data = resampled_data.replace({'total_pickups': {0 : epsilon_val}})
resampled_data['total_pickups'].eq(0).sum()

np.int64(0)

> *Zero-pickup intervals are replaced with 10 pickups because MAPE divides by the actual value, so zeros in `total_pickups` would make the error blow up (effectively infinite).*

---

> ### `Smoothing` - `Moving Average`

In [None]:
# Candidate moving-average window sizes: 3 through 10 inclusive
window_values = [size for size in range(3, 11)]
window_values

[3, 4, 5, 6, 7, 8, 9, 10]

In [None]:
# Calculation of the moving average and the corresponding MAPE
from sklearn.metrics import mean_absolute_percentage_error
def calculate_best_window_value(windows):
    """Print the MAPE of a per-region moving average for each window size.

    ``resampled_data`` stacks the series of all regions in one frame, so the
    rolling mean is computed inside each region separately; otherwise the
    first windows of a region would average in the last bins of the
    previous region. Every window includes the current bin, so the reported
    value measures how far the smoothed series deviates from the raw counts.

    Parameters
    ----------
    windows : iterable of int
        Window sizes (number of consecutive bins) to evaluate.

    Returns
    -------
    None
        One line per window size is printed.
    """
    pickups_by_region = resampled_data.groupby("region")["total_pickups"]
    y_all = resampled_data["total_pickups"].to_numpy()
    for window in windows:
        smoothed = pickups_by_region.transform(
            lambda counts: counts.rolling(window=window).mean()
        ).to_numpy()
        # the first (window - 1) bins of every region have no complete window yet
        has_full_window = ~np.isnan(smoothed)
        error = mean_absolute_percentage_error(
            y_all[has_full_window], smoothed[has_full_window]
        )
        print(f"For window value {window}, the MAPE is {error:.2f}")

In [43]:
# Report the MAPE of the moving average for each candidate window size
calculate_best_window_value(window_values)

For window value 3, the MAPE is 0.20
For window value 4, the MAPE is 0.24
For window value 5, the MAPE is 0.28
For window value 6, the MAPE is 0.31
For window value 7, the MAPE is 0.35
For window value 8, the MAPE is 0.39
For window value 9, the MAPE is 0.42
For window value 10, the MAPE is 0.46


> ### `Smoothing` - `Exponentially Weighted Moving Average`
> Why Exponential Smoothing?<br>
> - Problem: Raw data is noisy (like random spikes/drops). If you use a simple average (e.g., (100+500+150+600)/4 = 337), it ignores recent trends.<br>
>
> - Example: Day 4 had 600 rides, but the average (337) is heavily dragged down by older data (Day 1’s 100 rides).<br>
> 
> - Solution: Exponential smoothing gives more weight to recent data and less to older data.<br>
>
> - It assumes: "What happened yesterday is more important than what happened a week ago."

In [44]:
# Candidate EWMA smoothing factors (alpha): 0.2, 0.3, ..., 0.9.
# linspace states the number of points explicitly, avoiding the end-point
# ambiguity that arange has with a non-integer step.
smoothing_values = np.linspace(0.2, 0.9, num=8)
smoothing_values

array([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

In [None]:
# Calculation of the exponentially weighted moving average and the corresponding MAPE
def calculate_best_smoothing_value(values):
    """Print the MAPE of a per-region EWMA for each smoothing factor.

    ``resampled_data`` stacks the series of all regions in one frame, so the
    exponentially weighted mean is computed inside each region separately;
    otherwise the first smoothed bins of a region would carry over weight
    from the last bins of the previous region.

    Parameters
    ----------
    values : iterable of float
        Smoothing factors (``alpha``) to evaluate.

    Returns
    -------
    None
        One line per smoothing factor is printed.
    """
    pickups_by_region = resampled_data.groupby("region")["total_pickups"]
    y = resampled_data['total_pickups'].to_numpy()
    for value in values:
        y_pred = pickups_by_region.transform(
            lambda counts, alpha=value: counts.ewm(alpha=alpha).mean()
        ).to_numpy()
        error = mean_absolute_percentage_error(y, y_pred)
        print(f"For smoothing value {value:.1f}, the MAPE is {error:.2f}")

In [None]:
# Report the MAPE of the exponentially weighted moving average for each candidate alpha
calculate_best_smoothing_value(smoothing_values)

For smoothing value 0.2, the MAPE is 0.41
For smoothing value 0.3, the MAPE is 0.27
For smoothing value 0.4, the MAPE is 0.20
For smoothing value 0.5, the MAPE is 0.16
For smoothing value 0.6, the MAPE is 0.12
For smoothing value 0.7, the MAPE is 0.09
For smoothing value 0.8, the MAPE is 0.06
For smoothing value 0.9, the MAPE is 0.03


In [49]:
# Dataset with pickup smoothing applied (EWMA, alpha=0.4, rounded to whole pickups).
# The EWMA is computed within each region so that a region's first bins are not
# influenced by the tail of the region stacked above it in the frame.
resampled_data["avg_pickups"] = (
    resampled_data.groupby("region")["total_pickups"]
    .transform(lambda counts: counts.ewm(alpha=0.4).mean())
    .round()
    .to_numpy()  # positional assignment; the timestamp index repeats across regions
)
resampled_data.head()

Unnamed: 0_level_0,region,total_pickups,avg_pickups
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-01-01 00:00:00,0,58,58.0
2016-01-01 00:15:00,0,120,97.0
2016-01-01 00:30:00,0,149,123.0
2016-01-01 00:45:00,0,160,140.0
2016-01-01 01:00:00,0,187,161.0


> *Using an alpha value of 0.4, the exponentially weighted average pickups effectively models the pattern of total pickups, achieving a Mean Absolute Percentage Error (MAPE) of 20%.*

In [50]:
# Save the resampled data; index=True writes the timestamp index out alongside the columns
resampled_data_save_path = "final_data.csv"
resampled_data.to_csv(resampled_data_save_path, index=True)

In [51]:
# Shape of the data as (rows, columns)
resampled_data.shape

(262080, 3)