In [73]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [74]:
# Loading the dataset
df = pd.read_csv('C:/Users/adivi/OneDrive/Documents/ML Project/yellow_tripdata_2015-01.csv/yellow_tripdata_2015-01.csv', nrows = 50000)
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-01-15 19:05:39,2015-01-15 19:23:42,1,1.59,-73.993896,40.750111,1,N,-73.974785,40.750618,1,12.0,1.0,0.5,3.25,0.0,0.3,17.05
1,1,2015-01-10 20:33:38,2015-01-10 20:53:28,1,3.3,-74.001648,40.724243,1,N,-73.994415,40.759109,1,14.5,0.5,0.5,2.0,0.0,0.3,17.8
2,1,2015-01-10 20:33:38,2015-01-10 20:43:41,1,1.8,-73.963341,40.802788,1,N,-73.95182,40.824413,2,9.5,0.5,0.5,0.0,0.0,0.3,10.8
3,1,2015-01-10 20:33:39,2015-01-10 20:35:31,1,0.5,-74.009087,40.713818,1,N,-74.004326,40.719986,2,3.5,0.5,0.5,0.0,0.0,0.3,4.8
4,1,2015-01-10 20:33:39,2015-01-10 20:52:58,1,3.0,-73.971176,40.762428,1,N,-74.004181,40.742653,2,15.0,0.5,0.5,0.0,0.0,0.3,16.3


In [75]:
# Convert datetime columns data type from object to datetime
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])

# Duration in minutes
df['trip_duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60  
df['pickup_hour'] = df['tpep_pickup_datetime'].dt.hour
df['day_of_week'] = df['tpep_pickup_datetime'].dt.dayofweek

In [76]:
# Create Distance Categories
bins = [0, 1, 5, 10, float('inf')]
labels = ['short', 'medium', 'long', 'very long']
df['distance_category'] = pd.cut(df['trip_distance'], bins=bins, labels=labels)

# Display the engineered features
print(df[['trip_duration', 'pickup_hour', 'day_of_week', 'distance_category']].head())

   trip_duration  pickup_hour  day_of_week distance_category
0      18.050000           19            3            medium
1      19.833333           20            5            medium
2      10.050000           20            5            medium
3       1.866667           20            5             short
4      19.316667           20            5            medium


In [77]:
# Analyzing Long vs Short Rides
distance_summary = df.groupby(['VendorID', 'distance_category']).agg(
    avg_trip_distance=('trip_distance', 'mean'),
    count_trips=('trip_distance', 'count')
).reset_index()

# Displaying the summary
print(distance_summary)

   VendorID distance_category  avg_trip_distance  count_trips
0         1             short           0.713408         6735
1         1            medium           2.214003        14268
2         1              long           7.042782         1891
3         1         very long          15.246744         1029
4         2             short           0.703434         6194
5         2            medium           2.163946        16264
6         2              long           6.981328         2123
7         2         very long          15.188492         1200


  distance_summary = df.groupby(['VendorID', 'distance_category']).agg(


In [78]:
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor

In [98]:
# Create a unique pickup location identifier
df['pickup_location'] = df['pickup_latitude'].astype(str) + ',' + df['pickup_longitude'].astype(str)

# Feature Engineering: Group by hour, location, and day of week
# This helps to capture potential demand trends
demand_df = df.groupby(['pickup_hour', 'pickup_location', 'day_of_week']).size().reset_index(name='ride_count')

# Define Target Variable: Predicting the number of rides instead of using allocated drivers directly
# Use ride_count as the target variable
y = demand_df['ride_count']

# Define Features: Include pickup_hour, day_of_week, and a unique identifier for pickup_location
X = demand_df[['pickup_hour', 'day_of_week']]

# One-Hot Encoding for categorical variables
X_encoded = pd.get_dummies(X, drop_first=True)

In [99]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [100]:
# Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Gradient Boosting Regressor
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)

In [101]:
from sklearn.model_selection import RandomizedSearchCV

svr = SVR()
svr_param_dist = {
    'kernel': ['linear', 'rbf'],
    'C': [1, 10, 100],
    'epsilon': [0.1, 0.2, 0.5]
}
svr_random = RandomizedSearchCV(svr, svr_param_dist, n_iter=5, cv=5, random_state=42)
svr_random.fit(X_train, y_train)

In [102]:
knn = KNeighborsRegressor()
knn_param_dist = {
    'n_neighbors': [3, 5, 10],
    'metric': ['euclidean', 'manhattan']
}
knn_random = RandomizedSearchCV(knn, knn_param_dist, n_iter=5, cv=5, random_state=42)
knn_random.fit(X_train, y_train)

In [103]:
lgbm = LGBMRegressor()
lgbm_param_dist = {
    'num_leaves': [31, 50, 70],
    'learning_rate': [0.1, 0.01]
}
lgbm_random = RandomizedSearchCV(lgbm, lgbm_param_dist, n_iter=5, cv=5, random_state=42)
lgbm_random.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000420 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 31
[LightGBM] [Info] Number of data points in the train set: 27568, number of used features: 2
[LightGBM] [Info] Start training from score 1.015852
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000392 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 31
[LightGBM] [Info] Number of data points in the train set: 27569, number of used features: 2
[LightGBM] [Info] Start training from score 1.015996
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000335 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you

In [104]:
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return mae, rmse

In [105]:
# Evaluate models
results = {}
results['Random Forest'] = evaluate_model(y_test, rf_pred)
results['Gradient Boosting'] = evaluate_model(y_test, gb_pred)
results['SVR (Best Parameters)'] = evaluate_model(y_test, svr_random.predict(X_test))
results['KNN (Best Parameters)'] = evaluate_model(y_test, knn_random.predict(X_test))
results['LightGBM (Best Parameters)'] = evaluate_model(y_test, lgbm_random.predict(X_test))

# Display results
for model, metrics in results.items():
    print(f"{model} - MAE: {metrics[0]:.2f}, RMSE: {metrics[1]:.2f}")

Random Forest - MAE: 0.03, RMSE: 0.33
Gradient Boosting - MAE: 0.03, RMSE: 0.33
SVR (Best Parameters) - MAE: 0.11, RMSE: 0.34
KNN (Best Parameters) - MAE: 0.02, RMSE: 0.33
LightGBM (Best Parameters) - MAE: 0.03, RMSE: 0.33


In [109]:
def allocate_resources(predicted_demand):
    """Allocates resources based on the predicted demand."""
    if predicted_demand >= 30:
        return 5  
    elif predicted_demand >= 20:
        return 3  
    elif predicted_demand >= 10:
        return 2  
    else:
        return 1  

# Creating a DataFrame to store predictions and resource allocation
allocation_df = pd.DataFrame({
    'pickup_hour': X_test['pickup_hour'],
    'day_of_week': X_test['day_of_week'],
})

allocation_df['predicted_demand'] = rf_pred  

# Displaying allocations
print(allocation_df[['pickup_hour', 'day_of_week', 'predicted_demand']])

       pickup_hour  day_of_week  predicted_demand
28358           16            4          1.017967
13871           10            5          1.008282
12362            9            3          1.023380
47068           23            5          1.000000
47881           23            1          1.015522
...            ...          ...               ...
9296             8            1          1.025595
40327           20            4          1.034310
12461            9            4          1.031647
12495            9            5          1.043013
34337           18            0          1.031868

[14770 rows x 3 columns]


In [110]:
# Summarize the total drivers needed for each hour and day of the week
summary_allocation = allocation_df.groupby(['pickup_hour', 'day_of_week']).sum().reset_index()
x= pd.DataFrame(summary_allocation)
x

Unnamed: 0,pickup_hour,day_of_week,predicted_demand
0,0,0,45.000000
1,0,1,34.000000
2,0,2,66.601817
3,0,3,123.831983
4,0,4,73.000000
...,...,...,...
160,23,2,59.255721
161,23,3,122.430726
162,23,4,163.000000
163,23,5,213.000000


In [108]:
# Define the allocate_resources function
def allocate_resources(predicted_demand):
    """Allocates resources based on the predicted demand."""
    if predicted_demand >= 30:
        return 5  
    elif predicted_demand >= 20:
        return 3  
    elif predicted_demand >= 10:
        return 2  
    else:
        return 1  

# Creating a DataFrame to store predictions and resource allocation
allocation_df = pd.DataFrame({
    'pickup_hour': X_test['pickup_hour'],
    'day_of_week': X_test['day_of_week'],
})

# Assuming rf_pred is your model's predicted demand (e.g., total trip count or another measure)
allocation_df['predicted_demand'] = rf_pred

# Ensure that the predicted demand is correctly interpreted for resource allocation
# For example, use the mean or total predicted demand per hour/day/location
allocation_df['allocated_drivers'] = allocation_df['predicted_demand'].apply(allocate_resources)

# Displaying allocations
print(allocation_df[['pickup_hour', 'day_of_week', 'predicted_demand', 'allocated_drivers']])


       pickup_hour  day_of_week  predicted_demand  allocated_drivers
28358           16            4          1.017967                  1
13871           10            5          1.008282                  1
12362            9            3          1.023380                  1
47068           23            5          1.000000                  1
47881           23            1          1.015522                  1
...            ...          ...               ...                ...
9296             8            1          1.025595                  1
40327           20            4          1.034310                  1
12461            9            4          1.031647                  1
12495            9            5          1.043013                  1
34337           18            0          1.031868                  1

[14770 rows x 4 columns]
