In [1]:
import pandas as pd
from tabulate import tabulate

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [18]:
# Load the aggregated FHVHV trip data for June 2024 (per the file name).
# NOTE(review): 'filepath/' looks like a placeholder directory — confirm the
# real location (ideally relative to a configurable data directory).
agg_filepath = 'filepath/aggregated_fhvhv_tripdata_2024-06.csv'
agg_df = pd.read_csv(agg_filepath)

In [19]:
# Sanity-check the loaded frame: column dtypes and non-null counts.
# 'pickup_date' is read from the CSV as a plain object (string) column; it is
# parsed with pd.to_datetime later in the notebook before features are derived.
agg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   pickup_date           720 non-null    object 
 1   pickup_hour           720 non-null    int64  
 2   is_weekend            720 non-null    int64  
 3   ride_count            720 non-null    int64  
 4   Temperature (F)       720 non-null    int64  
 5   Wind Speed (mph)      720 non-null    int64  
 6   Precip. (in)          720 non-null    float64
 7   Condition_All_Cloudy  720 non-null    int64  
 8   Condition_All_Fair    720 non-null    int64  
 9   Condition_All_Fog     720 non-null    int64  
 10  Condition_All_Rain    720 non-null    int64  
 11  Condition_All_Storm   720 non-null    int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 67.6+ KB


In [20]:
# Parse 'pickup_date' into datetimes without touching `agg_df`.
# The previous version overwrote `agg_df` with 'pickup_date' dropped, so running
# this cell a second time failed with a KeyError. Building a separate
# `model_df` keeps the cell idempotent and leaves the loaded data intact.
pickup_dates = pd.to_datetime(agg_df['pickup_date'])

# Decided day of week and day of month to be better features than 'is_weekend'.
# Extract day of the month and day of the week from 'pickup_date'
# (no month feature is derived here).
model_df = (
    agg_df
    .assign(
        day_of_month=pickup_dates.dt.day,
        day_of_week=pickup_dates.dt.dayofweek,
    )
    # Drop the columns that the derived features replace
    .drop(columns=['pickup_date', 'is_weekend'])
)

# Define the feature matrix (X) and the target vector (y)
X = model_df.drop(columns=['ride_count'])
y = model_df['ride_count']

# Split the data into training and testing sets (80/20, fixed seed).
# NOTE(review): this is a random split over rows of a single month, so test
# rows are interleaved in time with training rows — confirm that is intended
# rather than a chronological hold-out.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Regressor (seeded for reproducibility)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on the held-out test data
y_pred = rf_model.predict(X_test)

# Evaluate the model; MSE is in squared ride-count units
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

Mean Squared Error: 2910025.33
R^2 Score: 0.97


In [13]:
# Check which features are most important according to the fitted forest.
# Feature names come from X.columns, the frame the model was trained from,
# so each importance lines up with its column.
feature_importances = rf_model.feature_importances_
feature_names = X.columns
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# tabulate() returns the formatted table as a string, so print it exactly once.
# The earlier print(print(...)) also printed the inner print's return value,
# which added a stray "None" line under the table.
print(tabulate(importance_df, headers='keys'))

    Feature                 Importance
--  --------------------  ------------
 0  pickup_hour            0.778395
10  day_of_week            0.176905
 1  Temperature (F)        0.0153234
 9  day_of_month           0.0144242
 2  Wind Speed (mph)       0.00969672
 7  Condition_All_Rain     0.00194023
 6  Condition_All_Fog      0.00118242
 5  Condition_All_Fair     0.000992567
 4  Condition_All_Cloudy   0.00099184
 3  Precip. (in)           0.000126894
 8  Condition_All_Storm    2.17364e-05
None


In [14]:
# 5-fold cross-validation of the same estimator configuration on the full
# June 2024 feature matrix, scored with R-squared.
cv_scores = cross_val_score(estimator=rf_model, X=X, y=y, scoring='r2', cv=5)

# Report the per-fold scores followed by their average.
mean_cv_r2 = cv_scores.mean()
print(f"Cross-validated R-squared scores: {cv_scores}")
print(f"Mean R-squared score: {mean_cv_r2}")

Cross-validated R-squared scores: [0.97100246 0.97862778 0.95121257 0.9257694  0.92426305]
Mean R-squared score: 0.9501750532625156


In [16]:
# Testing model with data from previous year (June 2023).
# This file is only used to evaluate the already-trained model; it is never
# used for fitting in this notebook.
# NOTE(review): 'filepath/' looks like a placeholder directory — confirm.
historical_filepath = 'filepath/aggregated_fhvhv_tripdata_2023-06.csv'
historical_df = pd.read_csv(historical_filepath)

In [17]:
# Parse 'pickup_date' into datetimes without modifying `historical_df`.
# The previous version overwrote `historical_df` with 'pickup_date' dropped,
# so re-running this cell raised a KeyError; deriving a separate frame keeps
# the cell safe to run repeatedly.
historical_dates = pd.to_datetime(historical_df['pickup_date'])

# Apply the same feature engineering as for the training data: add day of the
# month and day of the week, then drop 'pickup_date' and 'is_weekend'.
historical_model_df = (
    historical_df
    .assign(
        day_of_month=historical_dates.dt.day,
        day_of_week=historical_dates.dt.dayofweek,
    )
    .drop(columns=['pickup_date', 'is_weekend'])
)

# Define the feature matrix (X) and target (y) for the historical dataset.
# Selecting X.columns puts the features in exactly the order used for training
# and fails loudly (KeyError) if the 2023 file lacks a column the model needs,
# instead of relying on the two CSVs happening to share a column layout.
X_historical = historical_model_df.drop(columns=['ride_count'])[list(X.columns)]
y_historical = historical_model_df['ride_count']

# Use the trained model to predict ride count on the historical dataset
y_historical_pred = rf_model.predict(X_historical)

# Calculate the Mean Squared Error and R^2 score for the historical dataset
mse_historical = mean_squared_error(y_historical, y_historical_pred)
r2_historical = r2_score(y_historical, y_historical_pred)

print(f"Mean Squared Error on Historical Data: {mse_historical:.2f}")
print(f"R^2 Score on Historical Data: {r2_historical:.2f}")

Mean Squared Error on Historical Data: 6148060.77
R^2 Score on Historical Data: 0.94
