In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb

# Load dataset
df = pd.read_csv('yellow_tripdata_2015-01.csv')

# Display column names
print(df.columns)
print()

# Preprocessing: discard incomplete and duplicated trip records.
# Rebinding (instead of inplace=True) keeps every step a plain expression.
# NOTE(review): dropna() inspects every column, so a trip that is only missing
# a value in a column the demand model never uses is discarded too - confirm
# that this is intended.
df = df.dropna().drop_duplicates()

# Parse the timestamp columns; unparseable values become NaT (errors='coerce').
df = df.assign(
    tpep_pickup_datetime=lambda d: pd.to_datetime(d['tpep_pickup_datetime'], errors='coerce'),
    tpep_dropoff_datetime=lambda d: pd.to_datetime(d['tpep_dropoff_datetime'], errors='coerce'),
)

# Remove rows whose pickup time could not be parsed. The groupby below excludes
# NaN keys by default, so these rows never counted towards demand; dropping them
# explicitly keeps the hour/day feature columns integer-typed instead of
# silently becoming floats.
df = df.dropna(subset=['tpep_pickup_datetime'])

# Add additional features to the original dataframe
df = df.assign(
    pickup_hour=lambda d: d['tpep_pickup_datetime'].dt.hour,
    pickup_date=lambda d: d['tpep_pickup_datetime'].dt.date,
    pickup_day=lambda d: d['tpep_pickup_datetime'].dt.dayofweek,  # Monday=0, Sunday=6
    # Vectorized weekend flag (Saturday=5, Sunday=6) instead of a row-wise apply
    is_weekend=lambda d: (d['pickup_day'] >= 5).astype('int64'),
)

# Group by date and hour to compute demand: one row per (date, hour) with the
# number of pickups in that hour as `ride_count`
demand_df = df.groupby(['pickup_date', 'pickup_hour', 'pickup_day', 'is_weekend']).size().reset_index(name='ride_count')

# Prepare features and target
features = ['pickup_hour', 'pickup_day', 'is_weekend']
X = demand_df[features]
y = demand_df['ride_count']

# Split the dataset (80% train / 20% test, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training and Evaluation
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
    X_train, y_train : training features and target passed to ``fit``
    X_test, y_test : held-out features and target used for scoring

    Returns
    -------
    tuple
        ``(mse, mae, r2)`` computed from ``y_test`` and the predictions
        the fitted model makes for ``X_test``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # Metrics are evaluated in the order they are returned: MSE, MAE, R²
    scorers = (mean_squared_error, mean_absolute_error, r2_score)
    return tuple(scorer(y_test, predictions) for scorer in scorers)

# Candidate regressors, all seeded with the same random_state
rf_model = RandomForestRegressor(random_state=42)    # Random Forest
gb_model = GradientBoostingRegressor(random_state=42)  # Gradient Boosting
xgb_model = xgb.XGBRegressor(random_state=42)        # XGBoost

# Fit every candidate on the training split and score it on the test split
rf_mse, rf_mae, rf_r2 = train_and_evaluate_model(
    rf_model, X_train, X_test, y_train, y_test
)
gb_mse, gb_mae, gb_r2 = train_and_evaluate_model(
    gb_model, X_train, X_test, y_train, y_test
)
xgb_mse, xgb_mae, xgb_r2 = train_and_evaluate_model(
    xgb_model, X_train, X_test, y_train, y_test
)

# Report the metrics, one line per model
print(
    "Model Performance Metrics:",
    f"Random Forest -> MSE: {rf_mse}, MAE: {rf_mae}, R²: {rf_r2}",
    f"Gradient Boosting -> MSE: {gb_mse}, MAE: {gb_mae}, R²: {gb_r2}",
    f"XGBoost -> MSE: {xgb_mse}, MAE: {xgb_mae}, R²: {xgb_r2}",
    sep="\n",
)


Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'pickup_longitude',
       'pickup_latitude', 'RateCodeID', 'store_and_fwd_flag',
       'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount',
       'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
       'improvement_surcharge', 'total_amount'],
      dtype='object')

Model Performance Metrics:
Random Forest -> MSE: 20977396.708558526, MAE: 2853.869679189938, R²: 0.7199108350229466
Gradient Boosting -> MSE: 19839517.058945052, MAE: 3081.5908158807833, R²: 0.7351037479154501
XGBoost -> MSE: 22220001.409389045, MAE: 2952.3006018312185, R²: 0.7033196687698364


In [26]:
from sklearn.tree import DecisionTreeRegressor

# Decision Tree Model
# The listed hyperparameters are written out explicitly for visibility.
# random_state is pinned (same seed as the other models in this notebook)
# because scikit-learn permutes features at each split, so ties between equally
# good splits could otherwise be resolved differently from run to run.
dt_model = DecisionTreeRegressor(criterion='squared_error', max_depth=None,
                                 min_samples_split=2, min_samples_leaf=1, max_leaf_nodes=None,
                                 random_state=42)
dt_mse, dt_mae, dt_r2 = train_and_evaluate_model(dt_model, X_train, X_test, y_train, y_test)

# Print Metrics for Decision Tree
print(f"Decision Tree -> MSE: {dt_mse}, MAE: {dt_mae}, R²: {dt_r2}")


Decision Tree -> MSE: 22269423.649515294, MAE: 2956.8780760626396, R²: 0.7026597551083071


In [None]:
# Random Forest Model with its hyperparameters written out explicitly.
# bootstrap=True resamples the training rows for every tree, so random_state is
# pinned (same seed as the first Random Forest run in this notebook) to make the
# reported metrics reproducible.
rf_model = RandomForestRegressor(n_estimators=100, criterion='squared_error', max_depth=None,
                                 min_samples_split=2, min_samples_leaf=1, max_leaf_nodes=None, bootstrap=True,
                                 random_state=42)
rf_mse, rf_mae, rf_r2 = train_and_evaluate_model(rf_model, X_train, X_test, y_train, y_test)
print(f"Random Forest -> MSE: {rf_mse}, MAE: {rf_mae}, R²: {rf_r2}")

In [24]:

# Gradient Boosting Model with its hyperparameters written out explicitly.
# random_state is pinned (same seed as the first Gradient Boosting run in this
# notebook) so that the fitted trees, and therefore the metrics, are repeatable.
gb_model = GradientBoostingRegressor(loss='squared_error', learning_rate=0.1, n_estimators=100, subsample=1.0,
                                     random_state=42)
gb_mse, gb_mae, gb_r2 = train_and_evaluate_model(gb_model, X_train, X_test, y_train, y_test)
print(f"Gradient Boosting -> MSE: {gb_mse}, MAE: {gb_mae}, R²: {gb_r2}")

Gradient Boosting -> MSE: 19839517.05894505, MAE: 3081.5908158807833, R²: 0.7351037479154502
