## Importing libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from xgboost import XGBRegressor

## Loading data

In [2]:
# Directory that holds the processed CSV exports. Change it here to point the
# whole notebook at a different data location.
DATA_DIR = "../Processed_Files"

# Fact table plus the six dimension tables that get merged onto it by their
# *_id keys.
fact = pd.read_csv(f"{DATA_DIR}/fact_trips.csv")
# NOTE: the name `datetime` would shadow the stdlib module if it were ever
# imported in this notebook; it is kept because later cells reference it.
datetime = pd.read_csv(f"{DATA_DIR}/datetime_dim.csv")
pickup_loc = pd.read_csv(f"{DATA_DIR}/pickup_location_dim.csv")
dropoff_loc = pd.read_csv(f"{DATA_DIR}/dropoff_location_dim.csv")
payment = pd.read_csv(f"{DATA_DIR}/payment_type_dim.csv")
vendor = pd.read_csv(f"{DATA_DIR}/vendor_dim.csv")
rate_code = pd.read_csv(f"{DATA_DIR}/rate_code_dim.csv")

## Merging data

In [3]:
# Attach every dimension table to the fact table. Left joins keep each fact
# row even when its key is missing from a dimension.
# validate='many_to_one' makes pandas raise MergeError if a dimension key is
# duplicated, instead of silently multiplying fact rows.
df = (
    fact
    .merge(datetime, on='datetime_id', how='left', validate='many_to_one')
    .merge(pickup_loc, on='pickup_location_id', how='left', validate='many_to_one')
    .merge(dropoff_loc, on='dropoff_location_id', how='left', validate='many_to_one')
    .merge(payment, on='payment_type_id', how='left', validate='many_to_one')
    .merge(vendor, on='vendor_id', how='left', validate='many_to_one')
    .merge(rate_code, on='rate_code_id', how='left', validate='many_to_one')
)

In [4]:
# List every column of the merged frame (fact columns plus the joined dimension attributes).
df.columns

Index(['vendor_id', 'rate_code_id', 'payment_type_id', 'pickup_location_id',
       'dropoff_location_id', 'datetime_id', 'passenger_count',
       'trip_distance', 'fare_amount', 'extra', 'mta_tax', 'tip_amount',
       'tolls_amount', 'total_amount', 'congestion_surcharge', 'airport_fee',
       'trip_id', 'pickup_datetime', 'dropoff_datetime', 'pick_hour',
       'pick_day', 'pick_month', 'pick_year', 'pick_weekday', 'drop_hour',
       'drop_day', 'drop_month', 'drop_year', 'drop_weekday', 'pickup_borough',
       'pickup_zone', 'pickup_service_zone', 'dropoff_borough', 'dropoff_zone',
       'dropoff_service_zone', 'payment_type_description', 'vendor_name',
       'rate_code_description'],
      dtype='object')

In [5]:
# Preview the first rows of the merged frame.
df.head()

Unnamed: 0,vendor_id,rate_code_id,payment_type_id,pickup_location_id,dropoff_location_id,datetime_id,passenger_count,trip_distance,fare_amount,extra,...,drop_weekday,pickup_borough,pickup_zone,pickup_service_zone,dropoff_borough,dropoff_zone,dropoff_service_zone,payment_type_description,vendor_name,rate_code_description
0,2,1,2,186,79,68719533120,1,1.72,17.7,1.0,...,Mon,Manhattan,Penn Station/Madison Sq West,Yellow Zone,Manhattan,East Village,Yellow Zone,Cash,VeriFone Inc.,Standard rate
1,1,1,1,140,236,51539649082,1,1.8,10.0,3.5,...,Mon,Manhattan,Lenox Hill East,Yellow Zone,Manhattan,Upper East Side North,Yellow Zone,Credit card,"Creative Mobile Technologies, LLC",Standard rate
2,1,1,1,236,79,25769827335,1,4.7,23.3,3.5,...,Mon,Manhattan,Upper East Side North,Yellow Zone,Manhattan,East Village,Yellow Zone,Credit card,"Creative Mobile Technologies, LLC",Standard rate
3,1,1,1,79,211,8589996090,1,1.4,10.0,3.5,...,Mon,Manhattan,East Village,Yellow Zone,Manhattan,SoHo,Yellow Zone,Credit card,"Creative Mobile Technologies, LLC",Standard rate
4,1,1,1,211,148,51539635018,1,0.8,7.9,3.5,...,Mon,Manhattan,SoHo,Yellow Zone,Manhattan,Lower East Side,Yellow Zone,Credit card,"Creative Mobile Technologies, LLC",Standard rate


In [6]:
# Report (rows, columns) and the total number of cells (rows × columns).
print("Shape: ", df.shape)
print("Size: ", df.size)

Shape:  (2795799, 38)
Size:  106240362


In [7]:
# Summarise the index range and the dtype of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2795799 entries, 0 to 2795798
Data columns (total 38 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   vendor_id                 int64  
 1   rate_code_id              int64  
 2   payment_type_id           int64  
 3   pickup_location_id        int64  
 4   dropoff_location_id       int64  
 5   datetime_id               int64  
 6   passenger_count           int64  
 7   trip_distance             float64
 8   fare_amount               float64
 9   extra                     float64
 10  mta_tax                   float64
 11  tip_amount                float64
 12  tolls_amount              float64
 13  total_amount              float64
 14  congestion_surcharge      float64
 15  airport_fee               float64
 16  trip_id                   int64  
 17  pickup_datetime           object 
 18  dropoff_datetime          object 
 19  pick_hour                 int64  
 20  pick_day                

In [8]:
# Audit: how many trips carry a negative total_amount, and what share of the
# whole dataset do they represent?
is_negative_total = df['total_amount'] < 0.0

total_rows = df.shape[0]
negative_rows_count = int(is_negative_total.sum())
percentage_negative = (negative_rows_count / total_rows) * 100

print(f"Rows with negative total_amount: {negative_rows_count}")
print(f"Percentage of negative total_amount rows: {percentage_negative:.4f}%")

Rows with negative total_amount: 35502
Percentage of negative total_amount rows: 1.2698%


In [9]:
# Clamp negative totals to zero, in place on the merged frame.
# NOTE(review): zeroed rows stay in the data as 0-valued targets. Confirm this
# is preferred over dropping them before modelling.
negative_total_mask = df['total_amount'] < 0.0
df.loc[negative_total_mask, 'total_amount'] = 0.0

# Sanity check: the count printed below should be zero after the clamp.
remaining_negative = (df['total_amount'] < 0.0).sum()
print(f"Remaining negative total_amount values: {remaining_negative}")

Remaining negative total_amount values: 0


In [10]:
# Repeat the negative-total audit on the current state of the frame and print
# the count and percentage again.
still_negative = df['total_amount'] < 0.0

total_rows = len(df.index)
negative_rows_count = int(still_negative.sum())
percentage_negative = (negative_rows_count / total_rows) * 100

print(f"Rows with negative total_amount: {negative_rows_count}")
print(f"Percentage of negative total_amount rows: {percentage_negative:.4f}%")

Rows with negative total_amount: 0
Percentage of negative total_amount rows: 0.0000%


In [11]:
# Model inputs: trip size plus pickup/drop-off calendar fields.
# NOTE(review): the drop_* fields are presumably only known once a trip has
# ended. Confirm the model is not meant to price trips up-front.
features = [
    'trip_distance',
    'passenger_count',
    'pick_hour',
    'pick_day',
    'pick_month',
    'drop_hour',
    'drop_day',
    'drop_month',
]

# Column the regressors are trained to predict.
target = 'total_amount'

In [12]:
# Count missing values in each modelling column (features + target).
# Nothing is dropped here; the cell only reports the counts.
df[features + [target]].isna().sum()

trip_distance      0
passenger_count    0
pick_hour          0
pick_day           0
pick_month         0
drop_hour          0
drop_day           0
drop_month         0
total_amount       0
dtype: int64

In [13]:
# Draw a reproducible 100,000-row random subset of the modelling columns.
# The .copy() detaches it from the full frame.
df_sample = (
    df[[*features, target]]
    .sample(n=100_000, random_state=42)
    .copy()
)

In [14]:
# Split the sample into the feature matrix and the target vector.
X, y = df_sample[features], df_sample[target]

In [15]:
# Hold out 20% of the sample for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Preprocessing: standardise every column in `features`.
# All of them go through the scaler; no one-hot encoding is applied.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), features),
    ]
)

In [17]:
# Candidate regressors, keyed by the display name used in the printed reports.
# Seeds are fixed wherever the estimator is stochastic.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    # tree_method="hist" runs on any machine. The previous
    # tree_method="gpu_hist" / predictor="gpu_predictor" settings need a
    # CUDA-enabled XGBoost build and a GPU, and are deprecated in XGBoost 2.x.
    # To train on a GPU with XGBoost >= 2.0, add device="cuda".
    "XGBoost": XGBRegressor(n_estimators=100, tree_method="hist", random_state=42),
}

In [18]:
# Evaluate all models
# Accumulator for one (name, test RMSE, test R²) tuple per model; the training
# loop appends to it.
results = []

In [19]:
# Fit each candidate in its own preprocessing + regressor pipeline and report
# RMSE / R² on both the training split and the held-out test split.
# Reset the accumulator here so re-running this cell never appends duplicate
# entries to the model comparison.
results = []

for name, reg in models.items():
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', reg)
    ])

    print(f"\n Training {name}...")
    pipe.fit(X_train, y_train)

    # Training-set metrics. Compared with the test metrics below, they show
    # how much each model overfits.
    y_train_pred = pipe.predict(X_train)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    train_r2 = r2_score(y_train, y_train_pred)
    print(f"{name} → Train RMSE: {train_rmse:.2f}, Train R²: {train_r2:.4f}")

    # Held-out test metrics. These are the figures stored for the comparison.
    y_pred = pipe.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f"{name} → RMSE: {rmse:.2f}, R²: {r2:.4f}")

    results.append((name, rmse, r2))


 Training Linear Regression...
Linear Regression → Train RMSE: 10.07, Train R²: 0.8062
Linear Regression → RMSE: 9.53, R²: 0.8176

 Training Random Forest...
Random Forest → Train RMSE: 4.55, Train R²: 0.9605
Random Forest → RMSE: 9.79, R²: 0.8073

 Training Gradient Boosting...
Gradient Boosting → Train RMSE: 8.97, Train R²: 0.8463
Gradient Boosting → RMSE: 8.93, R²: 0.8397

 Training XGBoost...
XGBoost → Train RMSE: 8.00, Train R²: 0.8775
XGBoost → RMSE: 9.77, R²: 0.8083


In [20]:
# Rank the models by test RMSE (lowest first, sorted in place) and print a
# fixed-width comparison table.
results.sort(key=lambda entry: entry[1])

print("\nModel Comparison:")
for name, rmse, r2 in results:
    row = f"{name:<25} | RMSE: {rmse:.2f} | R²: {r2:.4f}"
    print(row)


Model Comparison:
Gradient Boosting         | RMSE: 8.93 | R²: 0.8397
Linear Regression         | RMSE: 9.53 | R²: 0.8176
XGBoost                   | RMSE: 9.77 | R²: 0.8083
Random Forest             | RMSE: 9.79 | R²: 0.8073


In [21]:
# Everything that was not drawn into the sample: drop the sampled rows by
# their index labels.
df_unseen = df.drop(index=df_sample.index)

In [22]:
# Feature matrix and target vector for the rows outside the sample.
X_unseen, y_unseen = df_unseen[features], df_unseen[target]

In [23]:
# Build a fresh preprocessing + Gradient Boosting pipeline, using the same
# settings as the Gradient Boosting entry in the model comparison.
# (mean_squared_error and r2_score are already imported at the top of the notebook.)
gb_steps = [
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(n_estimators=100, random_state=42)),
]
pipe = Pipeline(gb_steps)

# Fit on the training split of the sample only.
pipe.fit(X_train, y_train)

# Score the rows that were left out of the sample.
y_unseen_pred = pipe.predict(X_unseen)
unseen_mse = mean_squared_error(y_unseen, y_unseen_pred)
unseen_rmse = np.sqrt(unseen_mse)
unseen_r2 = r2_score(y_unseen, y_unseen_pred)

print(f"Gradient Boosting on Unseen Data → RMSE: {unseen_rmse:.2f}, R²: {unseen_r2:.4f}")

Gradient Boosting on Unseen Data → RMSE: 9.35, R²: 0.8265


In [24]:
import pickle

# Build the preprocessing + Gradient Boosting pipeline and fit it on the
# training split of the sample. The fit happens here, not in an earlier cell.
final_steps = [
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(n_estimators=100, random_state=42)),
]
pipe = Pipeline(final_steps)
pipe.fit(X_train, y_train)

# Persist the whole fitted pipeline (scaler + regressor) as a pickle file in
# the current working directory.
with open('gradient_boosting_model.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

print("Model saved as 'gradient_boosting_model.pkl'")

Model saved as 'gradient_boosting_model.pkl'
