In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [2]:
# 1. Load all Excel files
fact = pd.read_csv("../Processed_Files/fact_trips.csv")
datetime = pd.read_csv("../Processed_Files/datetime_dim.csv")
pickup_loc = pd.read_csv("../Processed_Files/pickup_location_dim.csv")
dropoff_loc = pd.read_csv("../Processed_Files/dropoff_location_dim.csv")
payment = pd.read_csv("../Processed_Files/payment_type_dim.csv")
vendor = pd.read_csv("../Processed_Files/vendor_dim.csv")
rate_code = pd.read_csv("../Processed_Files/rate_code_dim.csv")

In [3]:
# 2. Merge all dimensions into the fact table
df = fact.merge(datetime, on='datetime_id', how='left') \
         .merge(pickup_loc, on='pickup_location_id', how='left') \
         .merge(dropoff_loc, on='dropoff_location_id', how='left') \
         .merge(payment, on='payment_type_id', how='left') \
         .merge(vendor, on='vendor_id', how='left') \
         .merge(rate_code, on='rate_code_id', how='left')

In [4]:
df.head()

Unnamed: 0,vendor_id,rate_code_id,payment_type_id,pickup_location_id,dropoff_location_id,datetime_id,passenger_count,trip_distance,fare_amount,extra,...,drop_weekday,pickup_borough,pickup_zone,pickup_service_zone,dropoff_borough,dropoff_zone,dropoff_service_zone,payment_type_description,vendor_name,rate_code_description
0,2,1,2,186,79,68719533120,1,1.72,17.7,1.0,...,Mon,Manhattan,Penn Station/Madison Sq West,Yellow Zone,Manhattan,East Village,Yellow Zone,Cash,VeriFone Inc.,Standard rate
1,1,1,1,140,236,51539649082,1,1.8,10.0,3.5,...,Mon,Manhattan,Lenox Hill East,Yellow Zone,Manhattan,Upper East Side North,Yellow Zone,Credit card,"Creative Mobile Technologies, LLC",Standard rate
2,1,1,1,236,79,25769827335,1,4.7,23.3,3.5,...,Mon,Manhattan,Upper East Side North,Yellow Zone,Manhattan,East Village,Yellow Zone,Credit card,"Creative Mobile Technologies, LLC",Standard rate
3,1,1,1,79,211,8589996090,1,1.4,10.0,3.5,...,Mon,Manhattan,East Village,Yellow Zone,Manhattan,SoHo,Yellow Zone,Credit card,"Creative Mobile Technologies, LLC",Standard rate
4,1,1,1,211,148,51539635018,1,0.8,7.9,3.5,...,Mon,Manhattan,SoHo,Yellow Zone,Manhattan,Lower East Side,Yellow Zone,Credit card,"Creative Mobile Technologies, LLC",Standard rate


In [5]:
df.size

106240362

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2795799 entries, 0 to 2795798
Data columns (total 38 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   vendor_id                 int64  
 1   rate_code_id              int64  
 2   payment_type_id           int64  
 3   pickup_location_id        int64  
 4   dropoff_location_id       int64  
 5   datetime_id               int64  
 6   passenger_count           int64  
 7   trip_distance             float64
 8   fare_amount               float64
 9   extra                     float64
 10  mta_tax                   float64
 11  tip_amount                float64
 12  tolls_amount              float64
 13  total_amount              float64
 14  congestion_surcharge      float64
 15  airport_fee               float64
 16  trip_id                   int64  
 17  pickup_datetime           object 
 18  dropoff_datetime          object 
 19  pick_hour                 int64  
 20  pick_day                

In [7]:
# 3. Select features and target
features = [
    'trip_distance', 'passenger_count',
    'pick_hour', 'pick_day', 'pick_month', 'pick_weekday',
    'vendor_name', 'rate_code_description', 'payment_type_description',
    'pickup_borough', 'pickup_zone', 'dropoff_borough', 'dropoff_zone'
]
target = 'fare_amount'

In [8]:
# 4. Drop NA and keep relevant data
df[features + [target]].isna().sum()

trip_distance                   0
passenger_count                 0
pick_hour                       0
pick_day                        0
pick_month                      0
pick_weekday                    0
vendor_name                     0
rate_code_description           0
payment_type_description        0
pickup_borough               1360
pickup_zone                 10167
dropoff_borough             11703
dropoff_zone                15811
fare_amount                     0
dtype: int64

In [9]:
df[['pickup_borough', 'pickup_zone', 'dropoff_borough', 'dropoff_zone']].dtypes

pickup_borough     object
pickup_zone        object
dropoff_borough    object
dropoff_zone       object
dtype: object

In [10]:
df[['pickup_borough', 'pickup_zone', 'dropoff_borough', 'dropoff_zone']].nunique()

pickup_borough       7
pickup_zone        250
dropoff_borough      7
dropoff_zone       258
dtype: int64

In [11]:
df[['pickup_borough', 'pickup_zone', 'dropoff_borough', 'dropoff_zone']].head(10)

Unnamed: 0,pickup_borough,pickup_zone,dropoff_borough,dropoff_zone
0,Manhattan,Penn Station/Madison Sq West,Manhattan,East Village
1,Manhattan,Lenox Hill East,Manhattan,Upper East Side North
2,Manhattan,Upper East Side North,Manhattan,East Village
3,Manhattan,East Village,Manhattan,SoHo
4,Manhattan,SoHo,Manhattan,Lower East Side
5,Manhattan,Lower East Side,Manhattan,Lenox Hill West
6,Queens,LaGuardia Airport,Brooklyn,Park Slope
7,Manhattan,West Chelsea/Hudson Yards,Manhattan,TriBeCa/Civic Center
8,Manhattan,Midtown Center,Manhattan,World Trade Center
9,Manhattan,Greenwich Village North,Manhattan,Greenwich Village North


**Observation**

Since the categorical features contain missing values I will impute them using Unknown

In [12]:
df['pickup_borough'].fillna('Unknown', inplace=True)
df['pickup_zone'].fillna('Unknown', inplace=True)
df['dropoff_borough'].fillna('Unknown', inplace=True)
df['dropoff_zone'].fillna('Unknown', inplace=True)

In [13]:
df_model = df[features + [target]].copy()

In [14]:
df_model.isna().sum()

trip_distance               0
passenger_count             0
pick_hour                   0
pick_day                    0
pick_month                  0
pick_weekday                0
vendor_name                 0
rate_code_description       0
payment_type_description    0
pickup_borough              0
pickup_zone                 0
dropoff_borough             0
dropoff_zone                0
fare_amount                 0
dtype: int64

In [15]:
df_model.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2795799 entries, 0 to 2795798
Data columns (total 14 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   trip_distance             float64
 1   passenger_count           int64  
 2   pick_hour                 int64  
 3   pick_day                  int64  
 4   pick_month                int64  
 5   pick_weekday              object 
 6   vendor_name               object 
 7   rate_code_description     object 
 8   payment_type_description  object 
 9   pickup_borough            object 
 10  pickup_zone               object 
 11  dropoff_borough           object 
 12  dropoff_zone              object 
 13  fare_amount               float64
dtypes: float64(2), int64(4), object(8)
memory usage: 320.0+ MB


In [16]:
# 5. Define feature types
categorical_features = [
    'vendor_name', 'rate_code_description', 'payment_type_description',
    'pickup_borough', 'pickup_zone', 'dropoff_borough', 'dropoff_zone',
    'pick_weekday'  # <== FIX: move here
]

numeric_features = [
    'trip_distance', 'passenger_count', 'pick_hour', 'pick_day', 'pick_month'
]

In [17]:
# 6. Preprocessing: One-hot encode categoricals, scale numerics
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), numeric_features)
])

In [21]:
# 7. Define model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

In [22]:
# 8. Train-test split
X = df_model[features]
y = df_model[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# 9. Fit the model
model.fit(X_train, y_train)

In [None]:
# 10. Predict and evaluate
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

In [None]:
print(f"✅ RMSE: {rmse:.2f}")
print(f"✅ R² Score: {r2:.2f}")

In [None]:
import joblib

# Save pipeline to a file
joblib.dump(model, 'fare_amount_prediction_model.pkl')

print("✅ Model saved as 'fare_amount_prediction_model.pkl'")

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from xgboost import XGBRegressor

In [19]:
# Sample data
df_sample = df[features + [target]].sample(n=100_000, random_state=42).copy()

In [20]:
X = df_sample[features]
y = df_sample[target]

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
# Define feature types
categorical_features = [
    'vendor_name', 'rate_code_description', 'payment_type_description',
    'pickup_borough', 'pickup_zone', 'dropoff_borough', 'dropoff_zone',
    'pick_weekday'
]

numeric_features = [
    'trip_distance', 'passenger_count', 'pick_hour', 'pick_day', 'pick_month'
]

In [23]:
# Preprocessor
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), numeric_features)
])

In [27]:
# Models to try
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, tree_method="gpu_hist", predictor="gpu_predictor", random_state=42)
}

In [28]:
# Evaluate all models
results = []

In [29]:
for name, reg in models.items():
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', reg)
    ])
    
    print(f"\n⏳ Training {name}...")
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    
    print(f"✅ {name} → RMSE: {rmse:.2f}, R²: {r2:.4f}")
    results.append((name, rmse, r2))


⏳ Training Linear Regression...
✅ Linear Regression → RMSE: 8.51, R²: 0.7846

⏳ Training Random Forest...
✅ Random Forest → RMSE: 7.72, R²: 0.8227

⏳ Training Gradient Boosting...
✅ Gradient Boosting → RMSE: 7.65, R²: 0.8259

⏳ Training XGBoost...
✅ XGBoost → RMSE: 8.12, R²: 0.8037


In [30]:
# Sort and display results
results.sort(key=lambda x: x[1])  # Sort by RMSE
print("\n📊 Model Comparison:")

for name, rmse, r2 in results:
    print(f"{name:<25} | RMSE: {rmse:.2f} | R²: {r2:.4f}")


📊 Model Comparison:
Gradient Boosting         | RMSE: 7.65 | R²: 0.8259
Random Forest             | RMSE: 7.72 | R²: 0.8227
XGBoost                   | RMSE: 8.12 | R²: 0.8037
Linear Regression         | RMSE: 8.51 | R²: 0.7846


In [31]:
best_model_name = "Gradient Boosting"

In [32]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import joblib

# Reuse these
categorical_features = [
    'vendor_name', 'rate_code_description', 'payment_type_description',
    'pickup_borough', 'pickup_zone', 'dropoff_borough', 'dropoff_zone',
    'pick_weekday'
]
numeric_features = [
    'trip_distance', 'passenger_count', 'pick_hour', 'pick_day', 'pick_month'
]

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), numeric_features)
])

# Final model pipeline
final_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(n_estimators=100, random_state=42))
])

In [33]:
# Fit on full sampled dataset
final_model.fit(X_train, y_train)

In [34]:

# Save to file
joblib.dump(final_model, 'best_fare_prediction_model.pkl')
print("✅ Best model saved as 'best_fare_prediction_model.pkl'")


✅ Best model saved as 'best_fare_prediction_model.pkl'


In [35]:
import joblib
import pandas as pd

# 1. Load the saved model
model = joblib.load('best_fare_prediction_model.pkl')

# 2. (Optional) Test on your existing test set
# Make predictions
y_pred = model.predict(X_test)

# Compare predictions with actual values
for actual, pred in zip(y_test[:10], y_pred[:10]):
    print(f"🎯 Actual: {actual:.2f} | 🔮 Predicted: {pred:.2f}")

🎯 Actual: 19.10 | 🔮 Predicted: 11.01
🎯 Actual: 13.50 | 🔮 Predicted: 12.56
🎯 Actual: 9.30 | 🔮 Predicted: 13.06
🎯 Actual: 53.40 | 🔮 Predicted: 57.85
🎯 Actual: 28.90 | 🔮 Predicted: 18.01
🎯 Actual: 16.30 | 🔮 Predicted: 15.73
🎯 Actual: 24.00 | 🔮 Predicted: 16.57
🎯 Actual: 18.40 | 🔮 Predicted: 14.97
🎯 Actual: 27.50 | 🔮 Predicted: 28.13
🎯 Actual: 70.00 | 🔮 Predicted: 69.97


In [36]:
# 3. Create a sample input row (replace with actual values)
sample_input = pd.DataFrame([{
    'trip_distance': 2.5,
    'passenger_count': 1,
    'pick_hour': 14,
    'pick_day': 12,
    'pick_month': 5,
    'pick_weekday': 'Mon',
    'vendor_name': 'VTS',
    'rate_code_description': 'Standard rate',
    'payment_type_description': 'Credit card',
    'pickup_borough': 'Manhattan',
    'pickup_zone': 'Midtown Center',
    'dropoff_borough': 'Manhattan',
    'dropoff_zone': 'Upper East Side South'
}])

In [38]:

# 4. Predict fare
predicted_fare = model.predict(sample_input)
print(f"💰 Predicted Fare Amount: ${predicted_fare[0]:.2f}")

💰 Predicted Fare Amount: $16.57
