## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
# Filter trips between 1 min and 2 hours
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

## Loading Dataset

In [1]:
import pandas as pd
# Load the January 2024 yellow-taxi trip file (path is relative to the notebook's directory).
df = pd.read_parquet('../data/yellow_tripdata_2024-01.parquet')

# Target variable: trip duration in seconds, from the pickup/drop-off timestamps.
df['trip_duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds()

# Keep only trips lasting between 1 minute and 2 hours (bounds inclusive).
MIN_TRIP_SECONDS = 60
MAX_TRIP_SECONDS = 7200
duration_ok = (df['trip_duration'] >= MIN_TRIP_SECONDS) & (df['trip_duration'] <= MAX_TRIP_SECONDS)
# The explicit .copy() detaches the filtered frame from the raw one, so the
# column assignments below do not raise pandas' SettingWithCopyWarning.
df = df[duration_ok].copy()
y = df['trip_duration']

# Calendar features from the pickup timestamp (weekday: Monday=0 ... Sunday=6).
df['pickup_hour'] = df['tpep_pickup_datetime'].dt.hour
df['pickup_weekday'] = df['tpep_pickup_datetime'].dt.weekday
df['is_weekend'] = df['pickup_weekday'].isin([5, 6]).astype(int)

# Crude distance proxy: absolute difference between pickup and drop-off location IDs.
# NOTE(review): this is a difference of ID numbers, not a geometric Manhattan
# distance; the column name is kept because later cells reference it.
df['manhattan_dist'] = abs(df['PULocationID'] - df['DOLocationID'])

KeyboardInterrupt: 

In [None]:
def time_of_day(hour):
    """Map an hour of the day to a coarse part-of-day label.

    Args:
        hour: Hour value to classify.

    Returns:
        str: 'night' for hours below 6, 'morning' below 12, 'afternoon'
        below 18, and 'evening' for everything else.
    """
    # Exclusive upper bounds, checked in ascending order; the first match wins.
    for upper_bound, label in ((6, 'night'), (12, 'morning'), (18, 'afternoon')):
        if hour < upper_bound:
            return label
    return 'evening'

# Columns that the encoding step treats as categorical.
categorical_cols = [
    'VendorID',
    'RatecodeID',
    'store_and_fwd_flag',
    'time_of_day',
]

# Label every trip with the part of day its pickup hour falls in.
df['time_of_day'] = df['pickup_hour'].apply(time_of_day)

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def prepare_features(X, categorical_cols, xgboost=False):
    """
    Encode the categorical columns of a feature frame for a given model family.

    The input frame is not modified; all work happens on a copy.

    Args:
        X (pd.DataFrame): Feature frame to encode.
        categorical_cols (list): Names of the columns to treat as categorical.
        xgboost (bool): When True, each categorical column is cast to ``str``
            and integer-coded with its own ``LabelEncoder``. When False
            (default), the columns are one-hot encoded with the first level
            of each column dropped.

    Returns:
        pd.DataFrame or (pd.DataFrame, dict):
            - xgboost=False: the one-hot encoded frame.
            - xgboost=True: the label-encoded frame and a dict mapping each
              categorical column name to its fitted ``LabelEncoder``.
    """
    features = X.copy()

    # One-hot path: handled first so the label-encoding loop stays unindented.
    if not xgboost:
        return pd.get_dummies(features, columns=categorical_cols, drop_first=True)

    # Label-encoding path: one encoder per column, returned alongside the frame.
    encoders = {}
    for column in categorical_cols:
        encoder = LabelEncoder()
        as_text = features[column].astype(str)
        features[column] = encoder.fit_transform(as_text)
        encoders[column] = encoder
    return features, encoders


In [None]:
from sklearn.model_selection import train_test_split

# Assemble the model inputs: the categorical columns plus the numeric features.
X = df[categorical_cols + ['pickup_hour', 'pickup_weekday', 'is_weekend', 'manhattan_dist',
                           'passenger_count', 'PULocationID', 'DOLocationID']]
X_encoded, label_encoders = prepare_features(X, categorical_cols, xgboost=True)

# Split BEFORE scaling: fitting the scaler on the full dataset would leak the
# validation rows' mean/variance into the training features.
X_train, X_val, y_train, y_val = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Work on explicit copies so the column assignments below cannot touch
# X_encoded (which therefore stays unscaled) or trigger copy warnings.
X_train = X_train.copy()
X_val = X_val.copy()

# Normalize numerical columns using statistics learned from the training split only.
numerical_cols = ['pickup_hour', 'pickup_weekday', 'manhattan_dist']
scaler = StandardScaler().fit(X_train[numerical_cols])
X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])
X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])

## XGBoost

In [None]:
# Hyperparameters for the gradient-boosted regressor. The early-stopping
# settings (rounds and metric) are given to the constructor, not to fit().
xgb_params = {
    'n_estimators': 500,
    'max_depth': 8,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:squarederror',
    'random_state': 42,
    'early_stopping_rounds': 20,
    'eval_metric': "rmse",
}
xgb_model = xgb.XGBRegressor(**xgb_params)

# The validation split is supplied as the evaluation set for early stopping.
# NOTE(review): the same split is later used to report metrics, so those
# scores may be slightly optimistic — confirm whether a separate test set exists.
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

In [None]:
from math import sqrt

# Score the fitted XGBoost model on the validation split.
y_pred = xgb_model.predict(X_val)
r2 = r2_score(y_val, y_pred)
rmse = sqrt(mean_squared_error(y_val, y_pred))  # root of the MSE

print(f"XGBoost Tuned - RMSE: {rmse:.2f}, R²: {r2:.4f}")

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

# Random forest of 100 trees, each capped at depth 8, with a fixed seed.
rf_model = RandomForestRegressor(random_state=42, max_depth=8, n_estimators=100)

# Fit the forest on the training split.
rf_model.fit(X_train, y_train)

In [None]:
# Score the fitted random forest on the validation split.
y_pred = rf_model.predict(X_val)

# R² first, then RMSE as the square root of the mean squared error.
r2 = r2_score(y_val, y_pred)
rmse = sqrt(mean_squared_error(y_val, y_pred))

# Report both metrics.
print(f"Random Forest - RMSE: {rmse:.2f}, R²: {r2:.4f}")

## Support Vector Regression (SVR)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

# Fill missing feature values with the column means learned from the training
# split ('median' and other strategies are also available).
imputer = SimpleImputer(strategy='mean').fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_val_imputed = imputer.transform(X_val)

# Linear-kernel support vector regressor.
# NOTE(review): gamma is documented as a coefficient for the 'rbf', 'poly' and
# 'sigmoid' kernels, so it presumably has no effect here — confirm.
# NOTE(review): kernel SVR fit time grows faster than quadratically with the
# number of rows; verify this is feasible on the full training split.
svr_model = SVR(
    kernel='linear',
    C=1,
    gamma='auto',
    epsilon=0.1,
)

# Fit on the imputed training features.
svr_model.fit(X_train_imputed, y_train)


In [None]:
# Predict on the validation set. The SVR was fitted on the imputed training
# array, so it must be scored on the matching imputed validation array; the
# raw X_val can still contain missing values, which SVR cannot handle.
y_pred = svr_model.predict(X_val_imputed)

# Calculate evaluation metrics (RMSE is the square root of the MSE).
rmse = sqrt(mean_squared_error(y_val, y_pred))
r2 = r2_score(y_val, y_pred)

# Print evaluation metrics
print(f"SVR - RMSE: {rmse:.2f}, R²: {r2:.4f}")

In [None]:
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt


# Mean-impute missing feature values; the imputer learns its statistics from
# the training split only ('median' and other strategies are also available).
imputer = SimpleImputer(strategy='mean').fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_val_imputed = imputer.transform(X_val)

# L2-regularised linear regression, fitted and scored in one pass.
ridge = Ridge(alpha=1.0).fit(X_train_imputed, y_train)
y_pred = ridge.predict(X_val_imputed)

r2 = r2_score(y_val, y_pred)
rmse = sqrt(mean_squared_error(y_val, y_pred))

# Report validation metrics.
print(f"Ridge - RMSE: {rmse:.2f}, R²: {r2:.4f}")


In [None]:
from sklearn.linear_model import ElasticNet

# Elastic net with an equal L1/L2 mix, trained on the imputed features.
enet = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42).fit(X_train_imputed, y_train)
y_pred = enet.predict(X_val_imputed)

# Validation metrics: R² and RMSE (square root of the MSE).
r2 = r2_score(y_val, y_pred)
rmse = sqrt(mean_squared_error(y_val, y_pred))
print(f"ElasticNet - RMSE: {rmse:.2f}, R²: {r2:.4f}")


In [None]:
from sklearn.linear_model import SGDRegressor

# Linear model trained by stochastic gradient descent with squared-error loss
# and an L2 penalty, on the imputed features.
sgd = SGDRegressor(
    loss='squared_error',
    penalty='l2',
    max_iter=1000,
    tol=1e-3,
    random_state=42,
).fit(X_train_imputed, y_train)
y_pred = sgd.predict(X_val_imputed)

# Validation metrics: R² and RMSE (square root of the MSE).
r2 = r2_score(y_val, y_pred)
rmse = sqrt(mean_squared_error(y_val, y_pred))
print(f"SGD Regressor - RMSE: {rmse:.2f}, R²: {r2:.4f}")


In [None]:
from sklearn.linear_model import PassiveAggressiveRegressor

# Passive-aggressive regressor trained on the imputed features with a fixed seed.
pa = PassiveAggressiveRegressor(
    random_state=42,
    max_iter=1000,
    tol=1e-3,
).fit(X_train_imputed, y_train)
y_pred = pa.predict(X_val_imputed)

# Validation metrics: R² and RMSE (square root of the MSE).
r2 = r2_score(y_val, y_pred)
rmse = sqrt(mean_squared_error(y_val, y_pred))
print(f"Passive Aggressive - RMSE: {rmse:.2f}, R²: {r2:.4f}")
