## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
# Preprocessing, model and metric imports (the 1 min - 2 hour trip filter is applied where the dataset is loaded)
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

## Loading Dataset

In [1]:
import pandas as pd
# Load the yellow-taxi trip records (path is relative to the notebook location).
df = pd.read_parquet('../data/yellow_tripdata_2024-01.parquet')

# Target variable: trip duration in seconds, from the pickup/dropoff timestamps.
df['trip_duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds()

# Filter trips between 1 min and 2 hours (both bounds inclusive).
# The explicit .copy() detaches the filtered frame from the original so the
# column assignments below operate on an independent DataFrame instead of a
# slice (avoids SettingWithCopyWarning and ambiguous write-through behaviour).
df = df[df['trip_duration'].between(60, 7200)].copy()
y = df['trip_duration']

# Calendar features derived from the pickup timestamp.
df['pickup_hour'] = df['tpep_pickup_datetime'].dt.hour
df['pickup_weekday'] = df['tpep_pickup_datetime'].dt.weekday  # Monday=0 ... Sunday=6
df['is_weekend'] = df['pickup_weekday'].isin([5, 6]).astype(int)

# Crude distance proxy: absolute difference between pickup and dropoff location IDs.
# NOTE(review): these look like zone identifiers rather than coordinates, so this
# is not a geographic Manhattan distance -- confirm against the data dictionary.
df['manhattan_dist'] = (df['PULocationID'] - df['DOLocationID']).abs()

KeyboardInterrupt: 

In [None]:
def time_of_day(hour):
    """Map an hour of the day to a coarse part-of-day label.

    Args:
        hour: Hour value; compared against the thresholds 6, 12 and 18.

    Returns:
        str: 'night' below 6, 'morning' below 12, 'afternoon' below 18,
        otherwise 'evening'.
    """
    # Exclusive upper bounds, checked in ascending order; the first match wins.
    for upper_bound, label in ((6, 'night'), (12, 'morning'), (18, 'afternoon')):
        if hour < upper_bound:
            return label
    return 'evening'

# Bucket every pickup hour into its part-of-day label (element-wise).
df['time_of_day'] = df['pickup_hour'].map(time_of_day)

# Columns treated as categorical when the features are encoded.
categorical_cols = [
    'VendorID',
    'RatecodeID',
    'store_and_fwd_flag',
    'time_of_day',
]

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def prepare_features(X, categorical_cols, xgboost=False):
    """
    Encode the categorical columns of a feature frame for a downstream model.

    The input frame is left untouched; all work happens on a copy.

    Args:
        X (pd.DataFrame): Feature frame containing every column named in
            ``categorical_cols``.
        categorical_cols (list): Names of the columns to encode.
        xgboost (bool): Selects the encoding scheme.
            True  -> each categorical column is cast to ``str`` and
                     integer-coded with its own ``LabelEncoder``.
            False -> one-hot encoding via ``pd.get_dummies`` with the first
                     level of each column dropped (default).

    Returns:
        pd.DataFrame or (pd.DataFrame, dict):
            - xgboost=False: the one-hot encoded DataFrame.
            - xgboost=True: ``(encoded_frame, encoders)`` where ``encoders``
              maps each column name to its fitted ``LabelEncoder``.
    """
    features = X.copy()

    # One-hot path: hand the copy to pandas and return immediately.
    if not xgboost:
        return pd.get_dummies(features, columns=categorical_cols, drop_first=True)

    # Label-encoding path: one encoder per column, kept for later reuse.
    encoders = {}
    for name in categorical_cols:
        encoder = LabelEncoder()
        as_text = features[name].astype(str)
        features[name] = encoder.fit_transform(as_text)
        encoders[name] = encoder
    return features, encoders


In [None]:
from sklearn.model_selection import train_test_split

# Assemble the feature matrix: categorical columns plus the engineered and raw
# numeric columns, then label-encode the categoricals.
X = df[categorical_cols + ['pickup_hour', 'pickup_weekday', 'is_weekend', 'manhattan_dist',
                           'passenger_count', 'PULocationID', 'DOLocationID']]
X_encoded, label_encoders = prepare_features(X, categorical_cols, xgboost=True)

# Split BEFORE scaling so the validation rows never influence the scaler's
# statistics (fitting the scaler on the full dataset leaks validation data).
X_train, X_val, y_train, y_val = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Work on independent copies so the column assignments below do not write
# into (or warn about) slices of X_encoded.
X_train = X_train.copy()
X_val = X_val.copy()

# Normalize numerical columns: fit on the training split only, then apply the
# same transformation to both splits.
numerical_cols = ['pickup_hour', 'pickup_weekday', 'manhattan_dist']
scaler = StandardScaler().fit(X_train[numerical_cols])
X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])
X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])