In [18]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

In [22]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")
df

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id,target
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2023-01-29,2,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-01-30,2,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-01-31,2,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-02-01,2,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-02-02,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87615,25,14,5,3,7,16,53,133,126,136,...,62,62,58,50,48,42,37,2023-12-27,263,12
87616,30,7,9,6,5,23,58,123,136,108,...,64,79,65,71,72,75,35,2023-12-28,263,19
87617,50,26,17,9,8,11,43,116,137,132,...,81,78,60,85,63,62,37,2023-12-29,263,38
87618,117,88,39,19,14,12,27,37,70,97,...,84,75,100,98,88,77,69,2023-12-30,263,59


In [24]:
from datetime import datetime

from src.data_utils import split_time_series_data

X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2023, 9, 1, 0, 0, 0),
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(55900, 674)
(55900,)
(31720, 674)
(31720,)


In [26]:
past_ride_columns = [c for c in X_train.columns if c.startswith("rides_")]
X_train_only_numeric = X_train[past_ride_columns]
X_test_only_numeric = X_test[past_ride_columns]

In [28]:
def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    last_4_weeks_columns = [
            f"rides_t-{7*24}",  # 1 week ago
            f"rides_t-{14*24}", # 2 weeks ago
            f"rides_t-{21*24}", # 3 weeks ago
            f"rides_t-{28*24}"  # 4 weeks ago
        ]

        # Ensure the required columns exist in the test DataFrame
    for col in last_4_weeks_columns:
        if col not in X.columns:
            raise ValueError(f"Missing required column: {col}")

    # Calculate the average of the last 4 weeks
    X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)

    return X

In [30]:
from sklearn.preprocessing import FunctionTransformer

add_feature_average_rides_last_4_weeks = FunctionTransformer(
    average_rides_last_4_weeks, validate=False
)

In [32]:
add_feature_average_rides_last_4_weeks.fit_transform(X_train)

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id,average_rides_last_4_weeks
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2023-01-29,2,0.00
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-01-30,2,0.00
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-01-31,2,0.00
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-02-01,2,0.00
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-02-02,2,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55895,90,64,43,20,13,11,16,29,48,61,...,91,97,101,80,94,100,69,2023-08-27,263,73.25
55896,18,12,5,5,4,22,48,74,95,109,...,80,80,72,91,67,53,26,2023-08-28,263,15.75
55897,12,11,10,5,4,15,44,97,118,107,...,79,84,69,80,57,37,20,2023-08-29,263,12.25
55898,26,11,8,0,8,10,44,97,126,121,...,92,91,95,72,68,63,30,2023-08-30,263,24.75


In [34]:
from sklearn.base import BaseEstimator, TransformerMixin

class TemporalFeatureEngineer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_["hour"] = X_["pickup_hour"].dt.hour
        X_["day_of_week"] = X_["pickup_hour"].dt.dayofweek

        return X_.drop(columns=["pickup_hour", "location_id"])

In [36]:
add_temporal_features = TemporalFeatureEngineer()
add_temporal_features.fit_transform(X_train)

KeyError: "['location_id'] not found in axis"

In [None]:
import lightgbm as lgb

from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    add_temporal_features,
    lgb.LGBMRegressor()
)



In [None]:
X_train

In [None]:
pipeline.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_absolute_error
predictions = pipeline.predict(X_test)

In [None]:
test_mae = mean_absolute_error(y_test, predictions)
print(f"{test_mae:.4f}")

In [None]:
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
from dotenv import load_dotenv
import os
load_dotenv() 

mlflow = set_mlflow_tracking()
log_model_to_mlflow(pipeline, X_test, "LGBMRegressorWFE", "mean_absolute_error", score=test_mae)

In [40]:
# Enable autoreload for code updates
%load_ext autoreload
%autoreload 2

import sys
import os
import mlflow
import mlflow.lightgbm
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from dotenv import load_dotenv

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

# Load dataset
from src.config import TRANSFORMED_DATA_DIR
from src.data_utils import split_time_series_data
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow

df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")

# Train-test split
X_train, y_train, X_test, y_test = split_time_series_data(
    df, cutoff_date=datetime(2023, 9, 1, 0, 0, 0), target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

# Select numeric features
past_ride_columns = [c for c in X_train.columns if c.startswith("rides_")]
X_train_only_numeric = X_train[past_ride_columns].astype("float64")
X_test_only_numeric = X_test[past_ride_columns].astype("float64")

# Feature Engineering: Average rides over last 4 weeks
def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    last_4_weeks_columns = [
        f"rides_t-{7*24}",  # 1 week ago
        f"rides_t-{14*24}", # 2 weeks ago
        f"rides_t-{21*24}", # 3 weeks ago
        f"rides_t-{28*24}"  # 4 weeks ago
    ]

    # Ensure the required columns exist in the test DataFrame
    for col in last_4_weeks_columns:
        if col not in X.columns:
            raise ValueError(f"Missing required column: {col}")

    # Calculate the average of the last 4 weeks
    X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)

    return X

# Create transformer
add_feature_average_rides_last_4_weeks = FunctionTransformer(
    average_rides_last_4_weeks, validate=False
)

# Temporal Feature Engineering
from sklearn.base import BaseEstimator, TransformerMixin

class TemporalFeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_["hour"] = X_["pickup_hour"].dt.hour
        X_["day_of_week"] = X_["pickup_hour"].dt.dayofweek

        # Drop only columns that exist in the DataFrame
        cols_to_drop = ["pickup_hour", "location_id"]
        existing_cols = [col for col in cols_to_drop if col in X_.columns]

        return X_.drop(columns=existing_cols, errors="ignore")  # `errors="ignore"` prevents crashes




# Set up MLflow tracking
load_dotenv() 
mlflow = set_mlflow_tracking()
mlflow.set_experiment("NYC_Taxi_LightGBM_Feature_Engineering")

# Step 1: Tune learning_rate
learning_rates = [0.01, 0.05, 0.1, 0.2, 0.3]
best_mae = float("inf")
best_lr = None

for lr in learning_rates:
    with mlflow.start_run(nested=True):  # Nested run for MLflow logging
        model = lgb.LGBMRegressor(learning_rate=lr, num_leaves=31, n_estimators=100)
        
        pipeline = make_pipeline(
            add_feature_average_rides_last_4_weeks,
            add_temporal_features,
            model
        )
        
        pipeline.fit(X_train, y_train)
        predictions = pipeline.predict(X_test)
        test_mae = mean_absolute_error(y_test, predictions)

        mlflow.log_param("learning_rate", lr)
        mlflow.log_metric("test_MAE", test_mae)

        print(f"Learning Rate: {lr} | MAE: {test_mae}")

        if test_mae < best_mae:
            best_mae = test_mae
            best_lr = lr

print(f"Best Learning Rate: {best_lr}")

# Step 2: Tune num_leaves & n_estimators with best learning_rate
num_leaves_options = [31, 50, 100]
n_estimators_options = [50, 100, 200]
best_model = None

for num_leaves in num_leaves_options:
    for n_estimators in n_estimators_options:
        with mlflow.start_run(nested=True):  # Nested run for MLflow logging
            model = lgb.LGBMRegressor(learning_rate=best_lr, num_leaves=num_leaves, n_estimators=n_estimators)
            
            pipeline = make_pipeline(
                add_feature_average_rides_last_4_weeks,
                add_temporal_features,
                model
            )

            pipeline.fit(X_train, y_train)
            predictions = pipeline.predict(X_test)
            test_mae = mean_absolute_error(y_test, predictions)

            mlflow.log_params({"num_leaves": num_leaves, "n_estimators": n_estimators})
            mlflow.log_metric("test_MAE", test_mae)

            print(f"Num Leaves: {num_leaves} | N Estimators: {n_estimators} | MAE: {test_mae}")

            if test_mae < best_mae:
                best_mae = test_mae
                best_model = pipeline

print("Final best MAE:", best_mae)

# Log best model to MLflow
log_model_to_mlflow(best_model, X_test, "LGBMRegressorWFE", "mean_absolute_error", score=best_mae)

# Feature Importance Plot
plt.figure(figsize=(10, 6))
lgb.plot_importance(best_model.named_steps["lgbmregressor"], importance_type="gain")
plt.title("Feature Importance")
plt.show()


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


INFO:src.experiment_utils:MLflow tracking URI set to: https://dagshub.com/Sudeepthi-Rongali/Taxi_NYC.mlflow


(55900, 674)
(55900,)
(31720, 674)
(31720,)
🏃 View run learned-mink-493 at: https://dagshub.com/Sudeepthi-Rongali/Taxi_NYC.mlflow/#/experiments/3/runs/ae43a18438634a8da871d4ff85a959e5
🧪 View experiment at: https://dagshub.com/Sudeepthi-Rongali/Taxi_NYC.mlflow/#/experiments/3


KeyError: "['location_id'] not found in axis"