In [None]:
import numpy as np
import pandas as pd
import data_clean_utils
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, MinMaxScaler, PowerTransformer, OrdinalEncoder
from sklearn.model_selection import train_test_split
import joblib

In [None]:
import dagshub
# Connect this notebook to the DagsHub repository. mlflow=True is passed so that
# DagsHub also prepares MLflow tracking for the repo (see the dagshub client docs).
dagshub.init(repo_owner='Ranjeet-Kumar60', repo_name='swiggy-delivery-time-prediction1', mlflow=True)

In [None]:
import mlflow

In [None]:
# Point MLflow at the DagsHub-hosted tracking server of this repository, so the
# runs started further down are recorded there.

mlflow.set_tracking_uri("https://dagshub.com/Ranjeet-Kumar60/swiggy-delivery-time-prediction1.mlflow")

In [None]:
# Select the MLflow experiment that the runs started below are logged under.

mlflow.set_experiment("Exp 4 - LGBM Hyperparameters Tuning")

In [None]:
from sklearn import set_config

# Ask scikit-learn transformers to return pandas DataFrames instead of NumPy
# arrays. Later cells rely on this (e.g. calling .values on a transformer output).
set_config(transform_output="pandas")

# Load the Data

In [None]:
# Read the raw delivery records from disk and preview them.
raw_data_path = "../data/raw/swiggy.csv"
df = pd.read_csv(raw_data_path)
df

# Clean Data

In [None]:
# Run the project's cleaning routine on the raw frame; the result is the
# working dataset for every step below.
swiggy_cleaned = data_clean_utils.perform_data_cleaning(df)

In [None]:
# preview the cleaned data
swiggy_cleaned

In [None]:
# list the columns available after cleaning
swiggy_cleaned.columns

In [None]:
# drop columns not required for model input

columns_to_drop =  ['rider_id',
                    'restaurant_latitude',
                    'restaurant_longitude',
                    'delivery_latitude',
                    'delivery_longitude',
                    'order_date',
                    "order_time_hour",
                    "order_day",
                    "city_name",
                    "order_day_of_week",
                    "order_month"]

# Rebind to a new frame instead of dropping in place: an in-place drop mutates
# the object that was already displayed above and makes this cell raise
# KeyError when it is run a second time. errors="ignore" turns a re-run into a
# no-op; the trade-off is that a misspelled column name is skipped silently.
swiggy_cleaned = swiggy_cleaned.drop(columns=columns_to_drop, errors="ignore")

swiggy_cleaned

In [None]:
# check for missing values: number of NaN entries in each remaining column

swiggy_cleaned.isna().sum()

In [None]:
# check for duplicates: count of rows that exactly repeat an earlier row

swiggy_cleaned.duplicated().sum()

In [None]:
import missingno as msno

# Visualise where the missing values sit across rows and columns.
msno.matrix(swiggy_cleaned)

In [None]:
# columns that have missing values

# Boolean Series indexed by column name: True where the column holds any NaN.
has_missing = swiggy_cleaned.isna().any(axis=0)

# Keep only the True entries; their index labels are the affected columns.
missing_cols = has_missing[has_missing].index

missing_cols

# Drop Missing values

In [None]:
# Keep only fully observed rows. dropna() already returns a new DataFrame, so
# swiggy_cleaned is left untouched and no explicit .copy() is needed first.
temp_df = swiggy_cleaned.dropna()

In [None]:
# Separate the prediction target from the input features.

target_col = 'time_taken'

y = temp_df[target_col]
X = temp_df.drop(columns=target_col)

X

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the (rows, columns) shape of each split. Both messages now say
# "shape", matching what is actually printed.
print("The shape of train data is",X_train.shape)
print("The shape of test data is",X_test.shape)

In [None]:
# missing values in train data
# (X_train derives from temp_df, which had rows with NaN dropped, so every
# count here is expected to be 0 -- this is a sanity check)

X_train.isna().sum()

In [None]:
# Transform the target column. The transformer is fitted on the training
# target only and then applied to the test target.

pt = PowerTransformer()

# scikit-learn transformers expect 2-D input, so reshape to a single column
y_train_2d = y_train.values.reshape(-1,1)
y_test_2d = y_test.values.reshape(-1,1)

y_train_pt = pt.fit_transform(y_train_2d)
y_test_pt = pt.transform(y_test_2d)

In [None]:
# columns that contained missing values before the rows with NaN were dropped
missing_cols

In [None]:
# percentage of rows in data having missing values

# True for every training row that contains at least one NaN
rows_with_missing = X_train.isna().any(axis=1)

# the mean of a boolean Series is the fraction of True values
rows_with_missing.mean().round(2) * 100


# Pre-Processing Pipeline

In [None]:
# Feature groups consumed by the preprocessing ColumnTransformer.

# numeric features (scaled)
num_cols = [
    "age",
    "ratings",
    "pickup_time_minutes",
    "distance",
]

# unordered categorical features (one-hot encoded)
nominal_cat_cols = [
    'weather',
    'type_of_order',
    'type_of_vehicle',
    "festival",
    "city_type",
    "is_weekend",
    "order_time_of_day",
]

# categorical features with a natural order (ordinal encoded)
ordinal_cat_cols = [
    "traffic",
    "distance_type",
]

In [None]:
# Category order for each ordinal column, listed from lowest to highest.
# These lists are passed to OrdinalEncoder(categories=...) below, in the same
# order as ordinal_cat_cols (traffic first, then distance_type).

traffic_order = ["low","medium","high","jam"]

distance_type_order = ["short","medium","long","very_long"]

In [None]:
# unique categories in the ordinal columns (to compare against the orders
# declared for the encoder)

for ordinal_col in ordinal_cat_cols:
    observed_values = X_train[ordinal_col].unique()
    print(ordinal_col,observed_values)

In [None]:
# build a preprocessor

# One transformer per feature group.
numeric_scaler = MinMaxScaler()

nominal_encoder = OneHotEncoder(
    drop="first",
    handle_unknown="ignore",
    sparse_output=False,
)

# Missing values are encoded as -999 and categories unseen during fit as -1,
# so both stay distinguishable from the regular 0..n-1 codes.
ordinal_encoder = OrdinalEncoder(
    categories=[traffic_order, distance_type_order],
    encoded_missing_value=-999,
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)

# Columns not named in any group are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("scale", numeric_scaler, num_cols),
        ("nominal_encode", nominal_encoder, nominal_cat_cols),
        ("ordinal_encode", ordinal_encoder, ordinal_cat_cols),
    ],
    remainder="passthrough",
    n_jobs=-1,
    force_int_remainder_cols=False,
    verbose_feature_names_out=False,
)

preprocessor

In [None]:
# Wrap the preprocessor in a Pipeline (currently its only step).

pipeline_steps = [("preprocess", preprocessor)]
processing_pipeline = Pipeline(steps=pipeline_steps)

processing_pipeline

In [None]:
# do data preprocessing
# Fit the preprocessing on the training split only, then apply the fitted
# transformers to the test split, so no test-set statistics enter the
# scaling or encoding.

X_train_trans = processing_pipeline.fit_transform(X_train)

X_test_trans = processing_pipeline.transform(X_test)

In [None]:
# inspect the transformed training features
X_train_trans

In [None]:
from lightgbm import LGBMRegressor
import optuna

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.compose import TransformedTargetRegressor

In [None]:
def objective(trial):
    """Optuna objective: score one LightGBM hyperparameter set by 5-fold CV.

    Each call opens its own MLflow run (nested=True), logs the sampled
    parameters and the resulting score, and returns that score to Optuna.
    The regressor is wrapped in a TransformedTargetRegressor with the
    notebook-level PowerTransformer ``pt``, so it is trained on the transformed
    target while the error is measured on the original ``y_train`` scale.

    Reads the notebook globals ``X_train_trans``, ``y_train`` and ``pt``.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        Mean absolute error averaged over the 5 cross-validation folds
        (lower is better).
    """
    with mlflow.start_run(nested=True):
        params = {
            "n_estimators": trial.suggest_int("n_estimators",10,200),
            "max_depth": trial.suggest_int("max_depth",1,40),
            "learning_rate": trial.suggest_float("learning_rate",0.1,0.8),
            # NOTE(review): LightGBM applies `subsample` only when
            # `subsample_freq` is > 0, and it is left at its default here --
            # confirm this parameter actually influences the trials.
            "subsample": trial.suggest_float("subsample",0.5,1),
            "min_child_weight": trial.suggest_int("min_child_weight",5,20),
            "min_split_gain": trial.suggest_float("min_split_gain",0,10),
            "reg_lambda": trial.suggest_float("reg_lambda",0,100),
            # fixed (not tuned) settings shared by every trial
            "random_state": 42,
            "n_jobs": -1,
        }

        # log model parameters
        mlflow.log_params(params)

        lgbm_reg = LGBMRegressor(**params)
        model = TransformedTargetRegressor(regressor=lgbm_reg,transformer=pt)

        # cross_val_score clones and fits the estimator on every fold itself,
        # so no separate fit is needed here. The previous extra fit and the
        # train/test predictions were never used, and they touched the test
        # set inside the tuning loop.
        cv_score = cross_val_score(model,
                                X_train_trans,
                                y_train,
                                cv=5,
                                scoring="neg_mean_absolute_error",
                                n_jobs=-1)

        # the scorer returns negated MAE; flip the sign to get the error
        mean_score = -(cv_score.mean())
        # log avg cross val error
        mlflow.log_metric("cross_val_error",mean_score)

        return mean_score

In [None]:
# create optuna study (the objective is a cross-validated MAE, so minimise it)
study = optuna.create_study(direction="minimize")

with mlflow.start_run(run_name="best_model"):
    # optimize the objective function
    # NOTE(review): n_jobs=-1 makes Optuna run trials in worker threads, while
    # objective() opens its MLflow run with nested=True. Confirm with the
    # installed MLflow version that those runs are attached to this parent
    # run, and that cross_val_score / LightGBM also using n_jobs=-1 does not
    # oversubscribe the CPU.
    study.optimize(objective, n_trials=50, n_jobs=-1, show_progress_bar=True)

    # log the best parameters
    mlflow.log_params(study.best_params)

    # log the best score
    mlflow.log_metric("best_score", study.best_value)

    # Train the model on the best parameters. study.best_params holds only the
    # values suggested through the trial, so the fixed settings every trial
    # used in objective() (random_state=42, n_jobs=-1) are re-applied here.
    # Without them the refit would not be the configuration that was tuned.
    best_lgbm = LGBMRegressor(**study.best_params, random_state=42, n_jobs=-1)

    # np.asarray accepts both a DataFrame (pandas transform output) and a
    # plain ndarray, so this no longer depends on the global set_config call.
    best_lgbm.fit(X_train_trans, np.asarray(y_train_pt).ravel())

    # get the predictions (still on the power-transformed target scale)
    y_pred_train = best_lgbm.predict(X_train_trans)
    y_pred_test = best_lgbm.predict(X_test_trans)

    # map the predictions back to the original target scale
    y_pred_train_org = pt.inverse_transform(y_pred_train.reshape(-1, 1))
    y_pred_test_org = pt.inverse_transform(y_pred_test.reshape(-1, 1))

    # Perform cross validation. cross_val_score works on clones, so the fitted
    # best_lgbm above is not refit or modified by it.
    model = TransformedTargetRegressor(regressor=best_lgbm,
                                       transformer=pt)

    scores = cross_val_score(model,
                             X_train_trans,
                             y_train,
                             scoring="neg_mean_absolute_error",
                             cv=5, n_jobs=-1)

    # log metrics (all measured on the original target scale)
    mlflow.log_metric("training_error", mean_absolute_error(y_train, y_pred_train_org))
    mlflow.log_metric("test_error", mean_absolute_error(y_test, y_pred_test_org))
    mlflow.log_metric("training_r2", r2_score(y_train, y_pred_train_org))
    mlflow.log_metric("test_r2", r2_score(y_test, y_pred_test_org))
    mlflow.log_metric("cross_val", -scores.mean())

    # Save the model locally, then upload the file as a run artifact.
    # NOTE(review): best_lgbm predicts on the power-transformed scale, and
    # neither `pt` nor the preprocessing pipeline is stored with it -- confirm
    # that consumers of this artifact can obtain them elsewhere.
    model_path = "best_lgbm.pkl"
    joblib.dump(best_lgbm, model_path)

    # log as artifact (works with DagsHub MLflow)
    mlflow.log_artifact(model_path, artifact_path="model")

In [None]:
# hyperparameters sampled in the best trial
study.best_params

In [None]:
# objective value of the best trial (mean cross-validated MAE returned by objective)
study.best_value

In [None]:
from optuna.visualization.matplotlib import plot_optimization_history

# Plot the objective value across trials to see how the search progressed.
plot_optimization_history(study)
