# Test With All Features

In [1]:
import pandas as pd
import numpy as np

In [13]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv("../artifacts/raw/raw.csv")

In [4]:
X = df.drop(columns=["Delivery_Time", "Order_ID"])
y = df["Delivery_Time"]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [7]:
cat_cols = X_train.select_dtypes(include=["object"]).columns.tolist()
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns.tolist()

In [8]:
print("Categorical Features:", cat_cols)
print("Numeric Features:", num_cols)

Categorical Features: ['Order_Date', 'Order_Time', 'Pickup_Time', 'Weather', 'Traffic', 'Vehicle', 'Area', 'Category', 'DayOfWeek']
Numeric Features: ['Agent_Age', 'Agent_Rating', 'Store_Latitude', 'Store_Longitude', 'Drop_Latitude', 'Drop_Longitude', 'Distance_km', 'Order_Hour', 'Day_Of_Week', 'Pickup_Delay']


In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols)
    ]
)

In [10]:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "XGBoost": XGBRegressor(objective="reg:squarederror", eval_metric="rmse"),
    "Support Vector Regressor": SVR(),
    "KNN Regressor": KNeighborsRegressor(),
    "LGBMRegressor" : LGBMRegressor()
}

In [11]:
results = []

In [16]:
for name, model in models.items():
    # Create pipeline (encoding + model)
    pipe = Pipeline(steps=[("preprocessor", preprocessor),
                           ("model", model)])
    
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)

    rmse = root_mean_squared_error(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    results.append([name, rmse, mae, r2])



In [17]:
results_df = pd.DataFrame(results, columns=["Model", "RMSE", "MAE", "R2 Score"])
print("\nModel Comparison:\n")
print(results_df.sort_values(by="RMSE"))


Model Comparison:

                       Model       RMSE        MAE  R2 Score
13             LGBMRegressor  21.979246  17.112851  0.816304
8              Random Forest  22.587027  17.340061  0.806004
10                   XGBoost  22.760143  17.729929  0.803019
9          Gradient Boosting  24.575352  19.035239  0.770346
3              Decision Tree  30.559340  22.792178  0.644891
7              Decision Tree  30.586232  22.768896  0.644266
4          Linear Regression  31.655251  25.100438  0.618965
0          Linear Regression  31.655251  25.100438  0.618965
1                      Ridge  31.660001  25.100963  0.618850
5                      Ridge  31.660001  25.100963  0.618850
6                      Lasso  34.231919  26.851554  0.554409
2                      Lasso  34.231919  26.851554  0.554409
12             KNN Regressor  46.749800  36.016493  0.168938
11  Support Vector Regressor  49.360900  39.296804  0.073512


# Test With Top 15 features

In [18]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [19]:
df = pd.read_csv("../artifacts/raw/raw.csv")

In [20]:
X = df.drop(columns=["Delivery_Time", "Order_ID"])
y = df["Delivery_Time"]

In [21]:
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

In [22]:
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols)
    ]
)

In [None]:
# 1. Encode Full Dataset

X_full = preprocessor.fit_transform(X)
feature_names = preprocessor.get_feature_names_out()


# 2. Fit Random Forest on Encoded Data

rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_full, y)

importances = rf_model.feature_importances_

feat_imp = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
})

# Get top 15 features (exact encoded names)
top_15_features = feat_imp.sort_values(by="Importance", ascending=False).head(15)["Feature"].tolist()
print("\nTop 15 Features Selected:", top_15_features)


# 3. Select Top 15 Feature Columns

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_full, y, test_size=0.2, random_state=42)

# Get indices of top features
top_idx = [list(feature_names).index(f) for f in top_15_features]

X_train_top = X_train[:, top_idx]
X_test_top = X_test[:, top_idx]


# 4. Train Models on Top 15 Features

models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "XGBoost": XGBRegressor(objective="reg:squarederror", eval_metric="rmse"),
    "Support Vector Regressor": SVR(),
    "KNN Regressor": KNeighborsRegressor(),
    "LGBMRegressor" : LGBMRegressor()
}

results = []

for name, model in models.items():
    model.fit(X_train_top, y_train)
    preds = model.predict(X_test_top)

    rmse = root_mean_squared_error(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    results.append([name, rmse, mae, r2])

results_df = pd.DataFrame(results, columns=["Model", "RMSE", "MAE", "R2 Score"])
print("\nModel Comparison (Top 15 Features):\n")
print(results_df.sort_values(by="RMSE"))



Top 15 Features Selected: ['cat__Category_Grocery', 'num__Agent_Rating', 'cat__Traffic_Low', 'num__Distance_km', 'num__Agent_Age', 'cat__Weather_Sunny', 'cat__Weather_Cloudy', 'cat__Weather_Fog', 'cat__Vehicle_motorcycle', 'cat__Traffic_Medium', 'num__Drop_Longitude', 'num__Drop_Latitude', 'cat__Traffic_Jam', 'num__Store_Longitude', 'num__Store_Latitude']

Model Comparison (Top 15 Features):

                      Model       RMSE        MAE  R2 Score
9             LGBMRegressor  22.152983  17.228092  0.813389
6                   XGBoost  22.929962  17.806622  0.800069
4             Random Forest  23.684803  18.058934  0.786689
5         Gradient Boosting  24.820469  19.290728  0.765742
3             Decision Tree  31.312717  23.079118  0.627166
1                     Ridge  32.830680  25.974115  0.590142
0         Linear Regression  32.833050  25.981877  0.590083
2                     Lasso  34.629407  27.168219  0.544001
8             KNN Regressor  42.051205  32.017364  0.327595
7  



## What the Results Say
### All Features

- Best models:

    - LGBMRegressor (RMSE ~ 21.98, R² ~ 0.816)

    - Random Forest (RMSE ~ 22.58, R² ~ 0.806)

    - XGBoost (RMSE ~ 22.76, R² ~ 0.803)

### Top 15 Features

- Best models:

    - LGBMRegressor (RMSE ~ 22.15, R² ~ 0.813)

    - XGBoost (RMSE ~ 22.93, R² ~ 0.800)

    - Random Forest (RMSE ~ 23.68, R² ~ 0.786)

### 🔹 Interpretation

- Performance is very similar between all features and top 15 features.

    - LGBM: 21.98 → 22.15 RMSE (almost no change).

    - Random Forest: 22.58 → 23.68 RMSE (slight drop).

    - XGBoost: 22.76 → 22.93 RMSE (almost identical).

👉 This means most predictive power is concentrated in the top 15 features.

### 🔹 What to Do

### 1. If you want maximum accuracy → Use all features with LGBM / RF / XGBoost.

- Slightly better performance.

- But more complex model.

### 2. If you want interpretability & efficiency → Use top 15 features.

- Model is simpler, easier to explain.

- Very small drop in accuracy (≈ 0.3–1 RMSE difference).

# Final Conclusion :- 
- Choosing Model with top 15 features because of less model complexity and lesser model size , easy to deploy because of ***lightgbm***