In [None]:
!pip install catboost

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, cross_val_score, RandomizedSearchCV,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,r2_score

import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('/content/8_final_data.csv')

In [7]:
df.head()

Unnamed: 0,Rider_age,Rider_rating,Weatherconditions,traffic_type,Vehicle_condition,multiple_deliveries,Festival,City_type,time_taken,order_hour,order_time_of_day,distance
0,37.0,4.9,Sunny,high,2,0.0,no,urban,24,11.0,Morning,3.025149
1,34.0,4.5,Stormy,jam,2,1.0,no,metropolitian,33,19.0,Evening,20.18353
2,23.0,4.4,Sandstorms,low,0,1.0,no,urban,26,8.0,Morning,1.552758
3,38.0,4.7,Sunny,medium,0,1.0,no,metropolitian,21,18.0,Afternoon,7.790401
4,32.0,4.6,Cloudy,high,1,1.0,no,metropolitian,30,13.0,Afternoon,6.210138


In [8]:
X = df.drop('time_taken', axis=1)
y = df['time_taken']

In [9]:
num = X.select_dtypes('number').columns.to_list()
cat = X.select_dtypes('object').columns.to_list()

In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(),num),
        ('cat', OrdinalEncoder(),cat)
    ],
    remainder='passthrough'
)

## 1.Decission Tree

In [None]:
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor())
])
para_grid = {
    'model__criterion': ['absolute_error'],
    'model__max_depth': [5, 10, 15, 20],
    'model__min_samples_split': [2, 5, 10],
    'model__max_features': [None, 'sqrt', 'log2'],
    'model__ccp_alpha': [0.0, 0.01, 0.05]
}
kfold = KFold(n_splits=5, shuffle=True, random_state=3)
dt_search = RandomizedSearchCV(pipe, para_grid, cv=kfold, scoring='r2',n_jobs=-1)
dt_search.fit(X,y)
dt_search.best_params_

In [None]:
dt_search.best_score_

In [None]:
dt_pipe = dt_search.best_estimator_

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
dt_pipe.fit(X_train, y_train)
y_pred = dt_pipe.predict(X_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))

## 2.Random Forest

In [None]:
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor())
])

param_grid = {
    'model__n_estimators': [100,150,200],
    'model__criterion': ['absolute_error'],
    'model__max_depth': [10, 15, 20],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['auto', 'sqrt', 'log2']
}
X_sample,_,y_sample,_ = train_test_split(X, y, train_size=15000, random_state=42)

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
rf_search = RandomizedSearchCV(pipe, param_grid, cv=kfold, scoring='r2', n_jobs=-1)
rf_search.fit(X_sample, y_sample)
rf_search.best_params_

In [None]:
rf_search.best_score_

np.float64(0.7831394988903592)

In [123]:
rf_pipe = rf_search.best_estimator_

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf_pipe.fit(X_train, y_train)
y_pred = rf_pipe.predict(X_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))

MAE: 3.276970662564554
R2: 0.799754987552062


## 3.Gradient Boosting

In [None]:
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor())
])
param_grid = {
    'model__n_estimators': [200,250, 300],
    'model__max_depth': [3, 5, 7, 10],
    'model__subsample': [0.6, 0.8, 1.0],
    'model__loss': ['absolute_error'],
}

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
gb_search = RandomizedSearchCV(pipe, param_grid, cv=kfold, scoring='r2', n_jobs=-1)
gb_search.fit(X,y)
gb_search.best_params_

In [None]:
gb_search.best_score_

np.float64(0.7879508595096777)

In [124]:
gb_pipe = gb_search.best_estimator_

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
gb_pipe.fit(X_train, y_train)
y_pred = gb_pipe.predict(X_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))

MAE: 3.2466957748142162
R2: 0.8000835243216263


## 4.Xgboost

In [48]:
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor())
])
param_grid = {
    'model__n_estimators': [150,200,250],
    'model__max_depth': [5,7,9],
    'model__reg_alpha': [0.1,0.5,0.7],
    'model__reg_lambda': [1.0,1.2,1.5],
    'model__min_child_weight': [1, 3, 5]
}

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
xg_search = GridSearchCV(pipe, param_grid, cv=kfold, scoring='neg_mean_absolute_error', n_jobs=-1)
xg_search.fit(X,y)
xg_search.best_params_

{'model__max_depth': 5,
 'model__min_child_weight': 5,
 'model__n_estimators': 150,
 'model__reg_alpha': 0.5,
 'model__reg_lambda': 1.0}

In [49]:
xg_search.best_score_

np.float64(-3.2756401538848876)

In [125]:
xgb_pipe = xg_search.best_estimator_

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=3)
xgb_pipe.fit(X_train, y_train)
y_pred = xgb_pipe.predict(X_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))

MAE: 3.235880136489868
R2: 0.8081061840057373


## 5.Lightgbm

In [None]:
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(verbose=0))
])
param_grid = {
    'model__n_estimators': [300, 500, 700],
    'model__max_depth': [5, 10, 15],
    'model__num_leaves': [31, 50, 70],
    'model__reg_alpha': [0.1,0.5,0.7],
    'model__reg_lambda': [1.0,1.2,1.5],
}

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
lg_search = RandomizedSearchCV(pipe, param_grid, cv=kfold, scoring='r2', n_jobs=-1)
lg_search.fit(X,y)
lg_search.best_params_

In [75]:
lg_search.best_score_

np.float64(0.8098084185602203)

In [126]:
lgbm_pipe = lg_search.best_estimator_

In [121]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
lgbm_pipe.fit(X_train, y_train)
y_pred = lgbm_pipe.predict(X_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001540 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 511
[LightGBM] [Info] Number of data points in the train set: 36401, number of used features: 11
[LightGBM] [Info] Start training from score 26.306448
MAE: 3.227258910414883
R2: 0.812185734771514


## 6.Catboost

In [None]:
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', CatBoostRegressor(verbose=0))
])
param_grid = {
    'model__iterations': [300, 500, 700],
    'model__depth': [4, 6, 8, 10],
    'model__l2_leaf_reg': [1, 3, 5, 7],
    'model__bagging_temperature': [0, 0.5, 1],
    'model__random_strength': [1, 5, 10],
    'model__border_count': [32, 64, 128],
}
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cat_search = RandomizedSearchCV(pipe, param_grid, cv=kfold, scoring='r2', n_jobs=-1)
cat_search.fit(X,y)
cat_search.best_params_

In [93]:
cat_search.best_score_

np.float64(0.8095620719104156)

In [94]:
cat_pipe = cat_search.best_estimator_

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
cat_pipe.fit(X_train, y_train)
y_pred = cat_pipe.predict(X_test)

In [119]:
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))

MAE: 3.175175182130006
R2: 0.8155654331193896


## 7.Ensembled model

### voting ensemble

In [14]:
from sklearn.ensemble import VotingRegressor

ensemble = VotingRegressor(
    estimators=[
        ('gb', gb_pipe),
        ('xgb', xgb_pipe),
        ('lgbm', lgbm_pipe),
        ('cat', cat_pipe)
    ],
    n_jobs=-1
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
ensemble.fit(X_train, y_train)

y_pred = ensemble.predict(X_test)
print("Voting Ensemble MAE:", mean_absolute_error(y_test, y_pred))
print("Voting Ensemble R²:", r2_score(y_test, y_pred))

Voting Ensemble MAE: 3.153682013299317
Voting Ensemble R²: 0.8181026746423183


### weighted voting

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

xgb_pipe.fit(X_train, y_train)
lgbm_pipe.fit(X_train, y_train)
cat_pipe.fit(X_train, y_train)
gb_pipe.fit(X_train, y_train)

y_xgb = xgb_pipe.predict(X_test)
y_lgb = lgbm_pipe.predict(X_test)
y_cat = cat_pipe.predict(X_test)
y_gb  = gb_pipe.predict(X_test)

final_pred = (
    0.1 * y_xgb +
    0.4 * y_lgb +
    0.4 * y_cat +
    0.1 * y_gb
)

print("Weighted Ensemble MAE:", mean_absolute_error(y_test, final_pred))
print("Weighted Ensemble R²:", r2_score(y_test, final_pred))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003592 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 515
[LightGBM] [Info] Number of data points in the train set: 36401, number of used features: 11
[LightGBM] [Info] Start training from score 26.309991
0:	learn: 9.2070235	total: 16.3ms	remaining: 8.16s
1:	learn: 9.0218686	total: 31.3ms	remaining: 7.78s
2:	learn: 8.8458287	total: 45.5ms	remaining: 7.53s
3:	learn: 8.6883888	total: 59ms	remaining: 7.32s
4:	learn: 8.5399684	total: 72.6ms	remaining: 7.18s
5:	learn: 8.3852510	total: 83.1ms	remaining: 6.84s
6:	learn: 8.2311961	total: 96.9ms	remaining: 6.83s
7:	learn: 8.1132972	total: 111ms	remaining: 6.8s
8:	learn: 7.9698379	total: 124ms	remaining: 6.77s
9:	learn: 7.8357230	total: 139ms	remaining: 6.8s
10:	learn: 7.6987786	total: 154ms	remaining: 6.83s
11:	learn: 7.5945270	total: 162ms	remain

### stacking ensemble

In [18]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV

stack = StackingRegressor(
    estimators=[
        ('gb', gb_pipe),
        ('xgb', xgb_pipe),
        ('lgbm', lgbm_pipe),
        ('cat', cat_pipe),
    ],
    final_estimator=RidgeCV(),
    cv=5,
    n_jobs=-1
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

stack.fit(X_train, y_train)
y_pred = stack.predict(X_test)
print("Stacked MAE:", mean_absolute_error(y_test, y_pred))
print("Stacked R²:", r2_score(y_test, y_pred))

Stacked MAE: 3.147495004236852
Stacked R²: 0.8189465782446451


## Final model
- we will use **weighted ensemble model**

In [21]:
xgb_pipe.fit(X,y)
lgbm_pipe.fit(X,y)
cat_pipe.fit(X,y)
gb_pipe.fit(X,y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004398 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 520
[LightGBM] [Info] Number of data points in the train set: 45502, number of used features: 11
[LightGBM] [Info] Start training from score 26.297591
0:	learn: 9.2112812	total: 52ms	remaining: 25.9s
1:	learn: 9.0239253	total: 103ms	remaining: 25.7s
2:	learn: 8.8420769	total: 138ms	remaining: 22.8s
3:	learn: 8.6824422	total: 184ms	remaining: 22.9s
4:	learn: 8.5319714	total: 227ms	remaining: 22.5s
5:	learn: 8.3762437	total: 268ms	remaining: 22.1s
6:	learn: 8.2208961	total: 327ms	remaining: 23s
7:	learn: 8.1013651	total: 377ms	remaining: 23.2s
8:	learn: 7.9481383	total: 464ms	remaining: 25.3s
9:	learn: 7.8077142	total: 519ms	remaining: 25.4s
10:	learn: 7.6698538	total: 586ms	remaining: 26s
11:	learn: 7.5659230	total: 624ms	remaining: 25.

In [23]:
with open('Xgboost.pkl', 'wb') as file:
    pickle.dump(xgb_pipe, file)
with open('Lightgbm.pkl', 'wb') as file:
    pickle.dump(lgbm_pipe, file)
with open('Catboost.pkl', 'wb') as file:
    pickle.dump(cat_pipe, file)
with open('Gradientboost.pkl', 'wb') as file:
    pickle.dump(gb_pipe, file)