In [None]:
!pip install catboost

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, cross_val_score, RandomizedSearchCV,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,r2_score

In [None]:
# Read the dataset CSV into a DataFrame.
# NOTE(review): the path looks like a Colab working directory - adjust it when
# running elsewhere.
DATA_PATH = '/content/8_final_data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,Rider_age,Rider_rating,Weatherconditions,traffic_type,Vehicle_condition,multiple_deliveries,Festival,City_type,time_taken,order_hour,order_time_of_day,distance
0,37.0,4.9,Sunny,high,2,0.0,no,urban,24,11.0,Morning,3.025149
1,34.0,4.5,Stormy,jam,2,1.0,no,metropolitian,33,19.0,Evening,20.18353
2,23.0,4.4,Sandstorms,low,0,1.0,no,urban,26,8.0,Morning,1.552758
3,38.0,4.7,Sunny,medium,0,1.0,no,metropolitian,21,18.0,Afternoon,7.790401
4,32.0,4.6,Cloudy,high,1,1.0,no,metropolitian,30,13.0,Afternoon,6.210138


In [None]:
# The 'time_taken' column is the regression target; every other column is a feature.
y = df['time_taken']
X = df.drop(columns='time_taken')

In [None]:
# Names of the numeric and the object-dtype feature columns; the preprocessing
# step applies a different transformer to each group.
num = list(X.select_dtypes(include='number').columns)
cat = list(X.select_dtypes(include='object').columns)

In [None]:
# Column-wise preprocessing shared by the model pipelines:
#   * numeric columns are standardised,
#   * object-dtype columns are mapped to integer codes.
# A category that was not present when the encoder was fitted (possible inside
# a CV fold or at prediction time) is encoded as -1 instead of raising, so a
# rare category cannot make a fold fail.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), cat),
    ],
    remainder='passthrough',  # columns in neither list are passed through unchanged
)

## 1.Decision Tree

In [None]:
# Decision-tree pipeline: shared preprocessing followed by the regressor.
# The tree is seeded so that repeated runs build the same model (feature
# sub-sampling and tie-breaking between equally good splits are random).
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor(random_state=42)),
])

In [None]:
# Candidate hyper-parameters for the decision tree. The 'model__' prefix routes
# each setting to the pipeline step named 'model'.
para_grid = dict(
    model__criterion=['absolute_error'],
    model__max_depth=[5, 10, 15, 20],
    model__min_samples_split=[2, 5, 10],
    model__max_features=[None, 'sqrt', 'log2'],
    model__ccp_alpha=[0.0, 0.01, 0.05],
)

In [None]:
# 5-fold shuffled cross-validation, scored with R2.
# RandomizedSearchCV only tries a random subset of the grid (n_iter defaults to
# 10), so it is seeded as well; otherwise each run samples different candidates
# and the reported best score / parameters are not reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
search = RandomizedSearchCV(
    pipe,
    para_grid,
    cv=kfold,
    scoring='r2',
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Run the randomized search; cross-validation is done over the full X / y.
search.fit(X,y)

In [None]:
# Mean cross-validated score (R2 for this search) of the best candidate.
search.best_score_

np.float64(0.7424934082757398)

In [None]:
# Parameter setting that achieved the best cross-validated score.
search.best_params_

{'model__min_samples_split': 5,
 'model__max_features': None,
 'model__max_depth': 15,
 'model__criterion': 'absolute_error',
 'model__ccp_alpha': 0.0}

In [None]:
# Keep the winning pipeline (by default the search refits it on all data passed to fit).
dt_pipe = search.best_estimator_

In [None]:
import pickle

# Persist the tuned decision-tree pipeline to disk.
with open('dt_pipe.pkl', mode='wb') as fh:
    pickle.dump(dt_pipe, fh)

In [None]:
# Reload the decision-tree pipeline from disk.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('dt_pipe.pkl', mode='rb') as fh:
    dt_pipe = pickle.load(fh)

In [None]:
# Hold-out check: refit the tuned pipeline on 80% of the rows and score it on
# the remaining 20%.
# NOTE(review): the hyper-parameters were chosen by cross-validation over all
# of X, which includes these test rows, so the scores may be slightly optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
dt_pipe.fit(X_train, y_train)
y_pred = dt_pipe.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MAE:", mae)
print("R2:", r2)

MAE: 3.59213273266674
R2: 0.7365880770999891


## 2.Random Forest

In [None]:
# Random-forest pipeline: shared preprocessing followed by the regressor.
# The forest is seeded so repeated runs are reproducible.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42)),
])

param_grid = {
    'model__n_estimators': [100, 150, 200],
    'model__criterion': ['absolute_error'],
    'model__max_depth': [10, 15, 20],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    # 'auto' is no longer accepted by scikit-learn (deprecated in 1.1, removed
    # in 1.3), so every candidate using it failed to fit. For regressors it
    # meant "consider all features", which is spelled 1.0.
    'model__max_features': [1.0, 'sqrt', 'log2'],
}

# Tune on a 15,000-row subsample of the data (presumably to limit search time).
X_sample, _, y_sample, _ = train_test_split(X, y, train_size=15000, random_state=42)

# 5-fold shuffled CV scored with R2. The randomized search is seeded so the
# same candidates are sampled on every run.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
search = RandomizedSearchCV(
    pipe,
    param_grid,
    cv=kfold,
    scoring='r2',
    n_jobs=-1,
    random_state=42,
)
search.fit(X_sample, y_sample)
search.best_params_

In [None]:
# Mean cross-validated score (R2 for this search) of the best candidate.
search.best_score_

np.float64(0.7831394988903592)

In [None]:
# Keep the winning pipeline (by default the search refits it on all data passed to fit).
rf_pipe = search.best_estimator_

In [None]:
# Persist the tuned random-forest pipeline to disk.
with open('rf_pipe.pkl', mode='wb') as fh:
    pickle.dump(rf_pipe, fh)

In [None]:
# Reload the random-forest pipeline from disk.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('rf_pipe.pkl', mode='rb') as fh:
    rf_pipe = pickle.load(fh)

In [None]:
# Hold-out check: refit the tuned pipeline on 80% of the rows and score it on
# the remaining 20%.
# NOTE(review): the tuning subsample was drawn from all of X, so it may contain
# some of these test rows; the scores may be slightly optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
rf_pipe.fit(X_train, y_train)
y_pred = rf_pipe.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MAE:", mae)
print("R2:", r2)

MAE: 3.276970662564554
R2: 0.799754987552062


## 3.Gradient Boosting

In [None]:
# Gradient-boosting pipeline: shared preprocessing followed by the regressor.
# With subsample < 1.0 the booster draws random row samples, so it is seeded
# to make repeated runs reproducible.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor(random_state=42)),
])
param_grid = {
    'model__n_estimators': [200, 250, 300],
    'model__max_depth': [3, 5, 7, 10],
    'model__subsample': [0.6, 0.8, 1.0],
    'model__loss': ['absolute_error'],
}

# 5-fold shuffled CV scored with R2. The randomized search is seeded so the
# same candidates are sampled on every run.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
search = RandomizedSearchCV(
    pipe,
    param_grid,
    cv=kfold,
    scoring='r2',
    n_jobs=-1,
    random_state=42,
)
search.fit(X, y)
search.best_params_

In [None]:
# Mean cross-validated score (R2 for this search) of the best candidate.
search.best_score_

np.float64(0.7879508595096777)

In [None]:
# Keep the winning pipeline (by default the search refits it on all data passed to fit).
gb_pipe = search.best_estimator_

In [None]:
# Persist the tuned gradient-boosting pipeline to disk.
with open('gb_pipe.pkl', mode='wb') as fh:
    pickle.dump(gb_pipe, fh)

In [None]:
# Reload the gradient-boosting pipeline from disk.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('gb_pipe.pkl', mode='rb') as fh:
    gb_pipe = pickle.load(fh)

In [None]:
# Hold-out check: refit the tuned pipeline on 80% of the rows and score it on
# the remaining 20%.
# NOTE(review): the hyper-parameters were chosen by cross-validation over all
# of X, which includes these test rows, so the scores may be slightly optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
gb_pipe.fit(X_train, y_train)
y_pred = gb_pipe.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MAE:", mae)
print("R2:", r2)

MAE: 3.2466957748142162
R2: 0.8000835243216263


## 4.XGBoost

In [None]:
# XGBoost pipeline: shared preprocessing followed by the regressor.
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor()),
])
# Exhaustive grid: 3 values for each of 5 parameters = 243 candidates.
param_grid = dict(
    model__n_estimators=[150, 200, 250],
    model__max_depth=[5, 7, 9],
    model__reg_alpha=[0.1, 0.5, 0.7],
    model__reg_lambda=[1.0, 1.2, 1.5],
    model__min_child_weight=[1, 3, 5],
)

# 5-fold shuffled CV; unlike the earlier searches this one is scored on
# negated mean absolute error rather than R2.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=kfold,
    n_jobs=-1,
)
search.fit(X, y)
search.best_params_

{'model__max_depth': 5,
 'model__min_child_weight': 5,
 'model__n_estimators': 150,
 'model__reg_alpha': 0.5,
 'model__reg_lambda': 1.0}

In [None]:
# Best mean cross-validated score. This search uses 'neg_mean_absolute_error',
# so the value is the negated MAE (closer to zero is better).
search.best_score_

np.float64(-3.2756401538848876)

In [None]:
# Keep the winning pipeline (by default the search refits it on all data passed to fit).
xgb_pipe = search.best_estimator_

In [None]:
# Persist the tuned XGBoost pipeline to disk.
with open('xgb_pipe.pkl', mode='wb') as fh:
    pickle.dump(xgb_pipe, fh)

In [None]:
# Reload the XGBoost pipeline from disk.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('xgb_pipe.pkl', mode='rb') as fh:
    xgb_pipe = pickle.load(fh)

In [None]:
# Hold-out check: refit the tuned pipeline on 80% of the rows and score it on
# the remaining 20%.
# The split uses random_state=42, the same seed as the other model evaluations
# in this notebook, so every model is scored on the same test rows and the
# MAE / R2 figures are directly comparable (this cell previously used seed 3).
# NOTE(review): the hyper-parameters were chosen by cross-validation over all
# of X, which includes these test rows, so the scores may be slightly optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
xgb_pipe.fit(X_train, y_train)
y_pred = xgb_pipe.predict(X_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))

MAE: 3.235880136489868
R2: 0.8081061840057373
