In [None]:
import pandas as pd

In [2]:
# Load the dataset stored as 'data_before_modeling.csv' (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='data_before_modeling.csv')

In [3]:
# Lift the column display limit so wide frames are rendered without truncation.
pd.set_option('display.max_columns', None)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weatherconditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken(min),distance_km,distance_bin,Order_Datetime,Pickup_Datetime,prep_time_min,order_hour,order_day,is_weekend
0,0x4607,INDORES13DEL02,37,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,11:33:33,11:45:29,sunny,high,2,snack,motorcycle,0,no,urban,24,3.03,0-5,2022-03-19 11:33:33,2022-03-19 11:45:29,11.93,11,5,1
1,0xb379,BANGRES18DEL02,34,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,19:45:37,19:51:49,stormy,jam,2,snack,scooter,1,no,metropolitian,33,20.18,20-25,2022-03-25 19:45:37,2022-03-25 19:51:49,6.2,19,4,0
2,0x5d6d,BANGRES19DEL01,23,4.4,12.914264,77.6784,12.924264,77.6884,2022-03-19,08:32:58,08:48:47,sandstorms,low,0,drinks,motorcycle,1,no,urban,26,1.55,0-5,2022-03-19 08:32:58,2022-03-19 08:48:47,15.82,8,5,1
3,0x7a6a,COIMBRES13DEL02,38,4.7,11.003669,76.976494,11.053669,77.026494,2022-04-05,18:03:58,18:12:52,sunny,medium,0,buffet,motorcycle,1,no,metropolitian,21,7.79,5-10,2022-04-05 18:03:58,2022-04-05 18:12:52,8.9,18,1,0
4,0x70a2,CHENRES12DEL01,32,4.6,12.972793,80.249982,13.012793,80.289982,2022-03-26,13:34:16,13:45:36,cloudy,high,1,snack,scooter,1,no,metropolitian,30,6.21,5-10,2022-03-26 13:34:16,2022-03-26 13:45:36,11.33,13,5,1


In [4]:
# Remove the columns that are not used as model inputs: identifiers, raw
# coordinates, raw date/time fields, combined timestamps, and the binned distance.
df = df.drop(
    columns=[
        # identifiers
        'ID',
        'Delivery_person_ID',
        # raw coordinates
        'Restaurant_latitude',
        'Restaurant_longitude',
        'Delivery_location_latitude',
        'Delivery_location_longitude',
        # raw date/time fields
        'Order_Date',
        'Time_Orderd',
        'Time_Order_picked',
        # combined timestamps
        'Order_Datetime',
        'Pickup_Datetime',
        # binned distance
        'distance_bin',
    ]
)

In [5]:
# Show the dtype of every remaining column; the next cell relies on this
# distinction (object columns are encoded, int64/float64 columns are scaled).
df.dtypes

Delivery_person_Age          int64
Delivery_person_Ratings    float64
Weatherconditions           object
Road_traffic_density        object
Vehicle_condition            int64
Type_of_order               object
Type_of_vehicle             object
multiple_deliveries          int64
Festival                    object
City                        object
Time_taken(min)              int64
distance_km                float64
prep_time_min              float64
order_hour                   int64
order_day                    int64
is_weekend                   int64
dtype: object

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

# Separate the feature matrix from the regression target.
X = df.drop('Time_taken(min)', axis=1)
y = df['Time_taken(min)']

# Column groups are chosen by dtype: object columns are treated as categorical,
# int64/float64 columns as numerical.
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Fit the preprocessors on the training rows only, so that no category list or
# mean/variance statistic is learned from rows that are later used for testing.
# With the default shuffling, train_test_split derives its partition from the
# number of rows and random_state alone, so these row positions match the split
# performed later on (X, y) as long as test_size and random_state are kept
# identical in both places.
_train_rows, _ = train_test_split(
    list(range(len(X))), test_size=0.2, random_state=42
)
_X_fit = X.iloc[_train_rows]  # copy of the untransformed training rows

# Ordinal-encode the categorical features; values not seen while fitting are
# mapped to -1 instead of raising.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoder.fit(_X_fit[categorical_features])
X[categorical_features] = encoder.transform(X[categorical_features])

# Standardise the numerical features using training-row statistics.
scaler = StandardScaler()
scaler.fit(_X_fit[numerical_features])
X[numerical_features] = scaler.transform(X[numerical_features])


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Hyperparameter search space for the random forest.
param_grid = {
    # 100, 200, 300, 400, 500 trees
    'n_estimators': np.linspace(100, 500, 5).astype(int).tolist(),
    'max_depth': [10, 20, 30, 40, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
}

rf = RandomForestRegressor(random_state=42)

# Randomized search: 20 sampled parameter combinations, each scored by R²
# with 5-fold cross-validation (100 fits in total), using all CPU cores.
# error_score='raise' makes a failing fit abort the search instead of
# recording a placeholder score.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=20,
    scoring='r2',
    cv=5,
    random_state=42,
    verbose=2,
    n_jobs=-1,
    error_score='raise',
)
random_search.fit(X_train, y_train)

# Keep the estimator refitted with the best parameter combination found.
best_rf = random_search.best_estimator_

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [None]:
from sklearn.metrics import r2_score

# Score the tuned forest on the held-out test split.
y_pred = best_rf.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R² score on test data: {r2:.4f}")

In [None]:
import joblib

# Persist the tuned model together with the fitted encoder and scaler so the
# same preprocessing can be re-applied at prediction time.
joblib.dump(value=best_rf, filename='rf_model.pkl')
joblib.dump(value=encoder, filename='encoder.pkl')
joblib.dump(value=scaler, filename='scaler.pkl')

In [None]:
import warnings

# Load the trained model and the fitted preprocessing objects.
# NOTE(review): joblib.load unpickles arbitrary Python objects; only load
# artifact files that come from a trusted source.
model = joblib.load('rf_model.pkl')
encoder = joblib.load('encoder.pkl')
scaler = joblib.load('scaler.pkl')

# Example sample input (same order of columns as X).
# Categorical values are written in lowercase because that is how they appear
# in the training data preview ('sunny', 'high', 'snack', 'no', 'urban', ...).
# The encoder compares strings exactly and encodes any unseen value with its
# unknown code instead of raising, so 'High' or 'Urban' would silently be
# treated as unknown categories.
# NOTE(review): 'fog' is assumed to be one of the fitted weather categories;
# confirm against encoder.categories_.
sample = pd.DataFrame([{
    'Delivery_person_Age': 30,
    'Delivery_person_Ratings': 4.9,
    'Weatherconditions': 'fog',
    'Road_traffic_density': 'high',
    'Vehicle_condition': 2,
    'Type_of_order': 'snack',
    'Type_of_vehicle': 'motorcycle',
    'multiple_deliveries': 1,
    'Festival': 'no',
    'City': 'urban',
    'distance_km': 5.2,
    'prep_time_min': 12.5,
    'order_hour': 14,
    'order_day': 3,
    'is_weekend': 0
}])
categorical_cols = sample.select_dtypes(include=['object']).columns.tolist()
numerical_cols = sample.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Normalise case and surrounding whitespace so inputs such as 'High ' still
# match the lowercase categories.
sample[categorical_cols] = sample[categorical_cols].apply(
    lambda column: column.str.strip().str.lower()
)

# Apply the fitted ordinal encoding and flag any value the encoder did not
# recognise, since such a value makes the prediction unreliable.
encoded = encoder.transform(sample[categorical_cols])
unseen_cols = [
    col
    for col, codes in zip(categorical_cols, encoded.T)
    if (codes == encoder.unknown_value).any()
]
if unseen_cols:
    warnings.warn(
        f"Unrecognised category value(s) in column(s) {unseen_cols}; they were "
        f"encoded as {encoder.unknown_value} and the prediction may be unreliable. "
        "Check encoder.categories_ for the accepted values."
    )
sample[categorical_cols] = encoded

# Apply the same scaling that was fitted for the numerical features.
sample[numerical_cols] = scaler.transform(sample[numerical_cols])

# Predict
pred = model.predict(sample)
print(f"Predicted delivery time: {pred[0]:.2f} minutes")
