# Build model from the cleaned Data :
## Task: Use various regression algorithms to see the best one

In [4]:
# loading the data
import pandas as pd
import numpy as np
import math
path = r'Datasets/cleaned_zomato_dataset.csv'
Data = pd.read_csv(path)
# little preprocessing of boolean columns
def haversine(lat1, lon1, lat2, lon2):
    # Convert decimal degrees to radians
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])
    
    # Differences
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    
    # Haversine formula
    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
    c = 2 * math.asin(math.sqrt(a))
    
    # Earth radius in kilometers
    r = 6371
    
    return c * r
for col in Data.columns:
  if Data[col].dtype == 'bool':
    Data[col] = Data[col].astype(int)
Data.head()
Data['distance_km'] = Data.apply(lambda row: haversine(row['Restaurant_latitude'], row['Restaurant_longitude'], row['Delivery_location_latitude'], row['Delivery_location_longitude']), axis=1)
Data[:10]

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Vehicle_condition,multiple_deliveries,Time_taken (min),Weather_conditions_Fog,...,Type_of_order_Drinks,Type_of_order_Meal,Type_of_order_Snack,Type_of_vehicle_electric_scooter,Type_of_vehicle_motorcycle,Type_of_vehicle_scooter,Festival_Yes,City_Semi-Urban,City_Urban,distance_km
0,36.0,4.3,1.626032,0.341547,1.763005,0.34431,1.163958,2.5,46.0,1,...,0,0,1,0,1,0,0,0,0,15.233795
1,21.0,4.7,-0.85698,0.265581,-1.011851,0.260575,-0.027871,1.0,23.0,0,...,0,1,0,0,1,0,0,0,0,17.229854
2,23.0,4.7,0.188686,0.161105,0.161812,0.149735,-0.027871,1.0,21.0,0,...,1,0,0,0,0,1,0,0,0,3.24462
3,34.0,4.3,1.695864,0.24381,1.834113,0.236037,-1.219701,0.0,20.0,0,...,0,0,0,0,1,0,0,0,0,15.396903
4,24.0,4.7,1.153926,0.443221,1.244366,0.457321,-0.027871,1.0,41.0,1,...,0,0,1,0,0,1,0,0,0,10.177946
5,29.0,4.5,0.263673,0.113917,0.24549,0.098604,1.163958,1.0,20.0,0,...,0,0,0,1,0,0,0,0,0,2.643327
6,35.0,4.3,-0.57502,0.280756,-0.697215,0.277018,-0.027871,1.0,33.0,0,...,0,1,0,0,0,1,0,0,0,13.593798
7,33.0,4.3,0.192383,0.154854,0.168665,0.143909,1.163958,1.0,40.0,0,...,0,0,1,0,1,0,0,0,0,2.904626
8,34.0,4.9,0.049878,0.357339,0.004192,0.361421,-1.219701,1.0,41.0,0,...,0,0,1,0,1,0,0,0,0,5.100311
9,21.0,4.7,0.676155,0.791914,0.697595,0.830418,-1.219701,1.0,15.0,0,...,0,1,0,0,1,0,0,0,1,4.900223


In [23]:
# importing the libraries for training the model and testing
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression


# Extraction of training and testing data:
X = Data.drop(columns=['Time_taken (min)','Delivery_location_longitude','Delivery_location_latitude','Restaurant_longitude','Restaurant_latitude'])
y = Data['Time_taken (min)']
# train test split
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,shuffle=True)


In [24]:
DecisionTreeModel = DecisionTreeRegressor(random_state=42)
param_grid = {
    'max_depth': [None, 5, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None,  'sqrt', 'log2']
}
grid_searchDT = GridSearchCV(estimator=DecisionTreeModel,param_grid=param_grid,cv=5,scoring='neg_mean_squared_error')
grid_searchDT.fit(X_train,y_train)

print(f"grid-search best param:{grid_searchDT.best_params_}")
best_modelDT = grid_searchDT.best_estimator_
print(best_modelDT.score(X_test,y_test))

grid-search best param:{'max_depth': 10, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
0.6900946909499921


In [None]:
RandomForestModel = RandomForestRegressor(random_state=42)
RandomForestModel.fit(X_train,y_train)
print(RandomForestModel.score(X_test,y_test))

0.7231735414838372


In [22]:
LinearModel = LinearRegression()
LinearModel.fit(X_train,y_train)
print(LinearModel.score(X_train,y_train))

0.5320059081708424
