In [1]:
# Colab-only setup: mount Google Drive into the runtime and make the project
# folder the working directory (presumably where the train*.csv files read
# by the cells below are stored — confirm).
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): hard-coded, user-specific Drive path; adjust it when running
# from another account or outside Colab.
%cd '/content/drive/MyDrive/TrangNgan'

Mounted at /content/drive
/content/drive/MyDrive/TrangNgan


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

# Shared experiment settings.
test_size = 0.2  # fraction of rows held out, passed to the train_test_split calls below
random_state = 36  # seed passed to the train_test_split calls below
num_folds = 10  # NOTE(review): not referenced in the visible cells — presumably the KFold split count; confirm
seed = 7  # NOTE(review): not referenced in the visible cells — presumably a seed for cross-validation; confirm
scoring = 'neg_mean_squared_error'  # NOTE(review): not referenced in the visible cells — presumably for cross_val_score / GridSearchCV; confirm

In [2]:
def read_data(filename):
  """Load a CSV file and discard incomplete rows.

  The file is decoded as ISO-8859-1. Every row that contains at least one
  missing value is removed; the surviving rows keep their original index
  labels (the index is not reset).

  Args:
    filename: path of the CSV file to read.

  Returns:
    A pandas DataFrame holding only the fully populated rows.
  """
  raw_frame = pd.read_csv(filename, encoding="ISO-8859-1")
  return raw_frame.dropna()

def label_encoder(data):
  """Return a copy of ``data`` with its object-dtype columns integer-encoded.

  Every column whose dtype is ``object`` is replaced by the integer codes
  produced by a ``LabelEncoder`` fitted on that column alone. Columns of any
  other dtype are carried over unchanged.

  The DataFrame passed in is NOT modified: encoding is done on a copy, so
  calling this function twice on the same raw frame gives the same result and
  the caller's raw data stays available.

  Args:
    data: pandas DataFrame to encode.

  Returns:
    A new DataFrame with the same index and columns as ``data``.
  """
  is_category = data.dtypes == object
  category_column_list = data.columns[is_category].tolist()

  # Work on a copy so the caller's frame is never mutated as a side effect.
  encoded = data.copy()

  # Nothing to encode: hand back the untouched copy.
  if not category_column_list:
    return encoded

  for column in category_column_list:
    # A fresh encoder per column: no fitted state is shared between columns.
    encoded[column] = LabelEncoder().fit_transform(encoded[column])
  return encoded

# **Dataset 1**

In [3]:
# Dataset 1: load the CSV (incomplete rows dropped) and integer-encode its text columns.
# Disabled feature derivation, kept for reference (the preview below already shows a
# 'Distance' column, so this was presumably computed upstream — confirm):
# data1['Distance'] = np.sqrt((data1['Delivery_location_latitude'] - data1['Restaurant_latitude'])**2 + (data1['Restaurant_longitude'] - data1['Delivery_location_longitude'])**2)
data1 = label_encoder(read_data('train.csv'))
data1.head(3)

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Distance,Order_Date,...,Time_Order_picked,Weather,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken
1,5060,1299,27.0,21,305,43,2378,513,169,39,...,153,5,2,2,0,1,1.0,0,2,14
2,36545,14,34.0,24,442,255,3940,2929,807,26,...,164,0,2,1,0,1,1.0,0,0,27
3,39234,265,23.0,25,356,228,3010,2607,2055,30,...,106,2,3,2,3,1,1.0,0,0,21


In [4]:
# Features are every column except the target 'Time_taken'.
X1 = data1.drop(columns='Time_taken').values
# The target keeps its (n, 1) shape so any later cell relying on it still works.
y1 = data1['Time_taken'].values.reshape(-1, 1)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=test_size, random_state=random_state)

# Seed the model as well as the split: gradient boosting uses its random_state
# when building trees, so an unseeded model can print a slightly different MSE
# on every run.
model = GradientBoostingRegressor(random_state=random_state).fit(X_train1, y_train1.ravel())
# NOTE(review): label_encoder() encodes every object-dtype column, the target
# included; if 'Time_taken' is read as text this MSE is in label-code units — confirm.
print(f"MSE testing = {mean_squared_error(y_test1, model.predict(X_test1))}")

MSE testing = 18.706980497219075


# **Dataset 2**

In [5]:
# Dataset 2: load the CSV (incomplete rows dropped) and integer-encode its text columns.
data2 = label_encoder(read_data('train2.csv'))
data2.head(3)

Unnamed: 0,Restaurant,Location,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Delivery_Time
0,5086,10,995,8,11,15,89,449,3
1,1785,30,1116,5,11,15,43,449,3
2,571,19,1182,6,11,16,1098,353,5


In [6]:
# Features are every column except the target 'Delivery_Time'.
X2 = data2.drop(columns='Delivery_Time').values
# The target keeps its (n, 1) shape so any later cell relying on it still works.
y2 = data2['Delivery_Time'].values.reshape(-1, 1)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=test_size, random_state=random_state)

# Seed the model as well as the split: gradient boosting uses its random_state
# when building trees, so an unseeded model can print a slightly different MSE
# on every run.
model_2 = GradientBoostingRegressor(random_state=random_state).fit(X_train2, y_train2.ravel())
# NOTE(review): the small integers shown for 'Delivery_Time' in the preview suggest
# the target was label-encoded by label_encoder(); if so, this MSE is measured on
# label codes, whose order need not follow the actual durations — confirm.
print(f"MSE testing = {mean_squared_error(y_test2, model_2.predict(X_test2))}")

MSE testing = 0.3536542574694685


# **Dataset 3**

In [7]:
# Dataset 3: load the CSV with incomplete rows dropped. Unlike the other two
# datasets, label_encoder() is not applied here.
data3 = read_data('train3.csv')
# Preview the first three rows.
data3.head(3)

Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_partners,total_busy_partners,total_outstanding_orders
0,1.0,2015-02-06 22:24:17,2015-02-06 23:27:16,df263d996281d984952c07998dc54358,american,1.0,4,3441,4,557,1239,33.0,14.0,21.0
1,2.0,2015-02-10 21:49:25,2015-02-10 22:56:29,f0ade77b43923b38237db569b016ba25,mexican,2.0,1,1900,1,1400,1400,1.0,2.0,2.0
8,2.0,2015-02-16 00:11:35,2015-02-16 00:38:01,f0ade77b43923b38237db569b016ba25,indian,3.0,4,4771,3,820,1604,8.0,6.0,18.0


In [8]:
# Build the regression target and reduce the frame to numeric columns.
# NOTE(review): this cell overwrites data3, so it cannot be re-run without
# re-running the loading cell first.
data3 = (
  data3
  .assign(
    # Parse both timestamps so they can be subtracted.
    created_at=lambda df: pd.to_datetime(df['created_at']),
    actual_delivery_time=lambda df: pd.to_datetime(df['actual_delivery_time']),
    # Target: whole seconds elapsed between order creation and delivery.
    Time_taken=lambda df: (df['actual_delivery_time'] - df['created_at']).dt.total_seconds().astype('int64'),
  )
  .drop('store_id', axis=1)
  # Only int64/float64 columns survive; the timestamp and text columns are dropped here.
  .select_dtypes(include=['int64', 'float64'])
)

In [9]:
# Features are every column except the target 'Time_taken'.
X3 = data3.drop(columns='Time_taken').values
# The target keeps its (n, 1) shape so any later cell relying on it still works.
y3 = data3['Time_taken'].values.reshape(-1, 1)
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, test_size=test_size, random_state=random_state)

# Seed the model as well as the split: gradient boosting uses its random_state
# when building trees, so an unseeded model can print a slightly different MSE
# on every run.
model_3 = GradientBoostingRegressor(random_state=random_state).fit(X_train3, y_train3.ravel())
# 'Time_taken' is computed in seconds for this dataset, so the MSE is in seconds squared.
print(f"MSE testing = {mean_squared_error(y_test3, model_3.predict(X_test3))}")

MSE testing = 4104007.682985719
