In [13]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split,cross_val_score,KFold,GridSearchCV
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error
from xgboost import XGBRegressor
import joblib

In [14]:
def categorize_time(time_str):
    '''Map a clock-time string to a coarse part-of-day label.

    Only the text before the first ':' is used; it is parsed as an integer
    hour. The returned labels keep their original (mixed) capitalisation
    because they are used verbatim as category values downstream.

    Parameters
    ----------
    time_str : str
        Time text whose leading field is the hour, e.g. '13:45:10'.

    Returns
    -------
    str
        'Morning' for hours 4-11, 'afternoon' for 12-16, 'evening' for
        17-19 and 'Night' for every other hour.

    Raises
    ------
    ValueError
        If the leading field cannot be parsed by int().
    '''
    hour_of_day = int(time_str.split(':')[0])
    # Anything outside the 04-19 window is night.
    if hour_of_day < 4 or hour_of_day > 19:
        return 'Night'
    if hour_of_day <= 11:
        return 'Morning'
    if hour_of_day <= 16:
        return 'afternoon'
    return 'evening'

In [15]:
class Porter_delivery_time_prediction:
    '''Feature engineering, training, evaluation and persistence for a
    delivery-time regressor.

    Typical flow: preprocess_data -> cleaning_data -> train_model ->
    evaluate_model -> save_model, or load_model -> predict.

    Attributes
    ----------
    model_type : str
        Label supplied at construction time. It is stored only;
        train_model always builds an XGBRegressor regardless of its value.
    model : object or None
        The fitted estimator; None until train_model or load_model has run.
    '''

    def __init__(self,model_type='xgboost'):
        '''Store the model label and start with no fitted estimator.'''
        self.model_type = model_type
        # Initialised up front so "not trained yet" is an explicit state.
        self.model = None

    def _fitted_model(self):
        '''Return the fitted estimator, or fail with a clear message.

        AttributeError is raised deliberately: it is the exception type that
        accessing the missing attribute produced before, so existing
        handlers keep working.
        '''
        fitted = getattr(self, 'model', None)
        if fitted is None:
            raise AttributeError(
                'no model available: call train_model() or load_model() first'
            )
        return fitted

    def preprocess_data(self,train_data):
        '''Derive the target and engineered features from raw order rows.

        The frame is modified IN PLACE (columns are added and dropped on the
        object that was passed in) and the same object is returned.

        Steps:
          * parse 'created_at' and 'actual_delivery_time' as datetimes;
          * 'delivery_time' = minutes between those two timestamps;
          * 'available_partners' = total_onshift_partners - total_busy_partners;
          * 'ordered_time' = time-of-day of 'created_at' as text, and
            'day_time' = categorize_time(ordered_time);
          * write the enriched frame to 'training_data.csv' in the current
            working directory;
          * drop the timestamp, 'ordered_time', 'market_id', 'store_id' and
            'store_primary_category' columns.

        Parameters
        ----------
        train_data : pandas.DataFrame
            Must contain every column referenced above.

        Returns
        -------
        pandas.DataFrame
            The same (mutated) frame.
        '''
        train_data['created_at'] = pd.to_datetime(train_data['created_at'])
        train_data['actual_delivery_time'] = pd.to_datetime(train_data['actual_delivery_time'])
        elapsed = train_data['actual_delivery_time'] - train_data['created_at']
        train_data['delivery_time'] = elapsed.dt.total_seconds()/60
        train_data['available_partners'] = train_data['total_onshift_partners'] - train_data['total_busy_partners']
        # categorize_time works on text, so the time-of-day is stringified first.
        train_data['ordered_time'] = train_data['created_at'].dt.time.astype(str)
        train_data['day_time'] = train_data['ordered_time'].apply(categorize_time)
        # Snapshot is taken before the helper columns are dropped.
        train_data.to_csv('training_data.csv',index=False)
        train_data.drop(columns=['created_at','actual_delivery_time','ordered_time','market_id','store_id','store_primary_category'],inplace=True)
        print('data preprocessing is completed')
        return train_data

    def cleaning_data(self,train_data):
        '''Remove IQR outliers, one-hot encode 'day_time' and split X / y.

        Every column except 'order_protocol' and 'day_time' is treated as
        numeric and filtered to the closed interval
        [Q1 - 1.5*IQR, Q3 + 1.5*IQR], with the quartiles computed once on the
        incoming frame. Rows holding NaN in any of those columns are dropped
        as well, because NaN fails both comparisons.

        NOTE(review): the target 'delivery_time' is among the filtered
        columns, so rows with extreme targets are removed before any
        train/test split the caller performs - confirm this is intended.

        Parameters
        ----------
        train_data : pandas.DataFrame
            Output of preprocess_data (needs 'day_time' and 'delivery_time').

        Returns
        -------
        tuple
            (x, y): the feature frame without 'delivery_time', and the
            'delivery_time' series.
        '''
        numeric_features = [
            column for column in train_data.columns
            if column not in ['order_protocol','day_time']
        ]
        numeric_part = train_data[numeric_features]
        q1 = numeric_part.quantile(0.25)
        q3 = numeric_part.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5*iqr
        upper_bound = q3 + 1.5*iqr
        # The bounds are fixed up front, so one combined mask selects exactly
        # the rows a column-by-column sequential filter would keep.
        within_bounds = (numeric_part.ge(lower_bound) & numeric_part.le(upper_bound)).all(axis=1)
        train_data = train_data[within_bounds]
        train_data_new = pd.get_dummies(train_data,columns=['day_time'],drop_first=True,dtype='int')
        x = train_data_new.drop(columns=['delivery_time'])
        y = train_data_new['delivery_time']
        print(x.columns)
        print('data cleaning is completed')
        return x,y

    def train_model(self,x_train,y_train):
        '''Fit an XGBRegressor with fixed hyper-parameters and keep it on self.

        Parameters
        ----------
        x_train, y_train
            Training features and target, passed straight to fit().

        Returns
        -------
        XGBRegressor
            The fitted estimator (also stored as self.model).
        '''
        model = XGBRegressor(
            objective='reg:squarederror',
            colsample_bytree=0.8,
            learning_rate=0.01,
            max_depth=9,
            n_estimators=500,
            subsample=0.8,
            random_state=42
        )
        model.fit(x_train,y_train)
        self.model = model
        print('training the model is completed')
        return model

    def evaluate_model(self,x_test,y_test):
        '''Print and return MAE and RMSE of the current model on held-out data.

        Returns
        -------
        dict
            {'mae': ..., 'rmse': ...} holding the same values that are printed.

        Raises
        ------
        AttributeError
            If no model has been trained or loaded.
        '''
        y_pred = self._fitted_model().predict(x_test)
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test,y_pred))
        print('evaluating the model is completed')
        print("Validation MAE:", mae)
        print('root mean squared error',rmse)
        return {'mae': mae, 'rmse': rmse}

    def save_model(self,file_path):
        '''Persist the current model to file_path with joblib.

        Raises
        ------
        AttributeError
            If no model has been trained or loaded.
        '''
        joblib.dump(self._fitted_model(),file_path)

    def load_model(self,filepath):
        '''Load a persisted model from filepath and install it as self.model.

        Returns
        -------
        object
            The loaded estimator.
        '''
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code; only load model files from a trusted source.
        self.model = joblib.load(filepath)
        return self.model

    def predict(self,x):
        '''Return the current model's predictions for x.

        Raises
        ------
        AttributeError
            If no model has been trained or loaded.
        '''
        return self._fitted_model().predict(x)
        
        

In [16]:
def main(data_path=r"C:\ML\datasets\porter delivery estimation time\dataset.csv",
         model_path='delivery_model.pkl',
         test_size=0.3,
         random_state=42):
    '''Run the full pipeline: load, preprocess, clean, split, train, evaluate, save.

    Parameters
    ----------
    data_path : str
        CSV file with the raw order data. The default is a machine-specific
        absolute path kept so that a plain main() call behaves as before;
        pass your own location when running elsewhere.
    model_path : str
        Destination file for the persisted model.
    test_size : float
        Fraction of the cleaned rows held out for evaluation.
    random_state : int
        Seed for the train/test split.
    '''
    delivery_time_prediction_system = Porter_delivery_time_prediction()
    train_data = pd.read_csv(data_path)
    train_data = delivery_time_prediction_system.preprocess_data(train_data)
    x,y = delivery_time_prediction_system.cleaning_data(train_data)
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    delivery_time_prediction_system.train_model(x_train,y_train)
    delivery_time_prediction_system.evaluate_model(x_test,y_test)
    delivery_time_prediction_system.save_model(model_path)

main()

data preprocessing is completed
Index(['order_protocol', 'total_items', 'subtotal', 'num_distinct_items',
       'min_item_price', 'max_item_price', 'total_onshift_partners',
       'total_busy_partners', 'total_outstanding_orders', 'available_partners',
       'day_time_Night', 'day_time_afternoon', 'day_time_evening'],
      dtype='object')
data cleaning is completed
training the model is completed
evaluating the model is completed
Validation MAE: 10.236930542496522
root mean squared error 12.91729969057304
