In [1]:
base_url = "..\\experiment(Jupyter)_artifacts\\"
dataset_folder_path = "built_dataset"
train_file_name = "train.csv"
feature_engineering_artifacts_folder = "Feature_Engineering"
TARGET_FEATURE = "Time_taken"
useless_features = ['ID', 'Delivery_person_ID']
MAX_UNIQUE_VALUES_FOR_DISCRETE_FEATURES = 25
validation_file = "validation.csv"
tensorboard_logdir = "tensorboard_logs"
pipeline_file_name = 'pipeline.joblib'

# Loading Libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from typing import List
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder,OneHotEncoder, StandardScaler
import joblib

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_squared_error, r2_score

pd.pandas.set_option('display.max_columns',None)
%matplotlib inline

# Creating the paths and folders

In [3]:
train_file_path = os.path.join(base_url,dataset_folder_path,train_file_name)

In [4]:
validation_file_path = os.path.join(base_url,dataset_folder_path,validation_file)

In [5]:
pipeline_file_path = os.path.join(base_url,pipeline_file_name)

# Preprocess Classes

In [6]:
class DropUndersiredFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, features_to_drop=['ID', 'Delivery_person_ID']):
        
        self.features_to_drop = features_to_drop

    def fit(self, X, y=None):
        return self  # nothing to do

    def transform(self, X, y=None):
        df = X.copy()
        for feature in self.features_to_drop:
            ## try and catch to avoid run time errors
            try:
                df.drop(feature, axis=1, inplace=True)
            except:
                pass

        return df

## Imputing Numerical features class

class ImputeNumericalFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.numerical_features_nan = []
        self.medians = []
        
    def fit(self,X,y=None):
        try:
            ### Numerical features having null features
            
            self.numerical_features_nan = [feature for feature in X.columns if X[feature].dtypes !='O' and X[feature].isnull().sum()>=1]
        
            self.medians = []
            for feature in self.numerical_features_nan:
                self.medians.append(X[feature].median())
        except:
            pass
        
        return self
    
    def transform(self, X, y=None):
        df = X.copy()
        for index, feature in enumerate(self.numerical_features_nan):

            try:
                median_value = self.medians[index]
                ## create new feature to capture nan values
                df[feature + '_nan'] = np.where(df[feature].isnull(), 1, 0)
                ### replace the missing values with median
                df[feature] = df[feature].fillna(median_value)

            except:
                pass
        return df
        
class ImputeCategoricalFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.categorical_features = []  # empty categorical features list

    def fit(self, X, y=None):
        try:
            # Get the categorical features
            self.categorical_features = [feature for feature in X.columns if X[feature].dtype == 'O']
        except:
            pass

        return self

    def transform(self, X, y=None):
        ## standardize the features
        df = X.copy()
        try:
            for feature in self.categorical_features:
                df[feature] = df[feature].str.lower()
        except:
            pass

        ## Handle missing values
        try:
            df[self.categorical_features] = df[self.categorical_features].fillna('missing')

        except:
            pass

        return df
class ExtractFromTemporalFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, temporal_features = ['Order_Date', 'Time_Order_picked', 'Time_Orderd']):
        
        self.temporal_features = temporal_features
    
    def fit(self, X, y = None):
        
        return self
    
    def transform(self, X, y = None):
        df = X.copy()
        
        try:
            ## Working on Order Date
            df[self.temporal_features[0]+"_Day"] = df[self.temporal_features[0]].str.split('-').str[0].astype(int)
            df[self.temporal_features[0]+"_Month"] = df[self.temporal_features[0]].str.split('-').str[1].astype(int)
            
            
            
            ### Working on Time Order Picked
            df[self.temporal_features[1]+"_Hr"] = df[self.temporal_features[1]].str.split(':').str[0].astype(int)
            df[self.temporal_features[1]+"_Min"] = df[self.temporal_features[1]].str.split(':').str[1].astype(int)
            
            df.drop(self.temporal_features, axis = 1, inplace = True)
            
            return df
        except Exception as e:
            print(str(e))
        
        
class GenerateDistanceFromLatitudeAndLongitude(BaseEstimator,TransformerMixin):
    def __init__(self,distance_features = ['Restaurant_latitude', 'Restaurant_longitude','Delivery_location_latitude','Delivery_location_longitude']):
        self.distance_features = distance_features
    
    def fit(self,X,y=None):
        #print('fit')
        return self
    
    def transform(self,X,y=None):
        df = X.copy()
        #print('transform')
        try:
            ## Convert features to radians
            for feature in self.distance_features:
                df[feature] = np.radians(df[feature])
                
            lat1 = df[self.distance_features[0]]
            lon1 = df[self.distance_features[1]]
            
            lat2 = df[self.distance_features[2]]
            lon2 = df[self.distance_features[3]]
            
            
            newlon = lon2 - lon1
            newlat = lat2 - lat1
            
            haver_formula = np.sin(newlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(newlon/2.0)**2
            
            dist = 2 * np.arcsin(np.sqrt(haver_formula ))
            
            km = 6367 * dist #6367 for distance in KM for miles use 3958

            df['Distance'] = km
            
            
            
            return df
         
        except Exception as e:
            print(str(e))
        
        
class Label_Encode(BaseEstimator, TransformerMixin):
    def __init__(self, ordinal_features = ['Festival','Road_traffic_density']):
        self.ordinal_features = ordinal_features
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        df = X.copy()
        try:
            #print(df[self.ordinal_features])
            df['Festival'] = df['Festival'].map({'no':0,'yes':1,'missing':0})
            df['Road_traffic_density']=df['Road_traffic_density'].map({'low':0,'medium' : 1,'high' : 2,'jam':3,'missing' : 0})
            
            
            return df
        except:
            print(str(e))

class OHE_Categorical_Features(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.categorical_features = []
        self.ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

    def fit(self, X, y=None):
        try:
            self.categorical_features = [feature for feature in X.columns if X[feature].dtype == 'O']
            self.ohe.fit(X[self.categorical_features])
            ##print(self.ohe.categories_)
        except:
            pass

        return self

    def transform(self, X, y=None):
        df = X.copy()
        try:
            cat_ohe = self.ohe.transform(df[self.categorical_features])

            ohe_df = pd.DataFrame(cat_ohe, columns=self.ohe.get_feature_names(input_features=self.categorical_features),
                                  index=df.index)
            df = pd.concat([df, ohe_df], axis=1).drop(columns=self.categorical_features, axis=1)
        except Exception as e:
            pass

        return df


class remove_high_corelated_features(BaseEstimator, TransformerMixin):
    @staticmethod
    ## We set a corelation threshold (probably 0.85)
    ## We iterate on corelation matrix and see if 2 features have this much or greater corelation.
    ## If they have, we will remove one of the features.

    def get_high_correlation_feature(dataset, threshold):
        col_corr = set() ## set of column names to be dropped
        corr_matrix = dataset.corr()

        for i in range(len(corr_matrix.columns)-1,-1,-1):
            for j in range(i-1,-1,-1):
                if abs(corr_matrix.iloc[i,j]) > threshold:
                    colname = corr_matrix.columns[i] ## getting the column name
                    #print(corr_matrix.columns[i],"-----" ,corr_matrix.columns[j], "====", corr_matrix.iloc[i,j])
                    col_corr.add(colname)
        return(col_corr)
            
    
    def __init__(self, threshold = 0.85):
        self.correlated_features = []
        self.threshold = threshold
    
    def fit(self,X,y=None):
        self.correlated_features= remove_high_corelated_features.get_high_correlation_feature(X,self.threshold)
        
        return self
        
    def transform(self,X,y=None):
        df = X.copy()
        
        for feature in self.correlated_features:
            try:
                df.drop(feature,axis = 1, inplace= True)
            except:
                continue
        return df

# Loading the data and pipeline

In [7]:
train_df = pd.read_csv(train_file_path)

In [8]:
validation_df = pd.read_csv(validation_file_path)

In [9]:
pipeline = joblib.load(pipeline_file_path)

# Seperating independant and dependant features

In [10]:
X_train,y_train = train_df.drop(TARGET_FEATURE, axis = 1), train_df[TARGET_FEATURE]

In [11]:
X_validation , y_validation = validation_df.drop(TARGET_FEATURE, axis = 1), validation_df[TARGET_FEATURE]

In [12]:
pipeline.fit(X_train)



In [13]:
X_train_transformed = pipeline.transform(X_train)



In [14]:
X_validation_transformed = pipeline.transform(X_validation)



# Linear Regression

In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
lr = LinearRegression()

In [17]:
lr.fit(X_train_transformed,y_train)

In [18]:
predictions_train = lr.predict(X_train_transformed)

In [19]:
predictions_validation = lr.predict(X_validation_transformed)

In [20]:
root_mean_squared_error_train = np.sqrt(mean_squared_error(y_train,predictions_train))
root_mean_squared_error_validation = np.sqrt(mean_squared_error(y_validation,predictions_validation))

In [21]:
r2_train = r2_score(y_train,predictions_train)
r2_validation = r2_score(y_validation,predictions_validation)

In [22]:
print("Dataset\t\t\tRoot mean square error\t\t\tR square score")
print(f"Train\t\t\t{root_mean_squared_error_train}\t\t\t{r2_train}")
print(f"Validation\t\t{root_mean_squared_error_validation}\t\t\t{r2_validation}")

Dataset			Root mean square error			R square score
Train			6.278349052110622			0.5520717513626424
Validation		6.27864087962563			0.5547454231903701


# SVR

In [23]:
from sklearn.svm import LinearSVR

In [24]:
svr = LinearSVR()

In [25]:
svr.fit(X_train_transformed,y_train)

In [26]:
predictions_train = svr.predict(X_train_transformed)

In [27]:
predictions_validation = svr.predict(X_validation_transformed)

In [28]:
root_mean_squared_error_train = np.sqrt(mean_squared_error(y_train,predictions_train))
root_mean_squared_error_validation = np.sqrt(mean_squared_error(y_validation,predictions_validation))

In [29]:
r2_train = r2_score(y_train,predictions_train)
r2_validation = r2_score(y_validation,predictions_validation)

In [30]:
print("Dataset\t\t\tRoot mean square error\t\t\tR square score")
print(f"Train\t\t\t{root_mean_squared_error_train}\t\t\t{r2_train}")
print(f"Validation\t\t{root_mean_squared_error_validation}\t\t\t{r2_validation}")

Dataset			Root mean square error			R square score
Train			6.300598751601082			0.5488913201822972
Validation		6.302034677364084			0.5514212641128122


# Random Forests

In [31]:
from sklearn.ensemble import RandomForestRegressor

In [32]:
rf  =RandomForestRegressor()

In [33]:
rf.fit(X_train_transformed,y_train)

In [34]:
predictions_train = rf.predict(X_train_transformed)

In [35]:
predictions_validation = rf.predict(X_validation_transformed)

In [36]:
root_mean_squared_error_train = np.sqrt(mean_squared_error(y_train,predictions_train))
root_mean_squared_error_validation = np.sqrt(mean_squared_error(y_validation,predictions_validation))
r2_train = r2_score(y_train,predictions_train)
r2_validation = r2_score(y_validation,predictions_validation)

In [37]:
print("Dataset\t\t\tRoot mean square error\t\t\tR square score")
print(f"Train\t\t\t{root_mean_squared_error_train}\t\t\t{r2_train}")
print(f"Validation\t\t{root_mean_squared_error_validation}\t\t\t{r2_validation}")

Dataset			Root mean square error			R square score
Train			1.4962584071179037			0.9745592010976
Validation		3.9345400960650863			0.825150233558825


# Extra Trees

In [38]:
from sklearn.ensemble import ExtraTreesRegressor

In [39]:
extra = ExtraTreesRegressor()

In [40]:
extra.fit(X_train_transformed,y_train)

In [41]:
predictions_train = extra.predict(X_train_transformed)
predictions_validation = extra.predict(X_validation_transformed)

In [42]:
root_mean_squared_error_train = np.sqrt(mean_squared_error(y_train,predictions_train))
root_mean_squared_error_validation = np.sqrt(mean_squared_error(y_validation,predictions_validation))
r2_train = r2_score(y_train,predictions_train)
r2_validation = r2_score(y_validation,predictions_validation)

In [43]:
print("Dataset\t\t\tRoot mean square error\t\t\tR square score")
print(f"Train\t\t\t{root_mean_squared_error_train}\t\t\t{r2_train}")
print(f"Validation\t\t{root_mean_squared_error_validation}\t\t\t{r2_validation}")

Dataset			Root mean square error			R square score
Train			0.0			1.0
Validation		4.086817081745783			0.8113540414829488


# Ada Boost

In [44]:
from sklearn.ensemble import AdaBoostRegressor

In [45]:
ada = AdaBoostRegressor()

In [46]:
ada.fit(X_train_transformed,y_train)

In [47]:
predictions_train = ada.predict(X_train_transformed)
predictions_validation = ada.predict(X_validation_transformed)

In [48]:
root_mean_squared_error_train = np.sqrt(mean_squared_error(y_train,predictions_train))
root_mean_squared_error_validation = np.sqrt(mean_squared_error(y_validation,predictions_validation))
r2_train = r2_score(y_train,predictions_train)
r2_validation = r2_score(y_validation,predictions_validation)

In [49]:
print("Dataset\t\t\tRoot mean square error\t\t\tR square score")
print(f"Train\t\t\t{root_mean_squared_error_train}\t\t\t{r2_train}")
print(f"Validation\t\t{root_mean_squared_error_validation}\t\t\t{r2_validation}")

Dataset			Root mean square error			R square score
Train			6.024762946814549			0.5875251577651301
Validation		6.031818588874642			0.5890645118634115


# Gradient Boosting

In [50]:
from sklearn.ensemble import GradientBoostingRegressor

In [51]:
gb = GradientBoostingRegressor()

In [52]:
gb.fit(X_train_transformed,y_train)

In [53]:
predictions_train = gb.predict(X_train_transformed)
predictions_validation = gb.predict(X_validation_transformed)

In [54]:
root_mean_squared_error_train = np.sqrt(mean_squared_error(y_train,predictions_train))
root_mean_squared_error_validation = np.sqrt(mean_squared_error(y_validation,predictions_validation))
r2_train = r2_score(y_train,predictions_train)
r2_validation = r2_score(y_validation,predictions_validation)

In [55]:
print("Dataset\t\t\tRoot mean square error\t\t\tR square score")
print(f"Train\t\t\t{root_mean_squared_error_train}\t\t\t{r2_train}")
print(f"Validation\t\t{root_mean_squared_error_validation}\t\t\t{r2_validation}")

Dataset			Root mean square error			R square score
Train			4.503901567895414			0.7694870552053813
Validation		4.504968707833232			0.7707756959219245


# XGBoost

In [56]:
from xgboost import XGBRegressor, XGBRFRegressor

In [57]:
xgb = XGBRegressor()

In [58]:
xgb.fit(X_train_transformed,y_train)

In [59]:
predictions_train = xgb.predict(X_train_transformed)
predictions_validation = xgb.predict(X_validation_transformed)

In [60]:
root_mean_squared_error_train = np.sqrt(mean_squared_error(y_train,predictions_train))
root_mean_squared_error_validation = np.sqrt(mean_squared_error(y_validation,predictions_validation))
r2_train = r2_score(y_train,predictions_train)
r2_validation = r2_score(y_validation,predictions_validation)

In [61]:
print("Dataset\t\t\tRoot mean square error\t\t\tR square score")
print(f"Train\t\t\t{root_mean_squared_error_train}\t\t\t{r2_train}")
print(f"Validation\t\t{root_mean_squared_error_validation}\t\t\t{r2_validation}")

Dataset			Root mean square error			R square score
Train			3.416947020837422			0.8673234933093927
Validation		3.9623734388516314			0.8226676729693676


#### XGBRF

In [62]:
xgbrf = XGBRFRegressor()

In [63]:
xgbrf.fit(X_train_transformed,y_train)

In [64]:
predictions_train = xgbrf.predict(X_train_transformed)
predictions_validation = xgbrf.predict(X_validation_transformed)

In [65]:
root_mean_squared_error_train = np.sqrt(mean_squared_error(y_train,predictions_train))
root_mean_squared_error_validation = np.sqrt(mean_squared_error(y_validation,predictions_validation))
r2_train = r2_score(y_train,predictions_train)
r2_validation = r2_score(y_validation,predictions_validation)

In [66]:
print("Dataset\t\t\tRoot mean square error\t\t\tR square score")
print(f"Train\t\t\t{root_mean_squared_error_train}\t\t\t{r2_train}")
print(f"Validation\t\t{root_mean_squared_error_validation}\t\t\t{r2_validation}")

Dataset			Root mean square error			R square score
Train			5.327809706404458			0.6774367046231888
Validation		5.400644437231625			0.670566108369886


# Random Forest Looks the most promising

In [67]:
rf = RandomForestRegressor(n_estimators=200, min_samples_split= 6,max_depth=15)

In [68]:
#rf = RandomForestRegressor(n_estimators=100, min_samples_split= 6,max_depth=15, criterion='absolute_error',verbose=10, n_jobs=-1)

In [69]:
rf.fit(X_train_transformed,y_train)

In [70]:
predictions_train = rf.predict(X_train_transformed)
predictions_validation = rf.predict(X_validation_transformed)

In [71]:
root_mean_squared_error_train = np.sqrt(mean_squared_error(y_train,predictions_train))
root_mean_squared_error_validation = np.sqrt(mean_squared_error(y_validation,predictions_validation))
r2_train = r2_score(y_train,predictions_train)
r2_validation = r2_score(y_validation,predictions_validation)

In [72]:
print("Dataset\t\t\tRoot mean square error\t\t\tR square score")
print(f"Train\t\t\t{root_mean_squared_error_train}\t\t\t{r2_train}")
print(f"Validation\t\t{root_mean_squared_error_validation}\t\t\t{r2_validation}")

Dataset			Root mean square error			R square score
Train			2.949120068269579			0.9011668933800919
Validation		3.866532065057126			0.8311425067100264


# Hyperparameter Tuning

In [73]:
parameters = {
    "n_estimators":[50,100,200,500,1000],
    
    'min_samples_split':[2,4,6],
    'max_features':['int','sqrt','log2'],
    'bootstrap':[True,False],
}

In [74]:
from sklearn.model_selection import GridSearchCV


In [75]:
rf= RandomForestRegressor()

In [76]:
grid = GridSearchCV(rf, parameters,cv=3,verbose=10,n_jobs=-1, scoring='r2')

In [77]:
grid.fit(X_train_transformed,y_train)


Fitting 3 folds for each of 90 candidates, totalling 270 fits


90 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\rachi\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\rachi\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 476, in fit
    trees = Parallel(
  File "C:\Users\rachi\anaconda3\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\Users\rachi\anaconda3\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\rachi\anaco

In [78]:
grid.best_score_

0.8193349198404963

In [79]:
grid.best_params_

{'bootstrap': False,
 'max_features': 'log2',
 'min_samples_split': 6,
 'n_estimators': 1000}

In [80]:
best_model = grid.estimator

In [81]:
best_model.fit(X_train_transformed,y_train)

In [82]:
predictions_train = best_model.predict(X_train_transformed)
predictions_validation = best_model.predict(X_validation_transformed)

In [83]:
root_mean_squared_error_train = np.sqrt(mean_squared_error(y_train,predictions_train))
root_mean_squared_error_validation = np.sqrt(mean_squared_error(y_validation,predictions_validation))
r2_train = r2_score(y_train,predictions_train)
r2_validation = r2_score(y_validation,predictions_validation)

In [84]:
print("Dataset\t\t\tRoot mean square error\t\t\tR square score")
print(f"Train\t\t\t{root_mean_squared_error_train}\t\t\t{r2_train}")
print(f"Validation\t\t{root_mean_squared_error_validation}\t\t\t{r2_validation}")

Dataset			Root mean square error			R square score
Train			1.4966648875373574			0.9745453764918676
Validation		3.9228873323012765			0.8261843904404218


# Deep Learning

In [85]:
from tensorflow import keras

In [86]:
import tensorboard

In [87]:
%load_ext tensorboard

In [92]:
log_dir = os.path.join(base_url,tensorboard_logdir)

In [94]:
X_train_transformed.shape

(41033, 33)

In [95]:
model = keras.models.Sequential()

In [96]:
model.add(keras.layers.Input(shape=(33,)))

In [97]:
model.add(keras.layers.Dense(100,activation='selu',kernel_initializer="lecun_normal"))

In [98]:
model.add(keras.layers.Dense(100,activation='selu',kernel_initializer="lecun_normal"))

In [99]:
model.add(keras.layers.Dense(100,activation='selu',kernel_initializer="lecun_normal"))

In [100]:
model.add(keras.layers.Dense(100,activation='selu',kernel_initializer="lecun_normal"))
model.add(keras.layers.Dense(100,activation='selu',kernel_initializer="lecun_normal"))

In [101]:
model.add(keras.layers.Dense(1,activation='linear'))

In [102]:
tensorboard_cb = keras.callbacks.TensorBoard(log_dir)

In [103]:
early_stopping_cb = keras.callbacks.EarlyStopping(patience=10,restore_best_weights=True)

In [104]:
model.compile(loss='mean_squared_error',optimizer='nadam')

In [105]:
model.fit(X_train_transformed,y_train, epochs=100, validation_data = (X_validation_transformed,y_validation), callbacks=[tensorboard_cb,early_stopping_cb])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100


<keras.callbacks.History at 0x25d0281aac0>

In [106]:
predictions_train = model.predict(X_train_transformed)
predictions_validation = model.predict(X_validation_transformed)

In [107]:
root_mean_squared_error_train = np.sqrt(mean_squared_error(y_train,predictions_train))
root_mean_squared_error_validation = np.sqrt(mean_squared_error(y_validation,predictions_validation))
r2_train = r2_score(y_train,predictions_train)
r2_validation = r2_score(y_validation,predictions_validation)

In [108]:
print("Dataset\t\t\tRoot mean square error\t\t\tR square score")
print(f"Train\t\t\t{root_mean_squared_error_train}\t\t\t{r2_train}")
print(f"Validation\t\t{root_mean_squared_error_validation}\t\t\t{r2_validation}")

Dataset			Root mean square error			R square score
Train			4.667903770625554			0.7523939074114505
Validation		4.941310118326769			0.7242209139285154


## Deep Learning not working well as well. Will use Clustering to give another shot