In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [2]:
# Load the pre-cleaned dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('Cleaned_Dataset/clean_dataset_v1.csv')
df.head()

Unnamed: 0,Restaurant,Location,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Delivery_Time
0,ID_6321,"FTI College, Law College Road, Pune","Fast Food, Rolls, Burger, Salad, Wraps",200.0,50.0,3.5,12.0,4.0,30.0
1,ID_2882,"Sector 3, Marathalli","Ice Cream, Desserts",100.0,50.0,3.5,11.0,4.0,30.0
2,ID_1595,Mumbai Central,"Italian, Street Food, Fast Food",150.0,50.0,3.6,99.0,30.0,65.0
3,ID_5929,"Sector 1, Noida","Mughlai, North Indian, Chinese",250.0,99.0,3.7,176.0,95.0,30.0
4,ID_6123,"Rmz Centennial, I Gate, Whitefield","Cafe, Beverages",200.0,99.0,3.2,521.0,235.0,65.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11094 entries, 0 to 11093
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Restaurant     11094 non-null  object 
 1   Location       11094 non-null  object 
 2   Cuisines       11094 non-null  object 
 3   Average_Cost   11094 non-null  float64
 4   Minimum_Order  11094 non-null  float64
 5   Rating         11094 non-null  float64
 6   Votes          11094 non-null  float64
 7   Reviews        11094 non-null  float64
 8   Delivery_Time  11094 non-null  float64
dtypes: float64(6), object(3)
memory usage: 780.2+ KB


In [4]:
df.isnull().sum()

Restaurant       0
Location         0
Cuisines         0
Average_Cost     0
Minimum_Order    0
Rating           0
Votes            0
Reviews          0
Delivery_Time    0
dtype: int64

In [5]:
# Count-encode the two text columns: with encoding_method='count' each category
# is replaced by the number of rows in which it appears.
# Newer feature_engine releases expose this encoder as
# `feature_engine.encoding.CountFrequencyEncoder`; older ones (as used when this
# notebook was written) as `categorical_encoders.CountFrequencyCategoricalEncoder`.
# Try the current name first and fall back to the legacy one.
try:
    from feature_engine import encoding as ce
    CountEncoder = ce.CountFrequencyEncoder
except (ImportError, AttributeError):
    from feature_engine import categorical_encoders as ce
    CountEncoder = ce.CountFrequencyCategoricalEncoder

# NOTE(review): the encoder is fitted on the full frame, before the train/test
# split done further down, so the counts include rows that later land in the test set.
encoder = CountEncoder(encoding_method = 'count',
                       variables = ['Cuisines', 'Location'])
df_encoded = encoder.fit_transform(df)

In [6]:
# Drop the restaurant identifier column (values such as 'ID_6321'); the remaining
# columns are the count-encoded Location/Cuisines plus the numeric fields.
final_df = df_encoded.drop(columns = ['Restaurant'])
final_df.head()

Unnamed: 0,Location,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Delivery_Time
0,584,1,200.0,50.0,3.5,12.0,4.0,30.0
1,366,61,100.0,50.0,3.5,11.0,4.0,30.0
2,459,3,150.0,50.0,3.6,99.0,30.0,65.0
3,791,16,250.0,99.0,3.7,176.0,95.0,30.0
4,457,14,200.0,99.0,3.2,521.0,235.0,65.0


In [7]:
# Positional feature/target split: the last column (Delivery_Time) is the target.
# NOTE(review): X and y are not referenced by any later cell shown in this notebook;
# TrainTestSplit() repeats the same positional split on the frame it is given.
X = final_df.iloc[:, :-1]
y = final_df.iloc[:, -1]

In [8]:
# Per-column first/third quartiles and interquartile range (q3 - q1);
# these feed the outlier filter in the next cell.
q1 = final_df.quantile(0.25)
q3 = final_df.quantile(0.75)
iqr = q3 - q1
iqr

Location         258.000000
Cuisines         164.000000
Average_Cost     100.000000
Minimum_Order      0.000000
Rating             0.400000
Votes            222.870690
Reviews          110.674536
Delivery_Time     15.000000
dtype: float64

In [9]:
# Tukey fences: a value is flagged when it lies more than 1.5 * IQR outside [q1, q3].
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
is_outlier = (final_df < lower_fence) | (final_df > upper_fence)

# Despite its name, `outlier_df` holds the rows that are KEPT, i.e. rows with no
# flagged value in any column. The test covers every column of final_df,
# including the count-encoded ones and the Delivery_Time target.
# Gotcha: a column whose IQR is 0 (Minimum_Order in the output above) has
# identical lower and upper fences, so every row that differs from that single
# value is dropped.
outlier_df = final_df[~is_outlier.any(axis = 1)]
outlier_df.head()

Unnamed: 0,Location,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Delivery_Time
0,584,1,200.0,50.0,3.5,12.0,4.0,30.0
1,366,61,100.0,50.0,3.5,11.0,4.0,30.0
2,459,3,150.0,50.0,3.6,99.0,30.0,65.0
5,457,66,150.0,50.0,3.8,46.0,18.0,30.0
6,459,53,150.0,50.0,3.7,108.0,31.0,30.0


In [10]:
len(outlier_df)

7054

In [11]:
def TrainTestSplit(df, test_size = 0.3, random_state = 7):
    """Split a frame into train/test features and target.

    The last column of ``df`` is treated as the target and every preceding
    column as a feature; the selection is positional, not by column name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose final column is the value to predict.
    test_size : float or int, default 0.3
        Forwarded unchanged to ``sklearn.model_selection.train_test_split``.
    random_state : int or None, default 7
        Forwarded unchanged to ``train_test_split``. The fixed default means
        repeated calls on the same frame produce the same partition.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as returned by
        ``train_test_split``.
    """
    from sklearn.model_selection import train_test_split

    features = df.iloc[:, :-1]
    target = df.iloc[:, -1]
    return train_test_split(features, target,
                            test_size = test_size, random_state = random_state)

In [12]:
def PrintError(pred, test):
    """Print the mean absolute error and a MAPE-based accuracy score.

    Both inputs are converted to float arrays and compared element by element
    in positional order, so a pandas index on either argument is ignored.
    "Accuracy" is ``100 - MAPE``, where MAPE is the mean of
    ``100 * |pred - test| / |test|``.

    Parameters
    ----------
    pred : array-like
        Predicted values.
    test : array-like
        True values, broadcastable against ``pred``.

    Returns
    -------
    None
        The two metrics are printed, each rounded to two decimals.
    """
    pred = np.asarray(pred, dtype = float)
    test = np.asarray(test, dtype = float)
    # Give both arrays a common shape so the mask below indexes them consistently.
    pred, test = np.broadcast_arrays(pred, test)

    errors = np.abs(pred - test)
    print('Mean Absolute Error: ', round(np.mean(errors), 2))

    # A zero target has no defined percentage error (it would divide by zero and
    # turn the whole score into inf/NaN), so MAPE is averaged over non-zero
    # targets only. The denominator is |test| so negative targets do not flip
    # the sign of the percentage.
    nonzero = test != 0
    if nonzero.any():
        mape = 100 * errors[nonzero] / np.abs(test[nonzero])
        accuracy = 100 - np.mean(mape)
    else:
        # No usable target: the percentage error is undefined.
        accuracy = float('nan')
    print('Accuracy :', round(accuracy, 2))

In [13]:
def RandomForest(df):
    """Fit a random-forest regressor on ``df`` and print its hold-out error.

    ``df`` is partitioned with ``TrainTestSplit`` (last column is the target),
    the model is trained on the training part, and its predictions on the test
    part are reported through ``PrintError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose final column is the regression target.

    Returns
    -------
    sklearn.ensemble.RandomForestRegressor
        The fitted model. The train/test partitions are not returned.
    """
    from sklearn.ensemble import RandomForestRegressor

    # Fixed hyperparameters for this notebook; random_state pins the result.
    hyperparams = dict(
        n_estimators = 1800,
        min_samples_split = 2,
        min_samples_leaf = 1,
        max_features = 'sqrt',
        max_depth = None,
        bootstrap = False,
        random_state = 7,
    )

    train_X, test_X, train_y, test_y = TrainTestSplit(df)

    forest = RandomForestRegressor(**hyperparams)
    forest.fit(train_X, train_y)

    # errors on the held-out part
    PrintError(forest.predict(test_X), test_y)

    return forest

In [14]:
rf = RandomForest(outlier_df)

Mean Absolute Error:  4.66
Accuracy : 88.01


In [17]:
# Re-create the partition used inside RandomForest(): TrainTestSplit uses a fixed
# random_state, so calling it again on the same frame is expected to give back the
# same X_test / y_test the model was scored on.
X_train, X_test, y_train, y_test = TrainTestSplit(outlier_df)

In [18]:
pred = rf.predict(X_test)

In [19]:
pred

array([30.30277778, 31.3375    , 31.66805556, ..., 42.38611111,
       42.52875817, 37.23333333])

In [20]:
# Distribution of the target after outlier removal (column 7 is Delivery_Time).
outlier_df['Delivery_Time'].value_counts()

30.0    5211
45.0    1406
65.0     420
20.0      15
10.0       2
Name: Delivery_Time, dtype: int64

In [23]:
def predictions(arr):
    """Snap each value of ``arr`` to a fixed delivery-time bucket, in place.

    Buckets (upper bounds are exclusive):
        value < 15        -> 10
        15 <= value < 25  -> 20
        25 <= value < 38  -> 30
        38 <= value < 55  -> 45
        55 <= value < 93  -> 65
        anything else     -> 120

    The input is MODIFIED: every element is overwritten with its bucket value
    and the same object is returned, so the caller's original values are lost.

    Parameters
    ----------
    arr : mutable sequence of numbers
        Indexed as ``arr[0] .. arr[len(arr) - 1]``.

    Returns
    -------
    The same ``arr`` object, now holding bucket values.
    """
    # (exclusive upper bound, bucket value), in ascending order of bound.
    buckets = ((15, 10), (25, 20), (38, 30), (55, 45), (93, 65))

    for pos in range(len(arr)):
        raw = arr[pos]
        snapped = 120  # fallback when no upper bound matches
        for upper, label in buckets:
            if raw < upper:
                snapped = label
                break
        arr[pos] = snapped
    return arr

In [24]:
# Snap each raw prediction to one of the discrete delivery-time buckets.
# predictions() writes into the array it is given and returns it, so `pred` is
# overwritten here and `new_pred` refers to the same array object.
new_pred = predictions(pred)

In [25]:
new_pred

array([30., 30., 30., ..., 45., 45., 30.])

In [27]:
new_df = pd.DataFrame(new_pred)

In [28]:
new_df.head()

Unnamed: 0,0
0,30.0
1,30.0
2,30.0
3,45.0
4,30.0


In [30]:
new_df[0].value_counts()

30.0    1553
45.0     533
65.0      29
20.0       2
Name: 0, dtype: int64

In [34]:
test = pd.DataFrame(y_test)

In [37]:
test['Delivery_Time'].value_counts()

30.0    1568
45.0     408
65.0     139
20.0       2
Name: Delivery_Time, dtype: int64

In [38]:
# Score the bucketed predictions. `new_pred` is passed explicitly instead of `pred`
# so this cell does not silently depend on predictions() having overwritten
# `pred` in place.
PrintError(new_pred, y_test)

Mean Absolute Error:  3.63
Accuracy : 91.34
