In [None]:
! pip install feature_engine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting feature_engine
  Downloading feature_engine-1.5.2-py2.py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.0/290.0 KB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature_engine
Successfully installed feature_engine-1.5.2


In [None]:
import ast
import re
import time
from typing import List, Optional, Union, Dict
import pandas as pd
import numpy as np
from google.colab import drive

import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
 
from sklearn.model_selection import train_test_split 
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer,AddMissingIndicator,CategoricalImputer
from feature_engine.transformation import LogTransformer
from feature_engine.encoding import OrdinalEncoder,OneHotEncoder


from sklearn.base import BaseEstimator, TransformerMixin

# to build the models
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import warnings
warnings.filterwarnings('ignore')

In [None]:
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [None]:
class handleRankingLabels(BaseEstimator, TransformerMixin):
  def __init__(self,variables: Union[None, str, List[str]] = None,target:str=None,tol:Union[None, int, List[int]]=10):
        self.variables = variables
        self.listed_in_ranks:Union[None,List[Union[str, Dict[Union[str, int], List[Union[str, int]]]]]] = None
        self.tol = tol
        self.target = target
        self.listed_in_dict:Union[None,Dict,List[Union[str, Dict[Union[str, int], List[Union[str, int]]]]]] = None

        if (isinstance(self.variables,list) and isinstance(self.tol,list)) and (len(self.variables) != len(self.tol)):
          raise Exception("Number of variable and number of tolerance should be equal in length, check the varables and tol!!")

  
  # This method checks relation with target columns based on that montonic relation assign ordinal values to top 10 features and map those feature on train and test datasets 
  def fit(self,X:pd.DataFrame,y:pd.Series = None):
    X[self.target] = y
    X[self.target] = X[self.target].fillna(X[self.target].median())
       
    if X[self.target].dtypes != float:
      X[self.target] = X[self.target].str.replace(',','').astype(float) 

    # If user provide multiple variable this will find the top monotonic cardinal values of that variable with target variable and stores those top values.
    if isinstance(self.variables,list):
      self.listed_in_ranks = [{f'{self.variables[index]}':list(X.groupby(self.variables[index])[self.target].mean().sort_values(ascending=False)[:self.tol[index]].to_dict().keys())}
                              if len(df[self.variables[index]].unique()) > 10 
                              else {f'{self.variables[index]}':list(X.groupby(self.variables[index])[self.target].mean().sort_values(ascending=False).to_dict().keys())} 
                              for index in range(0,len(self.variables))]

    # If user provide single variable this will find the top monotonic cardinal values of that variable with target variable and stores that top values.
    if isinstance(self.variables,str):
      self.listed_in_ranks = self.listed_in_ranks = list(X.groupby(self.variables)[self.target].mean().sort_values(ascending=False)[:self.tol].to_dict().keys()) if len(df[self.variables].unique()) > 10 else list(X.groupby(self.variables)[self.target].mean().sort_values(ascending=False).to_dict().keys())
       
    return self

  def transform(self,X):

    if isinstance(self.variables,list):
      self.listed_in_dict = list()
      list_in_dict = dict()
      for index in range(0,len(self.variables)):
        # replacing non top categories  
        X[self.variables[index]] = X[self.variables[index]].apply(lambda value:value if value in self.listed_in_ranks[index][self.variables[index]] else 'Rare')   
              
        self.listed_in_ranks[index][self.variables[index]] = self.listed_in_ranks[index][self.variables[index]]+ ['Rare']
        # # Creating dictionary for mapping categories 
        for indx in range(0,len(self.listed_in_ranks[index][self.variables[index]])):
           list_in_dict[self.listed_in_ranks[index][self.variables[index]][indx]] = indx
        self.listed_in_dict.append({self.variables[index]:list_in_dict})

        list_in_dict = dict()
        # # replacing categories
        X[self.variables[index]] = X[self.variables[index]].map(self.listed_in_dict[index][self.variables[index]]) 
    
    if isinstance(self.variables,str):
      self.listed_in_dict = dict()
      # replacing non top categories  
      X[self.variables] = X[self.variables].apply(lambda value:value if value in self.listed_in_ranks else 'Rare')   
            
      self.listed_in_ranks = self.listed_in_ranks + ['Rare']
      # # Creating dictionary for mapping categories 
      for index in range(0,len(self.listed_in_ranks)):
        self.listed_in_dict[self.listed_in_ranks[index]] = index

      # # replacing categories
      X[self.variables] = X[self.variables].map(self.listed_in_dict) 

    return X

In [None]:
class handleMixLabels(BaseEstimator, TransformerMixin):
  def __init__(self,variables: Union[None, str, List[str]] = None,target:str=None,tol:Union[None, int, List[int]]=10):
    self.variables = variables
    self.keys:Union[None,List[Union[str, Dict[Union[str, int], List[Union[str, int]]]]]] = None
    self.tol = tol
    self.target = target
    self.all_unique_feature_type:Union[None,List[Union[str, Dict[Union[str, int], List[Union[str, int]]]]]] = None
    self.top_unique_feature_type_:Union[None,List[Union[str, Dict[Union[str, int], List[Union[str, int]]]]]] = None
    self.unique_rare_feature_type_:Union[None,List[Union[str, Dict[Union[str, int], List[Union[str, int]]]]]] = None



  # we check the top 10 cardinal value which have monotonic relation with target column store that in keys
  # go through the column if it have mixed or comma seperated values the carefully store all unique cardinal values store in all_unique_feature_type
  # based on tolerance it take top cardinal values default value is 10 and store those top values in top_unique_feature_type_
  def fit(self, X: pd.DataFrame,y:pd.Series):
          X = X.dropna().copy()
          X[self.target] = y
          X[self.target] = X[self.target].fillna(X[self.target].median())
        
          if isinstance(self.variables,list):
            self.all_unique_feature_type = list()
            self.top_unique_feature_type_ = list()
            self.unique_rare_feature_type_ = list()

            for index, variable in enumerate(self.variables): 
              # fetch all top categories of variable related to target variables in descending order 
              self.top_unique_feature_type_.append({variable:list(np.unique(sum(pd.Series([key for key in X.groupby(variable)[self.target].mean().sort_values(ascending=False).keys()][:self.tol[index]]).str.split(r'(?:,|;)\s*').dropna().to_numpy(), []))) + ['Rare']})

              # remove duplicated categories as they are all mixed up
              self.all_unique_feature_type.append({variable:list(np.unique(sum(X[variable].str.split(r'(?:,|;)\s*').dropna().to_numpy(), [])))}) 
              
              # Seperate less related unique categories from top categorials for rare labeling
              self.unique_rare_feature_type_.append({variable:[item for item in self.all_unique_feature_type[index][variable] if item not in self.top_unique_feature_type_[index][variable]]})
            
              
          if isinstance(self.variables,str):
            self.all_unique_feature_type = list()
            self.top_unique_feature_type_ = list()
            self.unique_rare_feature_type_ = list()  

            # fetch all top categories of variable related to target variables in descending order 
            self.top_unique_feature_type_ = list(np.unique(sum(pd.Series([key for key in X.groupby(self.variables)[self.target].mean().sort_values(ascending=False).keys()][:self.tol]).str.split(r'(?:,|;)\s*').dropna().to_numpy(), [])))  + ['Rare']       

            # remove duplicated categories as they are all mixed up
            self.all_unique_feature_type.append({self.variables:list(np.unique(sum(X[self.variables].str.split(r'(?:,|;)\s*').dropna().to_numpy(), [])))}) 
              
            # Seperate less related unique categories from top categorials for rare labeling
            self.unique_rare_feature_type_.append({self.variables:[item for item in self.all_unique_feature_type if item not in self.top_unique_feature_type_]})

          return self



  def transform(self, X: pd.DataFrame):
        X = X.copy()
        
        if isinstance(self.variables,list):
          for index, variable in enumerate(self.variables):
            # adding new columns of unique labels in datasets
            for value in self.top_unique_feature_type_[index][variable]:
                 X[f'{variable}_{value}'] = np.where(X[variable].isin([value]),1,0)

            print(np.unique(np.where(X[variable].isin(self.unique_rare_feature_type_[index][variable]),1,0), return_counts=True))
            print(np.unique( np.where(X[variable].str.findall('|'.join(self.unique_rare_feature_type_[index][variable])).str.len()>0,1,0),return_counts=True))
 
            X[f'{variable}_Rare'] = np.where(X[variable].str.findall('|'.join(self.unique_rare_feature_type_[index][variable])).str.len()>0,1,0)    

        if isinstance(self.variables,str):  
            for value in self.top_unique_feature_type_:
                 X[f'{self.variables}_{value}'] = np.where(X[self.variables].isin([value]),1,0)

            print(np.unique(np.where(X[self.variables].isin(self.unique_rare_feature_type_),1,0), return_counts=True))
            print(np.unique( np.where(X[self.variables].str.findall('|'.join(self.unique_rare_feature_type_)).str.len()>0,1,0),return_counts=True))
 
            X[f'{self.variables}_Rare'] = np.where(X[self.variables].str.findall('|'.join(self.unique_rare_feature_type_)).str.len()>0,1,0)
       
        X = X.drop(self.variables,axis=1)
        return X

In [None]:
class featureSelection(BaseEstimator, TransformerMixin):
  def __init__(self,alpha,random_state):
    self.selected_feats = None
    self.__alpha = alpha
    self.__random_state = random_state 
    self.sel_ = None

  def fit(self,X,y):
   self.sel_ = SelectFromModel(Lasso(alpha=self.__alpha, random_state=self.__random_state))
   self.sel_.fit(X, y)  
   return self

  def transform(self,X): 
   self.selected_feats = X.columns[(self.sel_.get_support())]
   return X[self.selected_feats]

In [None]:
class ZomatoModelTrain(handleMixLabels,handleRankingLabels,featureSelection):

  def __init__(self,df):
    self.__df = df
    self.__target = 'approx_cost(for two people)'
    self.__df[self.__target] = self.__df[self.__target].str.replace(',','').astype(float)
    self.__random_state = 100
    self.__alpha = 0.001
    self.__test_size = 0.33
    self.__handleRankingLabels = handleRankingLabels
    self.__handleMixLabels = handleMixLabels
    self.__featureSelection = featureSelection
    self.__variable_to_drop = ['url','address','phone','reviews_list','name','dish_liked','menu_item'] + [self.__target]
    self.__AddMissingIndicatorVariables = ['votes','rate']
    self.__MeanMedianImputerVarables = ['votes']
    self.__CategoricalImputerModeVarables = ['rate','cuisines','rest_type']
    self.__LogTransformerVarables = ['votes']
    self.__OrdinalEncoderVariables = ['rate']
    self.__OneHotEncoderVariables = ['online_order','book_table']
    self.__handleRankingLabels_var = {'variables':['listed_in(type)','listed_in(city)','location'],'tolerance':[10,15,15]}
    self.__handleMixLabels_var ={'variables':['rest_type','cuisines'],'tolerance':[10,15]}


  def applyModelTrain(self):
    X_train,X_test,y_train,y_test = self.__dataSpliter()
    X_train,X_test = self.__dataCleanar(X_train,X_test)  
    return X_train,X_test,y_train.fillna(y_train.median()),y_test.fillna(y_test.median())

  # Splits the data into train and test set
  def __dataSpliter(self):
    X_train, X_test, y_train, y_test = train_test_split(df.drop(self.__variable_to_drop,axis=1), df[self.__target], test_size=self.__test_size, random_state=42)
    X_train,X_test,y_train,y_test = X_train.reset_index().drop('index',axis=1),X_test.reset_index().drop('index',axis=1),y_train.reset_index().drop('index',axis=1),y_test.reset_index().drop('index',axis=1)
    X_train,X_test = self.__dataCleanar(X_train,X_test)
    return X_train,X_test,y_train,y_test

  # This method andle noisy data from rate and votes columns
  def __dataCleanar(self,X_train,X_test):
    # replacing '-' with nan in rate variable
    X_train['rate'] = X_train['rate'].replace('-',np.nan)  
    X_test['rate'] = X_test['rate'].replace('-',np.nan)

    # replacing '0' with nan in votes variable
    X_train['votes'] = X_train['votes'].replace(0,np.nan)
    X_test['votes'] = X_test['votes'].replace(0,np.nan)
    
    # replacing '/5' with ' in rates variable
    X_train['rate'] = X_train['rate'].apply(lambda value:str(value).replace('/5',''))
    X_test['rate'] = X_test['rate'].apply(lambda value:str(value).replace('/5',''))
    
    X_train['rate'] = X_train['rate'].apply(lambda value: 0 if value == 'NEW' else value).astype(float)
    X_test['rate'] = X_test['rate'].apply(lambda value: 0 if value == 'NEW' else value).astype(float)

    return X_train,X_test

  # This method is for rate column where based on certain ranges we made this column to categorical oridinal variable
  def handleRating(self,X_train,X_test,feature):
    return np.where(X_train[feature]==np.nan,np.nan,np.where(X_train[feature]==0,'New',np.where(X_train[feature]<2.5,'Poor',np.where((X_train[feature]>2.5) | (X_train[feature]<3.5),'Average','Good')))),np.where(X_test[feature]==np.nan,np.nan,np.where(X_test[feature]==0,'New',np.where(X_test[feature]<2.5,'Poor',np.where((X_test[feature]>2.5) | (X_test[feature]<3.5),'Average','Good')))) 

  def featurePipeline(self):
    pipe = Pipeline([
      #  Missing indicator
      ('Add missing indicator',AddMissingIndicator(
          variables=self.__AddMissingIndicatorVariables)),
    
      #   Median Missing Imputation
      ('Median Missing Imputation',MeanMedianImputer(
          imputation_method='median', variables=self.__MeanMedianImputerVarables)),

      #   Mode Missing Imputation
      ('Mode Missing Imputation',CategoricalImputer(
          imputation_method='frequent', variables=self.__CategoricalImputerModeVarables)),        

      # Feature Transformation
      ('LogTransformer',LogTransformer(
          variables=self.__LogTransformerVarables)),

      #  Ordinal Encoder
      ('OrdinalEncoder',OrdinalEncoder(
          encoding_method='ordered',variables=self.__OrdinalEncoderVariables)),

      #  OneHotEncoder
      ('OneHotEncoder',OneHotEncoder(
          drop_last=True,variables=self.__OneHotEncoderVariables)),

      # handleRankingLabels
      ('handleRankingLabels listed_in(type)',self.__handleRankingLabels(
          variables=self.__handleRankingLabels_var['variables'],tol=self.__handleRankingLabels_var['tolerance'],target=self.__target)),        

      #  handleMixLabels Imputation
      ('handleMixLabels Imputation cuisines', self.__handleMixLabels(
          variables=self.__handleMixLabels_var['variables'],tol=self.__handleMixLabels_var['tolerance'],target=self.__target)), 

      # #  feature selection
      # ('feature selection',featureSelection(alpha=self.__alpha, random_state=self.__random_state)),

      # Model
      # ('Random Forest Model',RandomForestRegressor(bootstrap= True,max_depth= 10,min_samples_leaf= 2,min_samples_split= 2,n_estimators= 100,oob_score= True))
    ])

    return pipe


In [None]:
df = pd.read_csv('/content/gdrive/MyDrive/zomato.csv')

In [None]:
zmt = ZomatoModelTrain(df)

In [None]:
X_train,X_test,y_train,y_test = zmt.applyModelTrain()

In [None]:
X_train = X_train.dropna().reset_index().drop('index',axis=1)
y_train = y_train.dropna().reset_index().drop('index',axis=1)[:X_train.shape[0]]

In [None]:
X_train.shape,y_train.shape

((27812, 9), (27812, 1))

In [None]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27812 entries, 0 to 27811
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   online_order     27812 non-null  object 
 1   book_table       27812 non-null  object 
 2   rate             27812 non-null  float64
 3   votes            27812 non-null  float64
 4   location         27812 non-null  object 
 5   rest_type        27812 non-null  object 
 6   cuisines         27812 non-null  object 
 7   listed_in(type)  27812 non-null  object 
 8   listed_in(city)  27812 non-null  object 
dtypes: float64(2), object(7)
memory usage: 1.9+ MB


In [None]:
X_train['rate'],X_test['rate'] = zmt.handleRating(X_train,X_test,'rate')

In [None]:
X_train['rate'].unique(),X_test['rate'].unique()

(array(['Average', 'Poor'], dtype=object),
 array(['Average', 'New', 'Good', 'Poor'], dtype=object))

In [None]:
pipe = zmt.featurePipeline()

In [None]:
pipe = pipe.fit(X_train,y_train)

In [None]:
# record start time
start = time.time()

train = pipe.transform(X_train)

# record end time
end = time.time()
 
# print the difference between start
# and end time in milli. secs
print("The time of execution of above program is :",
      (end-start) * 10**3, "ms")

(array([0, 1]), array([25370,  2442]))
(array([0, 1]), array([23106,  4706]))
(array([0, 1]), array([26706,  1106]))
(array([0, 1]), array([15701, 12111]))
The time of execution of above program is : 472.80359268188477 ms


In [None]:
train.sample(5)

Unnamed: 0,rate,votes,location,listed_in(type),listed_in(city),online_order_Yes,book_table_No,rest_type_Bakery,rest_type_Beverage Shop,rest_type_Cafe,rest_type_Casual Dining,rest_type_Dessert Parlor,rest_type_Fine Dining,rest_type_Food Court,rest_type_Meat Shop,rest_type_Microbrewery,rest_type_Pub,rest_type_Quick Bites,rest_type_Rare,cuisines_Arabian,cuisines_BBQ,cuisines_Bakery,cuisines_Beverages,cuisines_Biryani,cuisines_Cafe,cuisines_Chettinad,cuisines_Chinese,cuisines_Continental,cuisines_Desserts,cuisines_Fast Food,cuisines_French,cuisines_German,cuisines_Ice Cream,cuisines_Italian,cuisines_Juices,cuisines_Middle Eastern,cuisines_Momos,cuisines_North Indian,cuisines_Oriya,cuisines_Rolls,cuisines_Seafood,cuisines_South Indian,cuisines_Spanish,cuisines_Steak,cuisines_Thai,cuisines_Turkish,cuisines_Rare
15464,1,2.890372,16,3,14,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
22469,1,6.862758,16,5,7,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
17710,1,4.248495,16,5,16,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
16709,1,4.442651,16,5,16,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7652,1,5.135798,16,3,16,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [None]:
train['rest_type_Bakery'].value_counts()

0    27341
1      471
Name: rest_type_Bakery, dtype: int64

In [None]:
train['rest_type_Rare'].value_counts()

0    23106
1     4706
Name: rest_type_Rare, dtype: int64

In [None]:
train['cuisines_Rare'].value_counts()

0    15701
1    12111
Name: cuisines_Rare, dtype: int64

In [None]:
fs = featureSelection(alpha=0.01,random_state=100)

In [None]:
fs = fs.fit(train,y_train)

In [None]:
train = fs.transform(train)

In [None]:
rf = RandomForestRegressor(bootstrap= True,max_depth= 10,min_samples_leaf= 2,min_samples_split= 2,n_estimators= 100,oob_score= True)

In [None]:
rf = rf.fit(train,y_train)

In [None]:
pred = rf.predict(train)

In [None]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

In [None]:
r2_score(y_train,pred)

0.06697711978959486

In [None]:
np.sqrt(mean_squared_error(y_train,pred))

425.49313257413894

In [None]:
mean_absolute_error(y_train,pred)

291.17468702030015

In [None]:
residual_train = np.array(y_train)

In [None]:
# record start time
start = time.time()

test = pipe.transform(X_test)


# record end time
end = time.time()
 
# print the difference between start
# and end time in milli. secs
print("The time of execution of above program is :",
      (end-start) * 10**3, "ms")

(array([0, 1]), array([15432,  1635]))
(array([0, 1]), array([14038,  3029]))
(array([0, 1]), array([16443,   624]))
(array([0, 1]), array([10229,  6838]))
The time of execution of above program is : 520.2810764312744 ms


In [None]:
test = fs.transform(test)

In [None]:
test.isnull().sum()

rate                        3330
votes                          0
location                       0
listed_in(type)                0
listed_in(city)                0
online_order_Yes               0
book_table_No                  0
rest_type_Bakery               0
rest_type_Beverage Shop        0
rest_type_Cafe                 0
rest_type_Casual Dining        0
rest_type_Dessert Parlor       0
rest_type_Fine Dining          0
rest_type_Food Court           0
rest_type_Microbrewery         0
rest_type_Pub                  0
rest_type_Quick Bites          0
rest_type_Rare                 0
cuisines_Arabian               0
cuisines_BBQ                   0
cuisines_Beverages             0
cuisines_Biryani               0
cuisines_Cafe                  0
cuisines_Chettinad             0
cuisines_Chinese               0
cuisines_Continental           0
cuisines_Desserts              0
cuisines_Fast Food             0
cuisines_Ice Cream             0
cuisines_Italian               0
cuisines_N

In [None]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17067 entries, 0 to 17066
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   rate                      13737 non-null  float64
 1   votes                     17067 non-null  float64
 2   location                  17067 non-null  int64  
 3   listed_in(type)           17067 non-null  int64  
 4   listed_in(city)           17067 non-null  int64  
 5   online_order_Yes          17067 non-null  int64  
 6   book_table_No             17067 non-null  int64  
 7   rest_type_Bakery          17067 non-null  int64  
 8   rest_type_Beverage Shop   17067 non-null  int64  
 9   rest_type_Cafe            17067 non-null  int64  
 10  rest_type_Casual Dining   17067 non-null  int64  
 11  rest_type_Dessert Parlor  17067 non-null  int64  
 12  rest_type_Fine Dining     17067 non-null  int64  
 13  rest_type_Food Court      17067 non-null  int64  
 14  rest_t

In [None]:
pred_test = rf.predict(test)

ValueError: ignored

In [None]:
r2_score(y_test,pred_test)

In [None]:
np.sqrt(mean_squared_error(y_test,pred_test))

In [None]:
mean_absolute_error(y_test,pred_test)