<a href="https://colab.research.google.com/github/SarkarPriyanshu/Machine-Learning-Models/blob/main/Zomato_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install feature_engine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting feature_engine
  Downloading feature_engine-1.5.2-py2.py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.0/290.0 KB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature_engine
Successfully installed feature_engine-1.5.2


In [2]:
import ast
import re
from typing import List, Optional, Union, Dict
import pandas as pd
import numpy as np
from google.colab import drive

import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
 
from sklearn.model_selection import train_test_split 
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer,AddMissingIndicator,CategoricalImputer
from feature_engine.transformation import LogTransformer
from feature_engine.encoding import OrdinalEncoder,OneHotEncoder


from sklearn.base import BaseEstimator, TransformerMixin

# to build the models
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import warnings
warnings.filterwarnings('ignore')

In [3]:
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [67]:
class handleMixLabels(BaseEstimator, TransformerMixin):
  def __init__(self,variables: Union[None, str, List[str]] = None,target:str=None,tol:Union[None, int, List[int]]=10):
    self.variables = variables
    self.keys:Union[None,List[Union[str, Dict[Union[str, int], List[Union[str, int]]]]]] = None
    self.tol = tol
    self.target = target
    self.all_unique_feature_type:Union[None,List[Union[str, Dict[Union[str, int], List[Union[str, int]]]]]] = None
    self.top_unique_feature_type_:Union[None,List[Union[str, Dict[Union[str, int], List[Union[str, int]]]]]] = None
    self.unique_rare_feature_type_:Union[None,List[Union[str, Dict[Union[str, int], List[Union[str, int]]]]]] = None



  # we check the top 10 cardinal value which have monotonic relation with target column store that in keys
  # go through the column if it have mixed or comma seperated values the carefully store all unique cardinal values store in all_unique_feature_type
  # based on tolerance it take top cardinal values default value is 10 and store those top values in top_unique_feature_type_
  def fit(self, X: pd.DataFrame,y:pd.Series):
        X = X.dropna().copy()
        X[self.target] = y
        X[self.target] = X[self.target].fillna(X[self.target].median())
       
        if X[self.target].dtypes != float:
          X[self.target] = X[self.target].str.replace(',','').astype(float)

        if isinstance(self.variables,list):
          self.keys = list()
          variable_keys = list()
          self.all_unique_feature_type = list()
          self.top_unique_feature_type_ = list()
          self.unique_rare_feature_type_ = list()

          for index in range(0,len(self.variables)):

            # fetch all top categories of variable related to target variables in descending order 
            for key in X.groupby(self.variables[index])[self.target].mean().sort_values(ascending=False).keys():
                    variable_keys.append(key)
            self.keys.append({self.variables[index]:variable_keys})
            variable_keys = list()

            # remove duplicated categories as they are all mixed up
            self.all_unique_feature_type.append({self.variables[index]:list()})
           
            # collect all unique categories from variables   
            for value in X[self.variables[index]]:
              if ',' in value:
                for item in value.split(','):
                  if item.strip() not in self.all_unique_feature_type[index][self.variables[index]]:
                    self.all_unique_feature_type[index][self.variables[index]].append(item.strip())
              else:
                if value.strip() not in self.all_unique_feature_type[index][self.variables[index]]:
                  self.all_unique_feature_type[index][self.variables[index]].append(value.strip())  
            
            self.top_unique_feature_type_.append({self.variables[index]:list()})

            # Seperate less related unique categories from top categorials for rare labeling
            for value in self.keys[index][self.variables[index]][:self.tol[index]]:
              if isinstance(value,tuple):
                for item in value:
                  if item.strip() not in self.top_unique_feature_type_[index][self.variables[index]]:
                      self.top_unique_feature_type_[index][self.variables[index]].append(item.strip())
              elif ',' in value:
                  for item in value.split(','):
                    if item.strip() not in self.top_unique_feature_type_[index][self.variables[index]]:
                      self.top_unique_feature_type_[index][self.variables[index]].append(item.strip())
              else:
                if value.strip() not in self.top_unique_feature_type_[index][self.variables[index]]:
                  self.top_unique_feature_type_[index][self.variables[index]].append(value.strip())

            self.top_unique_feature_type_[index][self.variables[index]] = self.top_unique_feature_type_[index][self.variables[index]] + ['Rare']
            
            self.unique_rare_feature_type_.append({self.variables[index]:list()})
            self.unique_rare_feature_type_[index][self.variables[index]] = [value for value in self.all_unique_feature_type[index][self.variables[index]] if value not in self.top_unique_feature_type_[index][self.variables[index]]]

            
        if isinstance(self.variables,str):
          self.keys = list()
          self.all_unique_feature_type = list()
          self.top_unique_feature_type_ = list()
          self.unique_rare_feature_type_ = list()  

          # fetch all top categories of variable related to target variables in descending order 
          for key in X.groupby(self.variables)[self.target].mean().sort_values(ascending=False).keys():
                    self.keys.append(key)
           
          # collect all unique categories from variables   
          for value in X[self.variables]:
              if ',' in value:
                for item in value.split(','):
                  if item.strip() not in self.all_unique_feature_type:
                    self.all_unique_feature_type.append(item.strip())
              else:
                if value.strip() not in self.all_unique_feature_type:
                  self.all_unique_feature_type.append(value.strip())  
            

          # Seperate less related unique categories from top categorials for rare labeling
          for value in self.keys[:self.tol]:
              if isinstance(value,tuple):
                for item in value:
                  if item.strip() not in self.top_unique_feature_type_:
                      self.top_unique_feature_type_.append(item.strip())
              elif ',' in value:
                  for item in value.split(','):
                    if item.strip() not in self.top_unique_feature_type_:
                      self.top_unique_feature_type_.append(item.strip())
              else:
                if value.strip() not in self.top_unique_feature_type_:
                  self.top_unique_feature_type_.append(value.strip())

          self.top_unique_feature_type_ = self.top_unique_feature_type_ + ['Rare']  
          self.unique_rare_feature_type_ = [value for value in self.all_unique_feature_type if value not in self.top_unique_feature_type_]


        return self



  def transform(self, X: pd.DataFrame):
        X = X.copy()
        
        if isinstance(self.variables,list):
          for index in range(0,len(self.variables)):
            # adding new columns of unique labels in datasets
            for value in self.top_unique_feature_type_[index][self.variables[index]]:
              X[f'{self.variables[index]}_{value}'] = np.zeros(X.shape[0])
                    
              X[f'{self.variables[index]}_{value}'].astype(int)   

            # Adding 1 and 0's to those newly added columns
            for value in self.top_unique_feature_type_[index][self.variables[index]]:
              for indx in range(0,X.shape[0]):
                if value in X[self.variables[index]][indx]:
                  X[f'{self.variables[index]}_{value}'][indx] = 1
                      
            for value in self.unique_rare_feature_type_[index][self.variables[index]]:
              for indx in range(0,X.shape[0]):
                if value in X[self.variables[index]][indx]:
                  X[f'{self.variables[index]}_Rare'][indx] = 1




          for index in range(0,len(self.variables)):
            # adding new columns of unique labels in datasets
            for value in self.top_unique_feature_type_[index][self.variables[index]]:
              X[f'{self.variables[index]}_{value}'] = np.zeros(X.shape[0])

            # Adding 1 and 0's to those newly added columns
            for value in self.top_unique_feature_type_[index][self.variables[index]]:
              for indx in range(0,X.shape[0]):
                if  ',' in  X[self.variables[index]][indx]:
                  for item in X[self.variables[index]][indx].split(','):
                    if item.strip() not in self.unique_rare_feature_type_[index][self.variables[index]]:
                        X[f'{self.variables[index]}_{value}'][indx] = 1
                    if item.strip() in self.unique_rare_feature_type_[index][self.variables[index]]:
                      X[f'{self.variables[index]}_Rare'][indx] = 1
                else:
                  if value.strip() == X[self.variables[index]][indx].strip():
                        X[f'{self.variables[index]}_{value}'][indx] = 1
                  if X[self.variables[index]][indx].strip() in self.unique_rare_feature_type_[index][self.variables[index]]:
                      X[f'{self.variables[index]}_Rare'][indx] = 1

        if isinstance(self.variables,str):  
         # adding new columns of unique labels in datasets
          for value in self.top_unique_feature_type_:
            X[f'{self.variables}_{value}'] = np.zeros(X.shape[0])
                  
            X[f'{self.variables}_{value}'].astype(int)   

          # Adding 1 and 0's to those newly added columns
          for value in self.top_unique_feature_type_:
            for index in range(0,X.shape[0]):
              if value in X[self.variables][index]:
                X[f'{self.variables}_{value}'][index] = 1
                    
          for value in self.unique_rare_feature_type_:
            for index in range(0,X.shape[0]):
              if value in X[self.variables][index]:
                X[f'{self.variables}_Rare'][index] = 1
       
        X = X.drop(self.variables,axis=1)
        return X

In [68]:
class handleRankingLabels(BaseEstimator, TransformerMixin):
  def __init__(self,variables: Union[None, str, List[str]] = None,target:str=None,tol:Union[None, int, List[int]]=10):
        self.variables = variables
        self.listed_in_ranks:Union[None,List[Union[str, Dict[Union[str, int], List[Union[str, int]]]]]] = None
        self.tol = tol
        self.target = target
        self.listed_in_dict:Union[None,Dict,List[Union[str, Dict[Union[str, int], List[Union[str, int]]]]]] = None

        if (isinstance(self.variables,list) and isinstance(self.tol,list)) and (len(self.variables) != len(self.tol)):
          raise Exception("Number of variable and number of tolerance should be equal in length, check the varables and tol!!")

  
  # This method checks relation with target columns based on that montonic relation assign ordinal values to top 10 features and map those feature on train and test datasets 
  def fit(self,X:pd.DataFrame,y:pd.Series = None):
    X[self.target] = y
    X[self.target] = X[self.target].fillna(X[self.target].median())
       
    if X[self.target].dtypes != float:
      X[self.target] = X[self.target].str.replace(',','').astype(float) 

    if isinstance(self.variables,list):
      self.listed_in_ranks = list()
      for index in range(0,len(self.variables)):
        if len(df[self.variables[index]].unique()) > 10:
          # Get unique top 10 appering categories
          self.listed_in_ranks.append({f'{self.variables[index]}':list(X.groupby(self.variables[index])[self.target].mean().sort_values(ascending=False)[:self.tol[index]].to_dict().keys())})
        else:
          # Get unique top 10 appering categories
          self.listed_in_ranks.append({f'{self.variables[index]}':list(X.groupby(self.variables[index])[self.target].mean().sort_values(ascending=False).to_dict().keys())})
    
    if isinstance(self.variables,str):
      self.listed_in_ranks = list()
      if len(df[self.variables].unique()) > 10:
          # Get unique top 10 appering categories
          self.listed_in_ranks = list(X.groupby(self.variables)[self.target].mean().sort_values(ascending=False)[:self.tol].to_dict().keys())
      else:
          # Get unique top 10 appering categories
          self.listed_in_ranks = list(X.groupby(self.variables)[self.target].mean().sort_values(ascending=False).to_dict().keys())   

    return self

  def transform(self,X):

    if isinstance(self.variables,list):
      self.listed_in_dict = list()
      list_in_dict = dict()
      for index in range(0,len(self.variables)):
        # replacing non top categories  
        X[self.variables[index]] = X[self.variables[index]].apply(lambda value:value if value in self.listed_in_ranks[index][self.variables[index]] else 'Rare')   
              
        self.listed_in_ranks[index][self.variables[index]] = self.listed_in_ranks[index][self.variables[index]]+ ['Rare']
        # # Creating dictionary for mapping categories 
        for indx in range(0,len(self.listed_in_ranks[index][self.variables[index]])):
           list_in_dict[self.listed_in_ranks[index][self.variables[index]][indx]] = indx
        self.listed_in_dict.append({self.variables[index]:list_in_dict})

        list_in_dict = dict()
        # # replacing categories
        X[self.variables[index]] = X[self.variables[index]].map(self.listed_in_dict[index][self.variables[index]]) 
    
    if isinstance(self.variables,str):
      self.listed_in_dict = dict()
      # replacing non top categories  
      X[self.variables] = X[self.variables].apply(lambda value:value if value in self.listed_in_ranks else 'Rare')   
            
      self.listed_in_ranks = self.listed_in_ranks + ['Rare']
      # # Creating dictionary for mapping categories 
      for index in range(0,len(self.listed_in_ranks)):
        self.listed_in_dict[self.listed_in_ranks[index]] = index

      # # replacing categories
      X[self.variables] = X[self.variables].map(self.listed_in_dict) 

    return X

In [69]:
class featureSelection(BaseEstimator, TransformerMixin):
  def __init__(self,alpha,random_state):
    self.selected_feats = None
    self.__alpha = alpha
    self.__random_state = random_state 
    self.sel_ = None

  def fit(self,X,y):
   self.sel_ = SelectFromModel(Lasso(alpha=self.__alpha, random_state=self.__random_state))
   self.sel_.fit(X, y)  
   return self

  def transform(self,X): 
   self.selected_feats = X_train.columns[(self.sel_.get_support())]
   return X[self.selected_feats]

In [71]:
class ZomatoModelTrain(handleMixLabels,handleRankingLabels,featureSelection):

  def __init__(self,df):
    self.__df = df
    self.__target = 'approx_cost(for two people)'
    self.__df[self.__target] = self.__df[self.__target].str.replace(',','').astype(float)
    self.__random_state = 100
    self.__alpha = 0.001
    self.__test_size = 0.33
    self.__handleRankingLabels = handleRankingLabels
    self.__handleMixLabels = handleMixLabels
    self.__featureSelection = featureSelection
    self.__variable_to_drop = ['url','address','phone','reviews_list','name','dish_liked','menu_item'] + [self.__target]
    self.__AddMissingIndicatorVariables = ['votes','rate']
    self.__MeanMedianImputerVarables = ['votes']
    self.__CategoricalImputerModeVarables = ['rate','cuisines','rest_type']
    self.__LogTransformerVarables = ['votes']
    self.__OrdinalEncoderVariables = ['rate']
    self.__OneHotEncoderVariables = ['online_order','book_table']
    self.__handleRankingLabels_var = {'variables':['listed_in(type)','listed_in(city)','location'],'tolerance':[10,15,15]}
    self.__handleMixLabels_var ={'variables':['rest_type','cuisines'],'tolerance':[10,15]}


  def applyModelTrain(self):
    X_train,X_test,y_train,y_test = self.__dataSpliter()
    X_train,X_test = self.__dataCleanar(X_train,X_test)  
    return X_train,X_test,y_train.fillna(y_train.median()),y_test.fillna(y_test.median())

  # Splits the data into train and test set
  def __dataSpliter(self):
    X_train, X_test, y_train, y_test = train_test_split(df.drop(self.__variable_to_drop,axis=1), df[self.__target], test_size=self.__test_size, random_state=42)
    X_train,X_test,y_train,y_test = X_train.reset_index().drop('index',axis=1),X_test.reset_index().drop('index',axis=1),y_train.reset_index().drop('index',axis=1),y_test.reset_index().drop('index',axis=1)
    X_train,X_test = self.__dataCleanar(X_train,X_test)
    return X_train,X_test,y_train,y_test

  # This method andle noisy data from rate and votes columns
  def __dataCleanar(self,X_train,X_test):
    # replacing '-' with nan in rate variable
    X_train['rate'] = X_train['rate'].replace('-',np.nan)  
    X_test['rate'] = X_test['rate'].replace('-',np.nan)

    # replacing '0' with nan in votes variable
    X_train['votes'] = X_train['votes'].replace(0,np.nan)
    X_test['votes'] = X_test['votes'].replace(0,np.nan)
    
    # replacing '/5' with ' in rates variable
    X_train['rate'] = X_train['rate'].apply(lambda value:str(value).replace('/5',''))
    X_test['rate'] = X_test['rate'].apply(lambda value:str(value).replace('/5',''))
    
    X_train['rate'] = X_train['rate'].apply(lambda value: 0 if value == 'NEW' else value).astype(float)
    X_test['rate'] = X_test['rate'].apply(lambda value: 0 if value == 'NEW' else value).astype(float)

    return X_train,X_test

  # This method is for rate column where based on certain ranges we made this column to categorical oridinal variable
  def handleRating(self,X_train,X_test,feature):
    return np.where(X_train[feature]==np.nan,np.nan,np.where(X_train[feature]==0,'New',np.where(X_train[feature]<2.5,'Poor',np.where((X_train[feature]>2.5) | (X_train[feature]<3.5),'Average','Good')))),np.where(X_test[feature]==np.nan,np.nan,np.where(X_test[feature]==0,'New',np.where(X_test[feature]<2.5,'Poor',np.where((X_test[feature]>2.5) | (X_test[feature]<3.5),'Average','Good')))) 

  def featurePipeline(self):
    pipe = Pipeline([
      #  Missing indicator
      ('Add missing indicator',AddMissingIndicator(
          variables=self.__AddMissingIndicatorVariables)),
    
      #   Median Missing Imputation
      ('Median Missing Imputation',MeanMedianImputer(
          imputation_method='median', variables=self.__MeanMedianImputerVarables)),

      #   Mode Missing Imputation
      ('Mode Missing Imputation',CategoricalImputer(
          imputation_method='frequent', variables=self.__CategoricalImputerModeVarables)),        

      # Feature Transformation
      ('LogTransformer',LogTransformer(
          variables=self.__LogTransformerVarables)),

      #  Ordinal Encoder
      ('OrdinalEncoder',OrdinalEncoder(
          encoding_method='ordered',variables=self.__OrdinalEncoderVariables)),

      #  OneHotEncoder
      ('OneHotEncoder',OneHotEncoder(
          drop_last=True,variables=self.__OneHotEncoderVariables)),

      # handleRankingLabels
      ('handleRankingLabels listed_in(type)',self.__handleRankingLabels(
          variables=self.__handleRankingLabels_var['variables'],tol=self.__handleRankingLabels_var['tolerance'],target=self.__target)),        

      #  handleMixLabels Imputation
      ('handleMixLabels Imputation cuisines', self.__handleMixLabels(
          variables=self.__handleMixLabels_var['variables'],tol=self.__handleMixLabels_var['tolerance'],target=self.__target)), 

      #  feature selection
      ('feature selection',featureSelection(alpha=self.__alpha, random_state=self.__random_state)),

      # # Model
      # ('Random Forest Model',RandomForestRegressor(bootstrap= True,max_depth= 10,min_samples_leaf= 2,min_samples_split= 2,n_estimators= 100,oob_score= True))
    ])

    return pipe


In [72]:
df = pd.read_csv('/content/gdrive/MyDrive/zomato.csv')

In [73]:
zmt = ZomatoModelTrain(df)

In [74]:
X_train,X_test,y_train,y_test = zmt.applyModelTrain()

In [75]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34650 entries, 0 to 34649
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   online_order     34650 non-null  object 
 1   book_table       34650 non-null  object 
 2   rate             29422 non-null  float64
 3   votes            27944 non-null  float64
 4   location         34638 non-null  object 
 5   rest_type        34514 non-null  object 
 6   cuisines         34621 non-null  object 
 7   listed_in(type)  34650 non-null  object 
 8   listed_in(city)  34650 non-null  object 
dtypes: float64(2), object(7)
memory usage: 2.4+ MB


In [76]:
X_train['rate'],X_test['rate'] = zmt.handleRating(X_train,X_test,'rate')

In [77]:
X_train['rate'].unique(),X_test['rate'].unique()

(array(['Good', 'Average', 'New', 'Poor'], dtype=object),
 array(['Average', 'New', 'Good', 'Poor'], dtype=object))

In [78]:
pipe = zmt.featurePipeline()

In [79]:
pipe = pipe.fit(X_train,y_train)

In [None]:
X_train = pipe.transform(X_train)

In [None]:
X_train.sample(5)