<a href="https://colab.research.google.com/github/SarkarPriyanshu/Machine-Learning-Models/blob/main/Zomato_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install feature_engine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting feature_engine
  Downloading feature_engine-1.5.2-py2.py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.0/290.0 KB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature_engine
Successfully installed feature_engine-1.5.2


In [2]:
import ast
import re
import pandas as pd
import numpy as np
from google.colab import drive

import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
 
from sklearn.model_selection import train_test_split 
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer,AddMissingIndicator,CategoricalImputer
from feature_engine.transformation import LogTransformer
from feature_engine.encoding import OrdinalEncoder,OneHotEncoder

# to build the models
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import warnings
warnings.filterwarnings('ignore')

In [3]:
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [202]:
df = pd.read_csv('/content/gdrive/MyDrive/zomato.csv')

In [17]:
class handleMixLabels:
  def __init__(self):
    self.keys = list()
    self.values = list()
    pass

  # Step 1 groupby category in mix labels
  def groupbyCategory(self,feature,target,test):
    for key,value in test.groupby(feature)[target].mean().sort_values(ascending=False).items():
      self.keys.append(key)
      self.values.append(value)

  # step 2 get all unique labels from feature column
  def getalluniquelabels(self,test,feature):
    unique_rest_type = list()
    for value in test[feature]:
      if ',' in value:
        for item in value.split(','):
          if item.strip() not in unique_rest_type:
            unique_rest_type.append(item.strip())
      else:
        if value.strip() not in unique_rest_type:
          unique_rest_type.append(value.strip())  
    return unique_rest_type

    # top unique labels from feature columns
  def gettopuniquelabels(self,limit):
    unique_rest_type_ = list()
    for value in self.keys[:limit]:
      if ',' in value:
          for item in value.split(','):
            if item.strip() not in unique_rest_type_:
              unique_rest_type_.append(item.strip())
      else:
        if value.strip() not in unique_rest_type_:
          unique_rest_type_.append(value.strip())
    return unique_rest_type_

  # get rare labels from feature columns
  def getrarelabels(self,feature,target,test,X_train,X_test,limit):
    self.groupbyCategory(feature,target,test)
    unique_rest_type_rare = [value for value in self.getalluniquelabels(test,feature) if value not in self.gettopuniquelabels(limit)]
    unique_rest_type_ = self.gettopuniquelabels(limit) + ['Rare']

    # adding new columns of unique labels in datasets
    for value in unique_rest_type_:
      X_train[f'{feature}_{value}'] = np.zeros(X_train.shape[0])
      X_test[f'{feature}_{value}'] = np.zeros(X_test.shape[0])
            
      X_train[f'{feature}_{value}'].astype(int) 
      X_test[f'{feature}_{value}'].astype(int)   

    # Adding 1 and 0's to those newly added columns
    for value in unique_rest_type_:
      for index in range(0,X_test.shape[0]):
        if value in X_test[feature][index]:
          X_test[f'{feature}_{value}'][index] = 1

        if  ',' in  X_test[feature][index]:
          for item in X_test[feature][index].split(','):
            if item.strip() in unique_rest_type_rare:
              X_test[f'{feature}_Rare'][index] = 1
              

    for value in unique_rest_type_:
      for index in range(0,X_train.shape[0]):
        if value in X_train[feature][index]:
          X_train[f'{feature}_{value}'][index] = 1

        if  ',' in  X_train[feature][index]:
          for item in X_train[feature][index].split(','):
            if item.strip() in unique_rest_type_rare:
              X_train[f'{feature}_Rare'][index] = 1             
    
    X_train = X_train.drop(feature,axis=1)
    X_test = X_test.drop(feature,axis=1)  
    
    return X_test,X_train  


In [190]:
class ZomatoModelTrain(handleMixLabels):

  def __init__(self,df):
    self.__df = df
    self.__target = 'approx_cost(for two people)'
    self.__random_state = 100
    self.__alpha = 0.001
    self.__test_size = 0.33

    self.__variable_to_drop = ['url','address','phone','reviews_list','name','dish_liked','menu_item'] + [self.__target]
    self.__handleMixLabels = handleMixLabels()
    self.__AddMissingIndicatorVariables = ['votes','rate']
    self.__MeanMedianImputerVarables = ['votes']
    self.__CategoricalImputerModeVarables = ['rate','location','cuisines','rest_type']
    self.__LogTransformerVarables = ['votes']
    self.__OrdinalEncoderVariables = ['rate','location','listed_in(type)','listed_in(city)']
    self.__OneHotEncoderVariables = ['online_order','book_table','rest_type','cuisines']

  def applyModelTrain(self):
    X_train,X_test,y_train,y_test = self.__dataSpliter()
    y_train = y_train.astype(float)
    y_test = y_train.astype(float)

    X_train,X_test = self.__dataCleanar(X_train,X_test)  
    return X_train,X_test,y_train.fillna(y_train.median()),y_test.fillna(y_test.median())

  def __dataSpliter(self):
    X_train, X_test, y_train, y_test = train_test_split(df.drop(self.__variable_to_drop,axis=1), df[self.__target], test_size=self.__test_size, random_state=42)
    X_train,X_test,y_train,y_test = X_train.reset_index().drop('index',axis=1),X_test.reset_index().drop('index',axis=1),y_train.str.replace(',',''),y_test.str.replace(',','')
    X_train,X_test = self.__dataCleanar(X_train,X_test)
    return X_train,X_test,y_train,y_test

  def __dataCleanar(self,X_train,X_test):
    # replacing '-' with nan in rate variable
    X_train['rate'] = X_train['rate'].replace('-',np.nan)  
    X_test['rate'] = X_test['rate'].replace('-',np.nan)

    # replacing '0' with nan in votes variable
    X_train['votes'] = X_train['votes'].replace(0,np.nan)
    X_test['votes'] = X_test['votes'].replace(0,np.nan)
    
    # replacing '/5' with ' in rates variable
    X_train['rate'] = X_train['rate'].apply(lambda value:str(value).replace('/5',''))
    X_test['rate'] = X_test['rate'].apply(lambda value:str(value).replace('/5',''))
    
    X_train['rate'] = X_train['rate'].apply(lambda value: 0 if value == 'NEW' else value).astype(float)
    X_test['rate'] = X_test['rate'].apply(lambda value: 0 if value == 'NEW' else value).astype(float)

    # X_train['rate'] = np.where(X_train['rate']==np.nan,np.nan,np.where(X_train['rate']==0,'New',np.where(X_train['rate']<2.5,'Poor',np.where((X_train['rate']>2.5) | (X_train['rate']<3.5),'Average','Good'))))
    # X_test['rate'] = np.where(X_test['rate']==np.nan,np.nan,np.where(X_test['rate']==0,'New',np.where(X_test['rate']<2.5,'Poor',np.where((X_test['rate']>2.5) | (X_test['rate']<3.5),'Average','Good')))) 

    return X_train,X_test


  def __handleRanking(feature,target,df,X_train,X_test):
    if len(df[feature].unique()) > 15:
      # Get unique top 10 appering categories
      listed_in_ranks = list(df.groupby(feature)[target].mean().sort_values(ascending=False)[:10].to_dict().keys())

      # replacing non top categories  
      X_train[feature] = X_train[feature].apply(lambda value:value if value in listed_in_ranks else 'Rare')
      X_test[feature] = X_test[feature].apply(lambda value:value if value in listed_in_ranks else 'Rare')   
      
      listed_in_ranks = listed_in_ranks + ['Rare']

      # Creating dictionary for mapping categories 
      listed_in_dict = dict()
      for index in range(0,len(listed_in_ranks)):
        listed_in_dict[listed_in_ranks[index]] = index

      # replacing categories
      X_train[feature] = X_train[feature].map(listed_in_dict)
      X_test[feature] = X_test[feature].map(listed_in_dict)

    else:
        # Get unique top 10 appering categories
      listed_in_ranks = list(df.groupby(feature)[target].mean().sort_values(ascending=False)[:10].to_dict().keys())

      # Creating dictionary for mapping categories 
      listed_in_dict = dict()
      for index in range(0,len(listed_in_ranks)):
        listed_in_dict[listed_in_ranks[index]] = index

      # replacing categories
      X_train[feature] = X_train[feature].map(listed_in_dict)
      X_test[feature] = X_test[feature].map(listed_in_dict)  

    return X_train,X_test     


  def __featurePipeline(self):
    return Pipeline([
      
      # Missing indicator
      ('Add missing indicator',AddMissingIndicator(variables=self.__AddMissingIndicatorVarables)),

      # Median Missing Imputation
      ('Median Missing Imputation',MeanMedianImputer(imputation_method='median', variables=self.__MeanMedianImputerVarables)), 

      # Mode Missing Imputation
      ('Mode Missing Imputation',CategoricalImputer(imputation_method='frequent', variables=self.__CategoricalImputerModeVarables))  

      # Feature Transformation
      ('LogTransformer',LogTransformer(variables=self.__LogTransformerVarables)),

      # Ordinal Encoder
      ('OrdinalEncoder',OrdinalEncoder(encoding_method='ordered',variables=self.__OrdinalEncoderVariables,unseen='ignore')),

      # OneHotEncoder
      ('OneHotEncoder',OneHotEncoder(drop_last=True,variables=self.__OneHotEncoderVariables)),

      # # feature selection
      # ('feature selection',SelectFromModel(Lasso(alpha=self.__alpha, random_state=self.__random_state)))

    ])






In [197]:
zmt = ZomatoModelTrain(df)

In [203]:
X_train,X_test,y_train,y_test = zmt.applyModelTrain()

In [204]:
X_train.sample(5)

Unnamed: 0,online_order,book_table,rate,votes,location,rest_type,cuisines,listed_in(type),listed_in(city)
15717,Yes,No,3.7,66.0,Sarjapur Road,Quick Bites,North Indian,Delivery,Sarjapur Road
19917,Yes,No,4.1,291.0,Koramangala 6th Block,Quick Bites,"Burger, Pizza, Momos",Delivery,MG Road
30125,Yes,No,3.9,332.0,Koramangala 1st Block,Casual Dining,"Chinese, Thai",Delivery,Koramangala 6th Block
21518,Yes,Yes,4.1,587.0,JP Nagar,"Casual Dining, Bar","Continental, Finger Food, North Indian, Chinese",Pubs and bars,BTM
2940,Yes,No,3.3,4.0,BTM,Beverage Shop,Beverages,Delivery,Koramangala 4th Block


In [205]:
X_train['rate'] = np.where(X_train['rate']==np.nan,np.nan,np.where(X_train['rate']==0,'New',np.where(X_train['rate']<2.5,'Poor',np.where((X_train['rate']>2.5) | (X_train['rate']<3.5),'Average','Good'))))

In [206]:
X_train['rate'].value_counts()

Average    27800
Good        5228
New         1494
Poor         128
Name: rate, dtype: int64

In [207]:
X_train.sample(5)

Unnamed: 0,online_order,book_table,rate,votes,location,rest_type,cuisines,listed_in(type),listed_in(city)
9721,Yes,No,Average,377.0,BTM,Quick Bites,"North Indian, Kerala",Delivery,Koramangala 6th Block
2639,Yes,No,Average,10.0,Indiranagar,Quick Bites,"North Indian, Chinese",Delivery,Indiranagar
29225,No,Yes,Average,1520.0,Koramangala 5th Block,Casual Dining,"Mughlai, Biryani",Dine-out,BTM
32822,Yes,No,Average,39.0,Whitefield,Quick Bites,"North Indian, Street Food",Dine-out,Brookefield
11733,No,No,Average,199.0,Brookefield,Casual Dining,"Biryani, North Indian, Chinese",Dine-out,Brookefield
