<a href="https://colab.research.google.com/github/SarkarPriyanshu/Machine-Learning-Models/blob/main/Zomato_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install feature_engine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting feature_engine
  Downloading feature_engine-1.5.2-py2.py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.0/290.0 KB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature_engine
Successfully installed feature_engine-1.5.2


In [2]:
import ast
import re
import pandas as pd
import numpy as np
from google.colab import drive

import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
 
from sklearn.model_selection import train_test_split 
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer,AddMissingIndicator,CategoricalImputer
from feature_engine.transformation import LogTransformer
from feature_engine.encoding import OrdinalEncoder,OneHotEncoder

from sklearn.base import BaseEstimator, TransformerMixin

# to build the models
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import warnings
warnings.filterwarnings('ignore')

In [3]:
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [72]:
class handleMixLabels(BaseEstimator, TransformerMixin):
  def __init__(self,variables,target,limit=10):
    self.variables = variables
    self.keys = list()
    self.values = list()
    self.limit = limit
    self.target = target
    self.all_unique_feature_type = list()
    self.top_unique_feature_type_ = list()
    self.unique_rare_feature_type_ = list()
    pass
  
  def fit(self, X: pd.DataFrame,y:pd.Series):
        X = X.dropna().copy()
        X[self.target] = y
        X[self.target] = X[self.target].fillna(X[self.target].median())
       
        if X[self.target].dtypes != float:
          X[self.target] = X[self.target].str.replace(',','').astype(float)
        
        for key,value in X.groupby(self.variables)[self.target].mean().sort_values(ascending=False).items():
                  self.keys.append(key)
                  self.values.append(value)

        for value in X[self.variables]:
          if ',' in value:
            for item in value.split(','):
              if item.strip() not in self.all_unique_feature_type:
                self.all_unique_feature_type.append(item.strip())
          else:
            if value.strip() not in self.all_unique_feature_type:
              self.all_unique_feature_type.append(value.strip())  
        
        for value in self.keys[:self.limit]:
          if isinstance(value,tuple):
            for item in value:
              if item.strip() not in self.top_unique_feature_type_:
                  self.top_unique_feature_type_.append(item.strip())
          elif ',' in value:
              for item in value.split(','):
                if item.strip() not in self.top_unique_feature_type_:
                  self.top_unique_feature_type_.append(item.strip())
          else:
            if value.strip() not in self.top_unique_feature_type_:
              self.top_unique_feature_type_.append(value.strip())

        self.top_unique_feature_type_ = self.top_unique_feature_type_ + ['Rare']
        self.unique_rare_feature_type_ = [value for value in self.all_unique_feature_type if value not in self.top_unique_feature_type_]

        return self

  def transform(self, X: pd.DataFrame):
        X = X.copy()
        
        # adding new columns of unique labels in datasets
        for value in self.top_unique_feature_type_:
          X[f'{self.variables}_{value}'] = np.zeros(X.shape[0])

        # Adding 1 and 0's to those newly added columns
        for value in self.top_unique_feature_type_:
          for index in range(0,X.shape[0]):
            if  ',' in  X[self.variables][index]:
               for item in X[self.variables][index].split(','):
                 if item not in self.unique_rare_feature_type_:
                    X[f'{self.variables}_{value}'][index] = 1
                 if item.strip() in self.unique_rare_feature_type_:
                   X[f'{self.variables}_Rare'][index] = 1
            else:
              if value == X[self.variables][index]:
                    X[f'{self.variables}_{value}'][index] = 1
              if X[self.variables][index] in self.unique_rare_feature_type_:
                   X[f'{self.variables}_Rare'][index] = 1      

        return X

In [73]:
class ZomatoModelTrain(handleMixLabels):

  def __init__(self,df):
    self.__df = df
    self.__target = 'approx_cost(for two people)'
    self.__df[self.__target] = self.__df[self.__target].str.replace(',','').astype(float)
    self.__random_state = 100
    self.__alpha = 0.001
    self.__test_size = 0.33
    self.__handleMixLabels = handleMixLabels
    self.__variable_to_drop = ['url','address','phone','reviews_list','name','dish_liked','menu_item'] + [self.__target]
    self.__AddMissingIndicatorVariables = ['votes','rate','location']
    self.__MeanMedianImputerVarables = ['votes','location']
    self.__CategoricalImputerModeVarables = ['rate','cuisines','rest_type']
    self.__LogTransformerVarables = ['votes']
    self.__OrdinalEncoderVariables = ['rate','location','listed_in(type)','listed_in(city)']
    self.__OneHotEncoderVariables = ['online_order','book_table','rest_type','cuisines']

  def applyModelTrain(self):
    X_train,X_test,y_train,y_test = self.__dataSpliter()

    X_train,X_test = self.__dataCleanar(X_train,X_test)  
    
    return X_train,X_test,y_train.fillna(y_train.median()),y_test.fillna(y_test.median())

  # Splits the data into train and test set
  def __dataSpliter(self):
    X_train, X_test, y_train, y_test = train_test_split(df.drop(self.__variable_to_drop,axis=1), df[self.__target], test_size=self.__test_size, random_state=42)
    X_train,X_test,y_train,y_test = X_train.reset_index().drop('index',axis=1),X_test.reset_index().drop('index',axis=1),y_train.reset_index().drop('index',axis=1),y_test.reset_index().drop('index',axis=1)
    X_train,X_test = self.__dataCleanar(X_train,X_test)
    return X_train,X_test,y_train,y_test

  # This method andle noisy data from rate and votes columns
  def __dataCleanar(self,X_train,X_test):
    # replacing '-' with nan in rate variable
    X_train['rate'] = X_train['rate'].replace('-',np.nan)  
    X_test['rate'] = X_test['rate'].replace('-',np.nan)

    # replacing '0' with nan in votes variable
    X_train['votes'] = X_train['votes'].replace(0,np.nan)
    X_test['votes'] = X_test['votes'].replace(0,np.nan)
    
    # replacing '/5' with ' in rates variable
    X_train['rate'] = X_train['rate'].apply(lambda value:str(value).replace('/5',''))
    X_test['rate'] = X_test['rate'].apply(lambda value:str(value).replace('/5',''))
    
    X_train['rate'] = X_train['rate'].apply(lambda value: 0 if value == 'NEW' else value).astype(float)
    X_test['rate'] = X_test['rate'].apply(lambda value: 0 if value == 'NEW' else value).astype(float)

    return X_train,X_test

  # This method is for rate column where based on certain ranges we made this column to categorical oridinal variable
  def handleRating(self,X_train,X_test,feature):
    return np.where(X_train[feature]==np.nan,np.nan,np.where(X_train[feature]==0,'New',np.where(X_train[feature]<2.5,'Poor',np.where((X_train[feature]>2.5) | (X_train[feature]<3.5),'Average','Good')))),np.where(X_test[feature]==np.nan,np.nan,np.where(X_test[feature]==0,'New',np.where(X_test[feature]<2.5,'Poor',np.where((X_test[feature]>2.5) | (X_test[feature]<3.5),'Average','Good')))) 


  # This method checks relation with target columns based on that montonic relation assign ordinal values to top 10 features and map those feature on train and test datasets 
  def handleRanking(self,X_train,X_test,feature,tol=10):
    if len(df[feature].unique()) > 15:
      # Get unique top 10 appering categories
      listed_in_ranks = list(self.__df.groupby(feature)[self.__target].mean().sort_values(ascending=False)[:tol].to_dict().keys())

      # replacing non top categories  
      X_train[feature] = X_train[feature].apply(lambda value:value if value in listed_in_ranks else 'Rare')
      X_test[feature] = X_test[feature].apply(lambda value:value if value in listed_in_ranks else 'Rare')   
      
      listed_in_ranks = listed_in_ranks + ['Rare']

      # Creating dictionary for mapping categories 
      listed_in_dict = dict()
      for index in range(0,len(listed_in_ranks)):
        listed_in_dict[listed_in_ranks[index]] = index

      # replacing categories
      X_train[feature] = X_train[feature].map(listed_in_dict)
      X_test[feature] = X_test[feature].map(listed_in_dict)

    else:
      # Get unique top 10 appering categories
      listed_in_ranks = list(self.__df.groupby(feature)[self.__target].mean().sort_values(ascending=False)[:tol].to_dict().keys())
      
      # Creating dictionary for mapping categories)
      listed_in_dict = dict()
      for index in range(0,len(listed_in_ranks)):
        listed_in_dict[listed_in_ranks[index]] = index

      # replacing categories
      X_train[feature] = X_train[feature].map(listed_in_dict)
      X_test[feature] = X_test[feature].map(listed_in_dict)  

    return X_train,X_test     


  def featurePipeline(self):

    pipe = Pipeline([

      #   Missing indicator
      ('Add missing indicator',AddMissingIndicator(
          variables=self.__AddMissingIndicatorVariables)),
    
      #   Median Missing Imputation
      ('Median Missing Imputation',MeanMedianImputer(
          imputation_method='median', variables=self.__MeanMedianImputerVarables)),

      #   Mode Missing Imputation
      ('Mode Missing Imputation',CategoricalImputer(
          imputation_method='frequent', variables=self.__CategoricalImputerModeVarables)),

      #  handleMixLabels Imputation
      ('handleMixLabels Imputation rest_type', self.__handleMixLabels(
          variables='rest_type', target=self.__target, limit=15)),   

      #  handleMixLabels Imputation
      ('handleMixLabels Imputation cuisines', self.__handleMixLabels(
          variables='cuisines', target=self.__target, limit=15)),         

      # Feature Transformation
      ('LogTransformer',LogTransformer(
          variables=self.__LogTransformerVarables)),

      #  Ordinal Encoder
      ('OrdinalEncoder',OrdinalEncoder(
          encoding_method='ordered',variables=self.__OrdinalEncoderVariables,unseen='ignore')),

      #  OneHotEncoder
      ('OneHotEncoder',OneHotEncoder(
          drop_last=True,variables=self.__OneHotEncoderVariables)),

      # #  feature selection
      # ('feature selection',SelectFromModel(
      #     Lasso(alpha=self.__alpha, random_state=self.__random_state)))
    ])

    return pipe


In [74]:
df = pd.read_csv('/content/gdrive/MyDrive/zomato.csv')

In [75]:
zmt = ZomatoModelTrain(df)

In [76]:
X_train,X_test,y_train,y_test = zmt.applyModelTrain()

In [77]:
X_train['rate'],X_test['rate'] = zmt.handleRating(X_train,X_test,'rate')

In [78]:
X_train['rate'].unique(),X_test['rate'].unique()

(array(['Good', 'Average', 'New', 'Poor'], dtype=object),
 array(['Average', 'New', 'Good', 'Poor'], dtype=object))

In [79]:
X_train,X_test = zmt.handleRanking(X_train,X_test,'listed_in(type)')

In [80]:
X_train,X_test = zmt.handleRanking(X_train,X_test,'listed_in(city)')

In [81]:
X_train,X_test = zmt.handleRanking(X_train,X_test,'location')

In [82]:
pipe = zmt.featurePipeline()

In [83]:
pipe

Pipeline(steps=[('Add missing indicator',
                 AddMissingIndicator(variables=['votes', 'rate', 'location'])),
                ('Median Missing Imputation',
                 MeanMedianImputer(variables=['votes', 'location'])),
                ('Mode Missing Imputation',
                 CategoricalImputer(imputation_method='frequent',
                                    variables=['rate', 'cuisines',
                                               'rest_type'])),
                ('handleMixLabels Imputation rest_type',
                 handleMixLabels(li...
                 handleMixLabels(limit=15, target='approx_cost(for two people)',
                                 variables='cuisines')),
                ('LogTransformer', LogTransformer(variables=['votes'])),
                ('OrdinalEncoder',
                 OrdinalEncoder(variables=['rate', 'location',
                                           'listed_in(type)',
                                           'listed_in(c

In [84]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34650 entries, 0 to 34649
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   online_order     34650 non-null  object 
 1   book_table       34650 non-null  object 
 2   rate             34650 non-null  object 
 3   votes            27944 non-null  float64
 4   location         34650 non-null  int64  
 5   rest_type        34514 non-null  object 
 6   cuisines         34621 non-null  object 
 7   listed_in(type)  34650 non-null  int64  
 8   listed_in(city)  34650 non-null  int64  
dtypes: float64(1), int64(3), object(5)
memory usage: 2.4+ MB


In [85]:
X_train.shape,y_train.shape

((34650, 9), (34650, 1))

In [None]:
pipe.fit(X_train,y_train)

In [18]:
pipe.transform

<function sklearn.pipeline.Pipeline.transform(self, X)>

In [19]:
import threading

In [20]:
print(threading.current_thread().is_alive())

True
