<center>
    <big><b>Machine Learning Models</b></big>
</center>

After **exploring** and cleaning the data, it is now time to build the machine learning models.

# Imports

In [1]:
# data wrangling
import pandas as pd
import numpy as np

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import  ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score

# pickle
import pickle

# warnings
# NOTE: with no category or module filter, this suppresses every warning
# raised for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# Get Data

In [2]:
# Load the dataset stored in 'zomato_cleaned.parquet'; the path is relative
# to the current working directory.
df = pd.read_parquet('zomato_cleaned.parquet')

# Preprocessing

In [3]:
# Preview the first rows before any encoding is applied.
df.head()

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,dish_liked,cuisines,cost,reviews_list,menu_item,type,city
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1,775,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1,787,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8,918,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7,88,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8,166,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


# Convert the `online_order` categorical variable to integers

In [4]:
# Encode the 'online_order' flag as integers: 'Yes' -> 1, 'No' -> 0.
# A single .loc[mask, column] assignment writes into df itself. The chained
# form df[column][mask] = value can end up writing to a temporary copy
# (SettingWithCopyWarning) and has no effect under pandas copy-on-write.
df.loc[df['online_order'] == 'Yes', 'online_order'] = 1
df.loc[df['online_order'] == 'No', 'online_order'] = 0

In [5]:
# Share of each encoded value; normalize=True returns proportions instead of counts.
df['online_order'].value_counts(normalize = True)

1    0.704491
0    0.295509
Name: online_order, dtype: float64

In [6]:
# Convert the column to a numeric dtype now that it holds 1/0 instead of 'Yes'/'No'.
df['online_order'] = pd.to_numeric(df['online_order'])

# Convert the `book_table` categorical variable to integers

In [7]:
# Encode the 'book_table' flag as integers: 'Yes' -> 1, 'No' -> 0.
# A single .loc[mask, column] assignment writes into df itself. The chained
# form df[column][mask] = value can end up writing to a temporary copy
# (SettingWithCopyWarning) and has no effect under pandas copy-on-write.
df.loc[df['book_table'] == 'Yes', 'book_table'] = 1
df.loc[df['book_table'] == 'No', 'book_table'] = 0

In [8]:
# Convert the column to a numeric dtype now that it holds 1/0 instead of 'Yes'/'No'.
df['book_table'] = pd.to_numeric(df['book_table'])

In [9]:
# Share of each encoded value; normalize=True returns proportions instead of counts.
df['book_table'].value_counts(normalize = True)

0    0.739461
1    0.260539
Name: book_table, dtype: float64

# Encoding categorical variables

In [10]:
# One LabelEncoder instance, reused for every categorical column encoded below.
le = LabelEncoder()

In [11]:
# Replace each text category with an integer code, one column at a time.
# The shared encoder is refitted on every pass, so once the loop finishes
# `le` only holds the classes of the last column ('menu_item').
for column in ('location', 'rest_type', 'cuisines', 'menu_item'):
    df[column] = le.fit_transform(df[column])

In [12]:
# Preview the first rows again to check the encoded columns.
df.head()

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,dish_liked,cuisines,cost,reviews_list,menu_item,type,city
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,1,1,4.1,775,1,20,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...",1386,800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",5047,Buffet,Banashankari
1,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,1,0,4.1,787,1,20,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...",594,800,"[('Rated 4.0', 'RATED\n Had been here for din...",5047,Buffet,Banashankari
2,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,1,0,3.8,918,1,16,"Churros, Cannelloni, Minestrone Soup, Hot Choc...",484,800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",5047,Buffet,Banashankari
3,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,0,0,3.7,88,1,62,Masala Dosa,1587,300,"[('Rated 4.0', ""RATED\n Great food and proper...",5047,Buffet,Banashankari
4,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,0,0,3.8,166,4,20,"Panipuri, Gol Gappe",1406,600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",5047,Buffet,Banashankari


# Column selection

In [13]:
# Columns kept for modelling and for the exported dataset. They are selected
# by name rather than by position so the selection cannot silently shift if
# the column order of the input file changes.
selected_columns = [
    'online_order', 'book_table', 'rate', 'votes', 'location',
    'rest_type', 'cuisines', 'cost', 'menu_item',
]
my_data = df[selected_columns]
# Write the selection to CSV; with the default settings the DataFrame index
# is written as the first, unnamed column.
my_data.to_csv('Zomato_df.csv')

This dataset will be used to build the web application at the end.

In [24]:
# Pairwise correlation between the selected columns, rounded to two decimals.
my_data.corr().round(2)

Unnamed: 0,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,menu_item
online_order,1.0,-0.14,-0.04,-0.08,-0.1,0.05,-0.02,-0.29,-0.34
book_table,-0.14,1.0,0.35,0.33,0.11,-0.22,0.01,0.6,0.09
rate,-0.04,0.35,1.0,0.38,0.11,-0.15,-0.1,0.31,0.01
votes,-0.08,0.33,0.38,1.0,0.05,-0.09,-0.05,0.31,0.03
location,-0.1,0.11,0.11,0.05,1.0,-0.08,-0.0,0.2,0.02
rest_type,0.05,-0.22,-0.15,-0.09,-0.08,1.0,0.26,-0.26,-0.04
cuisines,-0.02,0.01,-0.1,-0.05,-0.0,0.26,1.0,-0.05,0.03
cost,-0.29,0.6,0.31,0.31,0.2,-0.26,-0.05,1.0,0.13
menu_item,-0.34,0.09,0.01,0.03,0.02,-0.04,0.03,0.13,1.0


In [14]:
# Feature matrix: the modelling columns without the target 'rate'. Selected
# by name rather than by position so the features stay correct even if the
# column order of the input file changes.
feature_columns = [
    'online_order', 'book_table', 'votes', 'location',
    'rest_type', 'cuisines', 'cost', 'menu_item',
]
x = df[feature_columns]
x.head()

Unnamed: 0,online_order,book_table,votes,location,rest_type,cuisines,cost,menu_item
0,1,1,775,1,20,1386,800,5047
1,1,0,787,1,20,594,800,5047
2,1,0,918,1,16,484,800,5047
3,0,0,88,1,62,1587,300,5047
4,0,0,166,4,20,1406,600,5047


In [15]:
# Target vector: the 'rate' column that the models below are trained to predict.
y = df['rate']
y

0        4.1
1        4.1
2        3.8
3        3.7
4        3.8
        ... 
23243    3.8
23244    3.9
23245    2.8
23246    2.5
23247    4.3
Name: rate, Length: 23248, dtype: float64

# Train and test datasets

In [16]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical from one run to the next.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=10,
)

# Model Building

## Linear Regression

In [17]:
# Baseline model: create the linear regression and fit it on the training
# split in one step (fit() returns the fitted estimator).
lr_model = LinearRegression().fit(x_train, y_train)
# Predictions for the held-out rows
y_pred = lr_model.predict(x_test)
# R^2 of the predictions on the test split (displayed as the cell output)
r2_score(y_test, y_pred)

0.22818828522967283

## Decision Tree Regressor

In [18]:
# Decision tree regressor. A fixed random_state (same seed as the train/test
# split) makes the fitted tree, and therefore the score, reproducible.
dt_model = DecisionTreeRegressor(random_state = 10)
# fitting model
dt_model.fit(x_train, y_train)
# predicted values for the held-out rows
y_pred = dt_model.predict(x_test)
# r2 score on the test split (displayed as the cell output)
r2_score(y_test, y_pred)

0.8747708235220103

## Random Forest Regressor

In [19]:
# Random forest regressor. A fixed random_state (same seed as the train/test
# split) makes the fitted forest, and therefore the score, reproducible.
RF_model = RandomForestRegressor(random_state = 10)
# fitting model
RF_model.fit(x_train, y_train)
# predicted values for the held-out rows
y_pred = RF_model.predict(x_test)
# r2 score on the test split (displayed as the cell output)
r2_score(y_test, y_pred)

0.9067593669166809

## Extra Trees Regressor

In [20]:
# Extra-trees regressor. A fixed random_state (same seed as the train/test
# split) makes the fitted ensemble, and therefore the score and the model
# that is saved afterwards, reproducible.
ET_model = ExtraTreesRegressor(random_state = 10)
# fitting model
ET_model.fit(x_train, y_train)
# predicted values for the held-out rows
y_pred = ET_model.predict(x_test)
# r2 score on the test split (displayed as the cell output)
r2_score(y_test, y_pred)

0.9324422835463898

The extra-trees regressor achieves the highest R² score of the four models, so it is the one saved with pickle.

In [21]:
# Save the fitted extra-trees model. The context manager closes (and thereby
# flushes) the file before it is reopened for reading below.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(ET_model, model_file)
# Load the model back from disk to confirm the file can be read.
# NOTE: pickle.load executes code embedded in the file; only load pickles
# from a trusted source.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)