Import all the necessary libraries

In [33]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from matplotlib import pyplot as plt
import statistics

Import the dataset

In [37]:
dataset = pd.read_csv("restaurant_data.csv")
X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1].values

Check and account for missing variables

In [38]:
print(dataset.isnull().sum())
np.set_printoptions(threshold=np.inf)

Name                      0
Location                  0
Cuisine                   0
Rating                    0
Seating Capacity          0
Average Meal Price        0
Marketing Budget          0
Social Media Followers    0
Chef Experience Years     0
Number of Reviews         0
Avg Review Length         0
Ambience Score            0
Service Quality Score     0
Parking Availability      0
Weekend Reservations      0
Weekday Reservations      0
Revenue                   0
dtype: int64


Encode Categorial Data

In [39]:
column_transformer = ColumnTransformer(transformers = [("encoder", OneHotEncoder(), [0, 1, 12])], remainder = "passthrough")
X = np.array(column_transformer.fit_transform(X))

Split the dataset into testing and training sets

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Apply feature scaling to the dataset

In [41]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

Initialize and train the linear regressor

In [42]:
linear_regressor = LinearRegression()
linear_regressor.fit(X_train, y_train)

Initialize and train the support vector regressor

In [44]:
svr = SVR(kernel="rbf")
svr.fit(X_train, y_train)

Initialize and train the decision tree regressor

In [45]:
dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)

Initialize and train the random forest regressor

In [51]:
rfr = RandomForestRegressor(n_estimators=10000, verbose=1)
rfr.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 10000 out of 10000 | elapsed:  8.1min finished


Evaluation of all models by calculation of r^2 scores and then selection of the best model

In [49]:
max_r2 = 0
model_type = ""
best_pred = ()
predictions = (
    linear_regressor.predict(X_test),
    svr.predict(X_test),
    dtr.predict(X_test),
    rfr.predict(X_test)
)

vals = {
    1:"linear regressor",
    2:"svr",
    3:"dtr",
    4:"rfr"
}
r2_scores = []

for i in range(len(predictions)):
    r2 = r2_score(y_test, predictions[i])
    r2_scores.append(r2)
    if r2>max_r2:
        max_r2 = r2
        model_type = vals[i+1]
        best_pred = predictions[i]

np.set_printoptions(precision = 2)
val_comparison = np.concatenate((best_pred.reshape(len(best_pred), 1), y_test.reshape(len(y_test), 1)), 1)

errors = []
for i in range(len(val_comparison)):
    pos = val_comparison[i]
    e = (abs(pos[0]-pos[1])/y_test[i]) * 100
    errors.append(round(e, 2))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.4s finished


In [50]:
print(f"The best model is {model_type} its r2 value is {max_r2} and its average error is {round(statistics.mean(errors), 2)}%")

The best model is rfr its r2 value is 0.9991882676477493 and its average error is 0.99%
