In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from scipy.stats import uniform
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the pre-cleaned restaurant data set (path is relative to the notebook's working directory).
dataset=pd.read_csv("clean_restaurant_data.csv")
# Bare trailing expression so the notebook renders the (truncated) frame.
dataset

Unnamed: 0,Rating,Seating Capacity,Average Meal Price,Marketing Budget,Social Media Followers,Chef Experience Years,Number of Reviews,Avg Review Length,Ambience Score,Service Quality Score,...,Weekday Reservations,Revenue,Location_Rural,Location_Suburban,Cuisine_French,Cuisine_Indian,Cuisine_Italian,Cuisine_Japanese,Cuisine_Mexican,Parking Availability_Yes
0,4.0,38,73.98,2224,23406,13,185,161.924906,1.3,7.0,...,4,638945.52,1,0,0,0,0,1,0,1
1,3.2,76,28.11,4416,42741,8,533,148.759717,2.6,3.4,...,6,490207.83,0,0,0,0,0,0,1,1
2,4.7,48,48.29,2796,37285,18,853,56.849189,5.3,6.7,...,14,541368.62,1,0,0,0,1,0,0,0
3,4.4,34,51.55,1167,15214,13,82,205.433265,4.6,2.8,...,17,404556.80,1,0,0,0,1,0,0,1
4,4.9,88,75.98,3639,40171,9,78,241.681584,8.6,2.1,...,26,1491046.35,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8363,3.4,54,34.85,1102,11298,11,380,253.919515,9.5,5.0,...,0,434653.45,0,1,0,1,0,0,0,1
8364,3.7,49,36.88,1988,20432,9,713,175.590195,2.7,2.6,...,21,414977.92,1,0,0,1,0,0,0,0
8365,4.7,88,46.87,5949,63945,6,436,222.953647,4.8,1.7,...,21,930395.87,0,0,0,0,1,0,0,1
8366,3.1,31,44.53,707,7170,1,729,178.482851,6.1,2.1,...,21,311493.48,1,0,0,0,0,0,0,0


In [3]:
# (rows, columns) of the loaded frame.
dataset.shape

(8368, 21)

In [4]:
# One-hot encode categorical columns as 0/1 integers, dropping the first
# level of each category.
# NOTE(review): the frame shown by the loading cell already contains dummy
# columns (Location_*, Cuisine_*, Parking Availability_Yes), so this call
# looks like a no-op on this CSV — confirm before removing it.
dataset = pd.get_dummies(data=dataset, drop_first=True, dtype=int)
dataset

Unnamed: 0,Rating,Seating Capacity,Average Meal Price,Marketing Budget,Social Media Followers,Chef Experience Years,Number of Reviews,Avg Review Length,Ambience Score,Service Quality Score,...,Weekday Reservations,Revenue,Location_Rural,Location_Suburban,Cuisine_French,Cuisine_Indian,Cuisine_Italian,Cuisine_Japanese,Cuisine_Mexican,Parking Availability_Yes
0,4.0,38,73.98,2224,23406,13,185,161.924906,1.3,7.0,...,4,638945.52,1,0,0,0,0,1,0,1
1,3.2,76,28.11,4416,42741,8,533,148.759717,2.6,3.4,...,6,490207.83,0,0,0,0,0,0,1,1
2,4.7,48,48.29,2796,37285,18,853,56.849189,5.3,6.7,...,14,541368.62,1,0,0,0,1,0,0,0
3,4.4,34,51.55,1167,15214,13,82,205.433265,4.6,2.8,...,17,404556.80,1,0,0,0,1,0,0,1
4,4.9,88,75.98,3639,40171,9,78,241.681584,8.6,2.1,...,26,1491046.35,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8363,3.4,54,34.85,1102,11298,11,380,253.919515,9.5,5.0,...,0,434653.45,0,1,0,1,0,0,0,1
8364,3.7,49,36.88,1988,20432,9,713,175.590195,2.7,2.6,...,21,414977.92,1,0,0,1,0,0,0,0
8365,4.7,88,46.87,5949,63945,6,436,222.953647,4.8,1.7,...,21,930395.87,0,0,0,0,1,0,0,1
8366,3.1,31,44.53,707,7170,1,729,178.482851,6.1,2.1,...,21,311493.48,1,0,0,0,0,0,0,0


In [5]:
# Count missing values per column before modelling.
dataset.isnull().sum()

Rating                      0
Seating Capacity            0
Average Meal Price          0
Marketing Budget            0
Social Media Followers      0
Chef Experience Years       0
Number of Reviews           0
Avg Review Length           0
Ambience Score              0
Service Quality Score       0
Weekend Reservations        0
Weekday Reservations        0
Revenue                     0
Location_Rural              0
Location_Suburban           0
Cuisine_French              0
Cuisine_Indian              0
Cuisine_Italian             0
Cuisine_Japanese            0
Cuisine_Mexican             0
Parking Availability_Yes    0
dtype: int64

In [6]:
# Target: the "Revenue" column. Predictors: every other column.
dep_Y = dataset["Revenue"]
indep_X = dataset.drop(columns="Revenue")

In [7]:
# Keep the `n` predictors with the highest univariate F-statistic against
# the target.
# NOTE(review): the selector is fitted on every row, including rows that
# the later train_test_split call holds out for testing, so the held-out
# score is mildly optimistic (feature-selection leakage). Fitting the
# selector on the training rows only would avoid this.
n = 6
test = SelectKBest(score_func=f_regression, k=n)
# fit() returns the fitted selector, so the transform can be chained.
selected_features = test.fit(indep_X, dep_Y).transform(indep_X)

In [8]:
# Boolean mask over the predictor columns marking the ones SelectKBest kept;
# use it to recover their names in original column order.
support_mask = test.get_support()
selected_columns = indep_X.columns[support_mask]
selected_columns

Index(['Seating Capacity', 'Average Meal Price', 'Marketing Budget',
       'Location_Rural', 'Cuisine_Japanese', 'Cuisine_Mexican'],
      dtype='object')

In [9]:
# Hold out 30% of the rows for evaluation; the seed is fixed so the split
# is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    selected_features,
    dep_Y,
    test_size=0.30,
    random_state=0,
)

# Model pipeline: standardise the selected features, then fit a random
# forest regressor. The step names ('scale', 'rf') are the prefixes used by
# the hyper-parameter search grid.
scale_step = ('scale', StandardScaler())
forest_step = ('rf', RandomForestRegressor(random_state=0))
pipeline = Pipeline([scale_step, forest_step])

In [10]:
# Candidate hyper-parameters for the 'rf' pipeline step
# (3 * 3 * 2 * 2 * 2 = 72 combinations in total).
param_dist = {
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_split': [2, 5],
    'rf__min_samples_leaf': [1, 2],
    'rf__bootstrap': [True, False],
}

# Sample 10 of those combinations, scoring each by 3-fold cross-validated
# R² on the training split, using all available cores.
search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=10,
    scoring='r2',
    cv=3,
    random_state=42,
    n_jobs=-1,
)
search.fit(X_train, Y_train)

# Pipeline refitted on the whole training split with the best sampled setting.
best_model = search.best_estimator_

In [11]:
# R² of the refitted best pipeline on the held-out split. With scoring='r2'
# this is the same quantity search.score(X_test, Y_test) reports.
test_predictions = best_model.predict(X_test)
r2 = r2_score(Y_test, test_predictions)

print("Best Random Forest Model:", best_model)
print("R² Score:", r2)

Best Random Forest Model: Pipeline(steps=[('scale', StandardScaler()),
                ('rf',
                 RandomForestRegressor(min_samples_leaf=2, min_samples_split=5,
                                       random_state=0))])
R² Score: 0.9990386760016561


In [12]:
# Persist the fitted pipeline (scaler + random forest) to disk.
# The pickle does not include the SelectKBest step, so anything that
# reloads it must supply only the selected columns, in the same order.
with open('finalized_model_SelectK_restaurant_data.sav', mode='wb') as model_file:
    pickle.dump(best_model, model_file)

In [13]:
# Reload the persisted pipeline. The context manager closes the file handle
# deterministically; the original `pickle.load(open(...))` left it open.
# NOTE: only unpickle files you produced yourself — pickle.load can execute
# arbitrary code from an untrusted file.
with open("finalized_model_SelectK_restaurant_data.sav", 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# One sample row. Values must follow the order of the selected columns:
# Seating Capacity, Average Meal Price, Marketing Budget,
# Location_Rural, Cuisine_Japanese, Cuisine_Mexican.
result = loaded_model.predict([[61, 73.65, 3213, 0, 1, 0]])
result

array([1006983.74443028])