In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

In [2]:
# Load the restaurant dataset from a CSV located in the working directory,
# then preview its first five rows.
df=pd.read_csv("restaurant_data.csv")
df.head()

Unnamed: 0,online_order,book_table,rate,votes,rest_type,cost,type,cuisine
0,Yes,Yes,4.1,775,Casual Dining,800.0,Buffet,North Indian
1,Yes,Yes,4.1,775,Casual Dining,800.0,Buffet,Mughlai
2,Yes,Yes,4.1,775,Casual Dining,800.0,Buffet,Chinese
3,Yes,No,4.1,787,Casual Dining,800.0,Buffet,Chinese
4,Yes,No,4.1,787,Casual Dining,800.0,Buffet,North Indian


In [3]:
# `rate` is the regression target; every remaining column is a feature.
y = df["rate"]
X = df.drop(columns=["rate"])

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [5]:
from sklearn.preprocessing import LabelEncoder
# Take explicit copies before assigning columns: the frames returned by
# train_test_split are row subsets of `X`, and pandas may emit a
# SettingWithCopyWarning when columns of such subsets are overwritten.
X_train = X_train.copy()
X_test = X_test.copy()

# A single encoder serves both Yes/No columns. It is fitted ONCE, on the
# training values of both columns together, so its fitted state is valid for
# either column. (Re-fitting it per column would leave it fitted on the last
# column only, while still being used to transform the other one.)
binary_cols = ["online_order", "book_table"]
label_enc1 = LabelEncoder()
label_enc1.fit(pd.concat([X_train[col] for col in binary_cols], ignore_index=True))

# LabelEncoder assigns codes in sorted label order, so with only "No"/"Yes"
# present the mapping is No=0, Yes=1. The test split is only transformed,
# never used for fitting.
for col in binary_cols:
    X_train[col] = label_enc1.transform(X_train[col])  # Yes=1, No=0
    X_test[col] = label_enc1.transform(X_test[col])    # Yes=1, No=0

In [6]:
# Step 1: reduce `rest_type` to its first comma-separated entry, with
# surrounding whitespace removed, in both splits.
for frame in (X_train, X_test):
    first_entry = frame["rest_type"].str.split(",").str[0]
    frame["rest_type"] = first_entry.str.strip()

# Step 2: learn the label -> integer mapping from the training split only,
# then apply that same mapping to both splits.
label_enc2 = LabelEncoder()
label_enc2.fit(X_train["rest_type"])
X_train["rest_type"] = label_enc2.transform(X_train["rest_type"])
X_test["rest_type"] = label_enc2.transform(X_test["rest_type"])

# Number of missing `rest_type` values in the original (unsplit) frame.
print(df["rest_type"].isna().sum())

# Optional: lookup table of each rest_type label and its integer code.
rest_type_labels = label_enc2.classes_
unique_rest_type = pd.DataFrame(
    {
        "rest_type": rest_type_labels,
        "rest_type_encoded": range(len(rest_type_labels)),
    }
)
print(unique_rest_type)

0
         rest_type  rest_type_encoded
0           Bakery                  0
1              Bar                  1
2    Beverage Shop                  2
3       Bhojanalya                  3
4             Cafe                  4
5    Casual Dining                  5
6             Club                  6
7    Confectionery                  7
8         Delivery                  8
9   Dessert Parlor                  9
10           Dhaba                 10
11     Fine Dining                 11
12      Food Court                 12
13      Food Truck                 13
14           Kiosk                 14
15          Lounge                 15
16            Mess                 16
17    Microbrewery                 17
18             Pub                 18
19     Quick Bites                 19
20      Sweet Shop                 20
21        Takeaway                 21


In [7]:
# Frequency of each cuisine label in the full (unsplit) frame `df`,
# most common first.
df["cuisine"].value_counts()

cuisine
North Indian     10976
Chinese           8104
Continental       3970
Fast Food         3864
South Indian      3858
Cafe              3350
Biryani           3298
Desserts          3033
Beverages         2534
Italian           2308
Others            1650
Street Food       1438
Pizza             1437
Burger            1384
Bakery            1257
Seafood           1172
Andhra            1089
American          1045
Mughlai            992
Ice Cream          973
Asian              971
Finger Food        906
Salad              832
Momos              769
Rolls              737
Kerala             732
Thai               719
Arabian            591
BBQ                561
European           557
Kebab              537
Healthy Food       509
Juices             497
Mithai             497
Sandwich           493
Mediterranean      439
Steak              425
Mangalorean        390
Mexican            356
Bengali            346
Japanese           284
Tea                192
Hyderabadi         181
Tib

In [8]:
# Integer-encode `cuisine`: the mapping is learned from the training split
# and then reused unchanged on the test split.
label_enc3 = LabelEncoder().fit(X_train["cuisine"])
X_train["cuisine"] = label_enc3.transform(X_train["cuisine"])
X_test["cuisine"] = label_enc3.transform(X_test["cuisine"])

# Optional: lookup table of each cuisine label and its integer code.
cuisine_labels = label_enc3.classes_
unique_cuisines = pd.DataFrame(
    {
        "cuisine": cuisine_labels,
        "cuisine_encoded": range(len(cuisine_labels)),
    }
)
print(unique_cuisines)

          cuisine  cuisine_encoded
0        American                0
1          Andhra                1
2         Arabian                2
3           Asian                3
4             BBQ                4
5          Bakery                5
6         Bengali                6
7       Beverages                7
8         Biryani                8
9          Burger                9
10           Cafe               10
11      Chettinad               11
12        Chinese               12
13    Continental               13
14       Desserts               14
15       European               15
16      Fast Food               16
17    Finger Food               17
18   Healthy Food               18
19     Hyderabadi               19
20      Ice Cream               20
21        Italian               21
22       Japanese               22
23         Juices               23
24          Kebab               24
25         Kerala               25
26         Korean               26
27    Mangalorean   

In [9]:
# Integer-encode the `type` column; fit on train, apply the same mapping to test.
label_enc4 = LabelEncoder()
train_type_codes = label_enc4.fit_transform(X_train["type"])
test_type_codes = label_enc4.transform(X_test["type"])
X_train["type"] = train_type_codes
X_test["type"] = test_type_codes

# Optional: lookup table of each `type` label and its integer code.
type_labels = label_enc4.classes_
unique_type = pd.DataFrame(
    {
        "type": type_labels,
        "type_encoded": range(len(type_labels)),
    }
)
print(unique_type)

                 type  type_encoded
0              Buffet             0
1               Cafes             1
2            Delivery             2
3            Desserts             3
4            Dine-out             4
5  Drinks & nightlife             5
6       Pubs and bars             6


In [10]:
# Summarise the encoded training features: column dtypes, non-null counts
# and memory usage.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56847 entries, 54066 to 15795
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   online_order  56847 non-null  int64  
 1   book_table    56847 non-null  int64  
 2   votes         56847 non-null  int64  
 3   rest_type     56847 non-null  int64  
 4   cost          56847 non-null  float64
 5   type          56847 non-null  int64  
 6   cuisine       56847 non-null  int64  
dtypes: float64(1), int64(6)
memory usage: 3.5 MB


In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Candidate regressors to compare; stochastic ones share a fixed seed so the
# comparison is reproducible.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
}

# Fit every candidate on the training split and score it on the held-out
# test split; `results` maps model name -> {"MAE", "R²"}.
results = {}
for name, model in models.items():
    predictions = model.fit(X_train, y_train).predict(X_test)
    results[name] = {
        "MAE": mean_absolute_error(y_test, predictions),
        "R²": r2_score(y_test, predictions),
    }

# One row per model, one column per metric.
results_df = pd.DataFrame.from_dict(results, orient="index")
print(results_df)

                        MAE        R²
Linear Regression  0.278739  0.346122
Decision Tree      0.080096  0.776888
Random Forest      0.088885  0.856311
Gradient Boosting  0.219800  0.538494


In [12]:
# ## Selecting the Random Forest algorithm because it achieved the highest R² score

# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import mean_squared_error,r2_score

# model=RandomForestRegressor(random_state=42)

# # Define the hyperparameter grid
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['sqrt', 'log2', None]
# }

# # Perform GridSearchCV
# grid_search = GridSearchCV(
#     estimator=model,
#     param_grid=param_grid,
#     scoring='neg_mean_squared_error',
#     cv=5,
#     verbose=2,
#     n_jobs=-1
# )

# # Fit the model
# grid_search.fit(X_train, y_train)

# # Best hyperparameters
# print("Best Hyperparameters:", grid_search.best_params_)

# # Evaluate on test data
# best_rf = grid_search.best_estimator_
# y_pred = best_rf.predict(X_test)
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)
# print("Test Mean Squared Error:", mse)
# print("Test R² Score:", r2)

In [13]:
# Train the chosen Random Forest on the training split (same seed as in the
# model comparison) and keep it in `model` for later use.
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Score on the held-out test split: MAE on the first line, R² on the second.
y_pred = model.predict(X_test)
print(mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred), sep="\n")

0.08888485107411645
0.8563111574443173


In [14]:
# import pickle

# pickle.dump(label_enc1,open("label_enc1.pkl","wb"))
# pickle.dump(label_enc2,open("label_enc2.pkl","wb"))
# pickle.dump(label_enc3,open("label_enc3.pkl","wb"))
# pickle.dump(label_enc4,open("label_enc4.pkl","wb"))
# pickle.dump(model,open("model.pkl","wb"))