Import all necessary libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns


Loading the dataset

In [2]:
df = pd.read_csv('Dataset.csv')

Check for missing values

In [3]:
print(df.isnull().sum())

Restaurant ID           0
Restaurant Name         0
Country Code            0
City                    0
Address                 0
Locality                0
Locality Verbose        0
Longitude               0
Latitude                0
Cuisines                9
Average Cost for two    0
Currency                0
Has Table booking       0
Has Online delivery     0
Is delivering now       0
Switch to order menu    0
Price range             0
Aggregate rating        0
Rating color            0
Rating text             0
Votes                   0
dtype: int64


Convert categorical columns to numeric

In [5]:
label_encoders = {}
for column in ['Country Code', 'City', 'Address', 'Locality', 'Locality Verbose', 'Cuisines', 'Currency', 'Rating color', 'Rating text']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

Select features and target

In [6]:
X = df.drop(['Restaurant ID', 'Restaurant Name', 'Aggregate rating'], axis=1)
y = df['Aggregate rating']

Split the data into training and testing sets

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Check for remaining missing values
print(X_train.isnull().sum())

# Identify columns with non-numerical data
for col in X_train.columns:
    if X_train[col].dtype == 'object':
        print(f"Column '{col}' contains non-numerical data.")
        print(X_train[col].unique()) # Inspect unique values in the problematic column(s)

# If missing values are present, handle them (e.g., imputation or further dropping)
# If there are columns with non-numerical data that should be numerical,
# investigate why they weren't label encoded and apply encoding as needed.

Country Code            0
City                    0
Address                 0
Locality                0
Locality Verbose        0
Longitude               0
Latitude                0
Cuisines                0
Average Cost for two    0
Currency                0
Has Table booking       0
Has Online delivery     0
Is delivering now       0
Switch to order menu    0
Price range             0
Rating color            0
Rating text             0
Votes                   0
dtype: int64
Column 'Has Table booking' contains non-numerical data.
['No' 'Yes']
Column 'Has Online delivery' contains non-numerical data.
['No' 'Yes']
Column 'Is delivering now' contains non-numerical data.
['No' 'Yes']
Column 'Switch to order menu' contains non-numerical data.
['No']


Initialize the model

In [9]:
model = RandomForestRegressor(random_state=42)

In [16]:
# Handle non-numerical data (example using LabelEncoder)
from sklearn.preprocessing import LabelEncoder
label_encoders = {}
for col in X_train.columns:
    if X_train[col].dtype == 'object':
        le = LabelEncoder()
        X_train[col] = le.fit_transform(X_train[col])
        label_encoders[col] = le # Store the encoder for later use on the test set

# Apply the same encoding to the test set
for col in X_test.columns:
    if col in label_encoders:
        le = label_encoders[col]
        X_test[col] = le.transform(X_test[col])

# Now you can fit the model
model.fit(X_train, y_train)

Make predictions

In [17]:
y_pred = model.predict(X_test)

Evaluate the models


In [18]:
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R-squared:", r2_score(y_test, y_pred))

Mean Squared Error: 0.029268875851231006
R-squared: 0.9872190112275845
