In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
!pip install openpyxl



In [3]:
zomato_data = pd.read_csv("https://raw.githubusercontent.com/dsrscientist/dataset4/main/zomato.csv", encoding='latin1')
country_code = pd.read_excel("https://github.com/dsrscientist/dataset4/blob/main/Country-Code.xlsx?raw=true")

In [4]:
zomato_data = pd.merge(zomato_data, country_code, on="Country Code", how="left")

In [5]:
# Check the shape and first few rows of the merged DataFrame
print("Shape of merged DataFrame:", zomato_data.shape)
print("Columns of merged DataFrame:", zomato_data.columns)
print("First few rows of merged DataFrame:", zomato_data.head())

Shape of merged DataFrame: (9551, 22)
Columns of merged DataFrame: Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address',
       'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines',
       'Average Cost for two', 'Currency', 'Has Table booking',
       'Has Online delivery', 'Is delivering now', 'Switch to order menu',
       'Price range', 'Aggregate rating', 'Rating color', 'Rating text',
       'Votes', 'Country'],
      dtype='object')
First few rows of merged DataFrame:    Restaurant ID         Restaurant Name  Country Code              City  \
0        6317637        Le Petit Souffle           162       Makati City   
1        6304287        Izakaya Kikufuji           162       Makati City   
2        6300002  Heat - Edsa Shangri-La           162  Mandaluyong City   
3        6318506                    Ooma           162  Mandaluyong City   
4        6314302             Sambo Kojin           162  Mandaluyong City   

                        

In [6]:
features = ['Country name', 'City', 'Locality', 'Cuisines', 'Has Table booking', 
            'Has Online delivery', 'Aggregate Rating', 'Votes']

In [7]:
# Check if features are present in the DataFrame
missing_features = [feature for feature in features if feature not in zomato_data.columns]
if missing_features:
    print(f"The following features are missing in the DataFrame and will be ignored: {missing_features}")
    features = [feature for feature in features if feature not in missing_features]

The following features are missing in the DataFrame and will be ignored: ['Country name', 'Aggregate Rating']


In [8]:
# For predicting average cost for two
X_avg_cost = zomato_data[features]
y_avg_cost = zomato_data['Average Cost for two']

In [9]:
# For predicting price range
X_price_range = zomato_data[features]
y_price_range = zomato_data['Price range']

In [10]:
# Split the data into training and testing sets
X_avg_cost_train, X_avg_cost_test, y_avg_cost_train, y_avg_cost_test = train_test_split(X_avg_cost, y_avg_cost, test_size=0.2, random_state=42)
X_price_range_train, X_price_range_test, y_price_range_train, y_price_range_test = train_test_split(X_price_range, y_price_range, test_size=0.2, random_state=42)

In [11]:
# Convert categorical variables to numerical using one-hot encoding
X_avg_cost_train_encoded = pd.get_dummies(X_avg_cost_train)
X_avg_cost_test_encoded = pd.get_dummies(X_avg_cost_test)

In [12]:
# Convert categorical variables to numerical using one-hot encoding
X_avg_cost_train_encoded = pd.get_dummies(X_avg_cost_train)

# Train machine learning model
avg_cost_model = RandomForestRegressor(n_estimators=100, random_state=42)
avg_cost_model.fit(X_avg_cost_train_encoded, y_avg_cost_train)

In [15]:
# Get the list of feature names from the training data
train_features = X_avg_cost_train_encoded.columns.tolist()

# Filter the test data to include only the features present in the training data
X_avg_cost_test_encoded_filtered = X_avg_cost_test_encoded[train_features]

# Make predictions using the model
avg_cost_predictions = avg_cost_model.predict(X_avg_cost_test_encoded_filtered)
avg_cost_rmse = mean_squared_error(y_avg_cost_test, avg_cost_predictions, squared=False)
print("RMSE for Average Cost for Two:", avg_cost_rmse)


RMSE for Average Cost for Two: 14339.935438621782
