In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, classification_report

In [None]:
# Load datasets
# Both tables are read directly from their hosted CSV files; the restaurant
# table is decoded as latin-1, the country table with the pandas default.
ZOMATO_CSV_URL = 'https://raw.githubusercontent.com/dsrscientist/dataset4/main/zomato.csv'
COUNTRY_CSV_URL = 'https://raw.githubusercontent.com/dsrscientist/dataset4/main/country_code.csv'

zomato_data = pd.read_csv(ZOMATO_CSV_URL, encoding='latin-1')
country_data = pd.read_csv(COUNTRY_CSV_URL)


In [None]:
# Merge datasets
# Left-join country_data onto zomato_data using the shared 'Country Code' column.
# how='left' keeps every row of zomato_data even when country_data has no match.
# validate='many_to_one' makes pandas raise a MergeError if a 'Country Code'
# value occurs more than once in country_data; without the check such duplicates
# would silently multiply the matching restaurant rows.
data = zomato_data.merge(
    country_data,
    on='Country Code',
    how='left',
    validate='many_to_one',
)

In [None]:
# Show the leading rows of the merged dataset
preview_rows = data.head()
print(preview_rows)

# Data Preprocessing
# Count how many entries are missing in each column
missing_per_column = data.isna().sum()
print(missing_per_column)

In [None]:
# Dropping irrelevant columns and rows with missing values
irrelevant_columns = [
    'Restaurant Id', 'Restaurant Name', 'Address', 'Locality Verbose',
    'Longitude', 'Latitude', 'Currency',
]

# Build the drop list from the columns that are actually present, comparing
# labels case-insensitively. This keeps the cell safe to re-run (once the
# columns are gone the list is empty, so nothing raises KeyError) and still
# removes a header that differs from the list only in capitalisation.
# NOTE(review): a listed label with no case-insensitive match is skipped
# silently - confirm the labels against the CSV header.
wanted_labels = {label.lower() for label in irrelevant_columns}
columns_to_drop = [col for col in data.columns if str(col).lower() in wanted_labels]
data.drop(columns=columns_to_drop, inplace=True)

# Remove every row that still has a missing value in any remaining column.
data.dropna(inplace=True)

In [None]:
# Convert 'Average Cost for two' to numeric
# errors='coerce' turns any entry that cannot be parsed as a number into NaN.
data['Average Cost for two'] = pd.to_numeric(data['Average Cost for two'], errors='coerce')

# Coercion can therefore create NaN values of its own. This column is used as
# the regression target further down, so rows where it is NaN are removed here.
data.dropna(subset=['Average Cost for two'], inplace=True)


In [None]:
# Encode categorical variables
# Every non-numeric column is label-encoded rather than a fixed list of names:
# the whole frame (minus the targets) is later passed to the random-forest
# estimators, and any column still holding text would make their fit() fail
# when the values are converted to floats.
label_encoders = {}
categorical_columns = list(data.select_dtypes(exclude=['number']).columns)

for column in categorical_columns:
    encoder = LabelEncoder()
    # Cast to str first so mixed-type values are encoded uniformly.
    data[column] = encoder.fit_transform(data[column].astype(str))
    # Keep the fitted encoder, keyed by column name, so the integer codes
    # can be mapped back to the original labels.
    label_encoders[column] = encoder


In [None]:
# Separate the two prediction targets from the feature matrix.
# y_cost is the regression target, y_price the classification target;
# X is every other column of the frame.
COST_TARGET = 'Average Cost for two'
PRICE_TARGET = 'Price range'

y_cost = data[COST_TARGET]
y_price = data[PRICE_TARGET]
X = data.drop(columns=[COST_TARGET, PRICE_TARGET])

In [None]:
# Hold out 20% of the rows for testing, once for each target.
# Both calls receive the same feature matrix and the same split settings.
split_options = {'test_size': 0.2, 'random_state': 42}

X_train, X_test, y_train_cost, y_test_cost = train_test_split(X, y_cost, **split_options)
(X_train_price, X_test_price,
 y_train_price, y_test_price) = train_test_split(X, y_price, **split_options)


In [None]:
# Regression model for 'Average Cost for two':
# a 100-tree random forest with a fixed seed, fitted on the training rows
# (fit() returns the estimator itself) and then applied to the test rows.
rf_cost = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train_cost)
cost_predictions = rf_cost.predict(X_test)


In [None]:
# Score the regression model on the held-out rows
cost_mse = mean_squared_error(y_test_cost, cost_predictions)
print("Mean Squared Error for Average Cost for two:", cost_mse)

# Classification model for 'Price range':
# a 100-tree random forest with a fixed seed, fitted on the training rows
# and then applied to the test rows.
rf_price = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_price, y_train_price)
price_predictions = rf_price.predict(X_test_price)

In [None]:
# Summarise the classifier's test-set predictions against the true labels
price_report = classification_report(y_test_price, price_predictions)
print("Classification Report for Price range:\n", price_report)

In [None]:
# Build a table of the cost model's feature importances, largest first.
# Each feature name from X is paired with the fitted regressor's importance
# value for it, and the rows are then ordered by descending importance.
features = X.columns
feature_importances = rf_cost.feature_importances_

importance_table = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df = importance_table.sort_values(by='Importance', ascending=False)


In [None]:
# Horizontal bar chart of the importance table: one bar per feature,
# with bar length given by the 'Importance' column.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=importance_df, x='Importance', y='Feature', ax=ax)
ax.set_title('Feature Importance for Average Cost Prediction')
plt.show()
