In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


In [None]:
# Remote locations of the two raw inputs: the restaurant records (CSV)
# and the country-code lookup table (Excel workbook).
url_zomato = "https://github.com/dsrscientist/dataset4/raw/main/zomato.csv"
url_country_code = "https://github.com/dsrscientist/dataset4/raw/main/Country-Code.xlsx"

In [None]:
# Read the country lookup workbook and the restaurant records into DataFrames.
country_code = pd.read_excel(url_country_code)
try:
    # pandas decodes CSV text as UTF-8 unless told otherwise.
    zomato = pd.read_csv(url_zomato)
except UnicodeDecodeError:
    # NOTE(review): the file at this URL may not be valid UTF-8 (commonly
    # circulated copies of this dataset are Latin-1) — confirm. Latin-1 maps
    # every byte to a character, so this fallback read cannot fail on decoding.
    zomato = pd.read_csv(url_zomato, encoding='latin-1')

In [None]:
# Attach the country lookup columns to each restaurant row. The left join
# keeps every row of `zomato`, matching on the key column both frames share.
join_key = 'Country Code'
data = zomato.merge(country_code, how='left', left_on=join_key, right_on=join_key)


In [None]:
# Columns that are not wanted as model inputs; removed from `data` in place.
columns_to_drop = [
    'Restaurant ID', 'Restaurant Name', 'City', 'Address', 'Locality', 'Locality Verbose',
    'Currency', 'Rating color', 'Rating text', 'Country Code',
]
data.drop(columns=columns_to_drop, inplace=True)


In [None]:
# Feature engineering
# Encode yes/no flag columns (such as 'Has Table booking' and
# 'Has Online delivery') as 1/0.
#
# DataFrame.replace({'yes': 1, 'no': 0}) matches whole cell values exactly and
# case-sensitively across the entire frame, so spellings such as 'Yes'/'No'
# would be left untouched and unrelated columns could be altered. The
# conversion below ignores capitalization and surrounding whitespace, and is
# applied only to text columns whose non-missing values are all yes/no.
yes_no_codes = {'yes': 1, 'no': 0}


def _yes_no_code(value):
    """Return 1 or 0 for a yes/no string in any capitalization, else None."""
    if isinstance(value, str):
        return yes_no_codes.get(value.strip().lower())
    return None


for column in data.select_dtypes(include=['object']).columns:
    present = data[column].dropna()
    # Skip empty columns and columns holding anything other than yes/no text.
    if present.empty or not present.map(_yes_no_code).notna().all():
        continue
    # na_action='ignore' keeps missing cells missing so the later dropna
    # step still sees them.
    data[column] = data[column].map(_yes_no_code, na_action='ignore')


In [None]:
# Missing values: discard, in place, every row that has at least one missing cell.
data.dropna(axis=0, how='any', inplace=True)


In [None]:
# Separate the two prediction targets from the remaining feature columns.
target_columns = ['Average Cost for two', 'Price range']
y_avg_cost = data[target_columns[0]]
y_price_range = data[target_columns[1]]
X = data.drop(columns=target_columns)


In [None]:
# Hold out 20% of the rows for testing. Features and both targets are split
# in a single call so their rows stay aligned; the seed fixes the shuffle.
split_parts = train_test_split(X, y_avg_cost, y_price_range, test_size=0.2, random_state=42)
(
    X_train, X_test,
    y_train_avg_cost, y_test_avg_cost,
    y_train_price_range, y_test_price_range,
) = split_parts

In [None]:
# Preprocessing pipeline: choose which feature columns are scaled and which
# are one-hot encoded, based on their dtypes.
numeric_dtypes = ['float64', 'int64']
categorical_dtypes = ['object']
numeric_features = X.select_dtypes(include=numeric_dtypes).columns
categorical_features = X.select_dtypes(include=categorical_dtypes).columns


In [None]:
# Numeric columns: a single standard-scaling step.
numeric_steps = [('scaler', StandardScaler())]
numeric_transformer = Pipeline(steps=numeric_steps)


In [None]:
# Categorical columns: one-hot encoding, with handle_unknown='ignore' so that
# categories not seen during fitting do not raise at transform time.
categorical_steps = [('onehot', OneHotEncoder(handle_unknown='ignore'))]
categorical_transformer = Pipeline(steps=categorical_steps)


In [None]:
# Route each group of columns to its own transformer.
column_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)


In [None]:
# Regression model for Average Cost for two: preprocessing followed by a
# random forest.
# The forest is seeded so repeated runs build the same trees; without a seed
# the grid search's best parameters and scores change from run to run. The
# seed value matches the one used for the train/test split.
avg_cost_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('reg', RandomForestRegressor(random_state=42))
])


In [None]:
# Hyperparameter grid for the Average Cost for two model. The 'reg__' prefix
# addresses the pipeline step named 'reg'.
param_grid_avg_cost = dict(
    reg__n_estimators=[50, 100, 200],
    reg__max_depth=[None, 5, 10],
    reg__min_samples_split=[2, 5, 10],
)

In [None]:
# 5-fold cross-validated grid search over the average-cost pipeline, scored
# by negated mean squared error.
grid_search_avg_cost = GridSearchCV(
    estimator=avg_cost_model,
    param_grid=param_grid_avg_cost,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search_avg_cost.fit(X_train, y_train_avg_cost)


In [None]:
# Report the winning parameter combination for Average Cost for two.
# The stored score is a negated MSE, so the sign is flipped before printing.
best_params_avg_cost = grid_search_avg_cost.best_params_
best_cv_mse_avg_cost = -grid_search_avg_cost.best_score_
print("Best Parameters (Average Cost for two):", best_params_avg_cost)
print("Best Score (Average Cost for two):", best_cv_mse_avg_cost)


In [None]:
# Regression model for Price range: preprocessing followed by a random forest.
# The forest is seeded so repeated runs build the same trees; without a seed
# the grid search's best parameters and scores change from run to run. The
# seed value matches the one used for the train/test split.
price_range_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('reg', RandomForestRegressor(random_state=42))
])


In [None]:
# Hyperparameter grid for the Price range model. The 'reg__' prefix
# addresses the pipeline step named 'reg'.
param_grid_price_range = dict(
    reg__n_estimators=[50, 100, 200],
    reg__max_depth=[None, 5, 10],
    reg__min_samples_split=[2, 5, 10],
)


In [None]:
# 5-fold cross-validated grid search over the price-range pipeline, scored
# by negated mean squared error.
grid_search_price_range = GridSearchCV(
    estimator=price_range_model,
    param_grid=param_grid_price_range,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search_price_range.fit(X_train, y_train_price_range)

In [None]:
# Report the winning parameter combination for Price range.
# The stored score is a negated MSE, so the sign is flipped before printing.
best_params_price_range = grid_search_price_range.best_params_
best_cv_mse_price_range = -grid_search_price_range.best_score_
print("Best Parameters (Price range):", best_params_price_range)
print("Best Score (Price range):", best_cv_mse_price_range)

In [None]:
# Test-set evaluation of the Average Cost for two model: take the estimator
# selected by the grid search, predict the held-out rows, and report the MSE.
best_avg_cost_model = grid_search_avg_cost.best_estimator_
y_pred_avg_cost = best_avg_cost_model.predict(X_test)
mse_avg_cost = mean_squared_error(y_true=y_test_avg_cost, y_pred=y_pred_avg_cost)
print("Mean Squared Error (Average Cost for two):", mse_avg_cost)
