In [15]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset and show a quick overview of its contents.
data = pd.read_csv('/content/Dataset .csv')
print("Dataset Overview:")
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()

# Handle missing values: fill gaps in numeric columns with that column's mean.
# Non-numeric columns are left untouched because the mean is computed with
# numeric_only=True.
data.fillna(data.mean(numeric_only=True), inplace=True)

# Convert categorical columns with Yes/No values to numeric using label encoding.
# The requested names below use underscores, but headers seen elsewhere in this
# file are space-separated (e.g. 'Restaurant ID', 'Rating color'). An exact
# membership test would then skip these columns silently, so names are matched
# with spaces and underscores treated as equivalent (case-insensitively).
label_columns = ['Has_Table_booking', 'Has_Online_delivery']
normalized_to_actual = {
    str(name).strip().lower().replace(' ', '_'): name for name in data.columns
}
encoder = LabelEncoder()
for requested in label_columns:
    if requested in data.columns:
        col = requested  # exact match always wins
    else:
        col = normalized_to_actual.get(requested.strip().lower().replace(' ', '_'))
    if col is None:
        # Make the skip visible instead of silently losing the feature.
        print(f"Warning: column '{requested}' not found; skipping label encoding")
        continue
    # The encoder is refit per column, mapping each distinct value to an integer
    # in sorted order (so 'No' -> 0 and 'Yes' -> 1 for a Yes/No column).
    data[col] = encoder.fit_transform(data[col])

# Encode other categorical variables using one-hot encoding.
# drop_first=True omits the first level of each variable to avoid a redundant
# (perfectly collinear) indicator column.
categorical_columns = ['Cuisines', 'City', 'Currency']  # Modify based on dataset
existing_categorical_columns = [col for col in categorical_columns if col in data.columns]
data = pd.get_dummies(data, columns=existing_categorical_columns, drop_first=True)

# Drop non-predictive features (identifiers, free text, and labels derived
# from the rating) and separate the target from the feature matrix.
columns_to_drop = ['Restaurant ID', 'Restaurant Name', 'Address', 'Locality',
                   'Locality Verbose', 'Rating color', 'Rating text']

# Only names that are actually present are dropped, so a differently shaped
# dataset does not raise here.
existing_columns_to_drop = [col for col in columns_to_drop if col in data.columns]
X = data.drop(columns=existing_columns_to_drop + ['Aggregate rating'], errors='ignore')
y = data['Aggregate rating']

# Convert features to numeric format. Values that cannot be parsed become NaN.
X = X.apply(pd.to_numeric, errors='coerce')

# A column that is NaN in every row was entirely non-numeric (e.g. free text
# that was never encoded). It carries no information and, depending on the
# scikit-learn version, can make the estimator reject the input. Report and
# remove such columns rather than passing them on silently.
unusable_columns = X.columns[X.isna().all()].tolist()
if unusable_columns:
    print("Dropping feature columns that could not be converted to numeric:",
          unusable_columns)
    X = X.drop(columns=unusable_columns)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree random forest regressor and fit it on the training split.
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on the held-out rows and score the predictions.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("\nModel Evaluation:")
print("Mean Squared Error (MSE):", mse)
print("R-squared Score (R²):", r2)

# Rank the features by the importance the fitted forest assigned to them,
# most important first.
feature_importance = pd.DataFrame(
    {'Importance': model.feature_importances_},
    index=X.columns,
).sort_values(by='Importance', ascending=False)

# Show only the 20 highest-ranked features on screen.
top_n = 20
top_features = feature_importance.iloc[:top_n]

print(top_features)

# Persist the complete ranking (feature names are kept as the index column)
# for further review.
feature_importance.to_csv('Feature_Importance_RF.csv', index=True)
print("\nFeature importance saved to 'Feature_Importance_RF.csv'")

Dataset Overview:
   Restaurant ID         Restaurant Name  Country Code              City  \
0        6317637        Le Petit Souffle           162       Makati City   
1        6304287        Izakaya Kikufuji           162       Makati City   
2        6300002  Heat - Edsa Shangri-La           162  Mandaluyong City   
3        6318506                    Ooma           162  Mandaluyong City   
4        6314302             Sambo Kojin           162  Mandaluyong City   

                                             Address  \
0  Third Floor, Century City Mall, Kalayaan Avenu...   
1  Little Tokyo, 2277 Chino Roces Avenue, Legaspi...   
2  Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...   
3  Third Floor, Mega Fashion Hall, SM Megamall, O...   
4  Third Floor, Mega Atrium, SM Megamall, Ortigas...   

                                     Locality  \
0   Century City Mall, Poblacion, Makati City   
1  Little Tokyo, Legaspi Village, Makati City   
2  Edsa Shangri-La, Ortigas, Mandaluyong 