In [3]:
# --- Put the parent directory on the import path so Jupyter can find src/ ---
import os
import sys

sys.path.append(os.path.abspath(os.pardir))

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Read the cleaned restaurant ratings table
df = pd.read_csv("../data/processed/restaurant_ratings_clean.csv")

# The column being predicted; every other column starts out as a candidate feature
target = "Aggregate rating"
X = df.drop(columns=[target])
y = df[target]

# 🔧 Columns excluded from the features: irrelevant / high-cardinality / leakage
drop_cols = [
    'Restaurant ID',
    'Restaurant Name',
    'Address',
    'Locality',
    'Locality Verbose',
    'Rating color',
    'Rating text',
    'Switch to order menu',
    'Country Code',
]
# errors='ignore' skips any listed column that is absent from the data
X = X.drop(columns=drop_cols, errors='ignore')

# Split the feature columns by dtype so each group gets suitable preprocessing.
# Any numeric dtype counts as numeric (not only int64/float64), and every
# remaining column (strings, booleans, categoricals, ...) is treated as
# categorical. ColumnTransformer drops columns that are not assigned to a
# transformer (remainder='drop' is its default), so the two lists together
# must cover all of X or a feature would silently vanish from the models.
numeric_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(exclude='number').columns.tolist()

print("Numeric Features:", numeric_features)
print("Categorical Features:", categorical_features)

# Preprocessing pipeline
# Newer scikit-learn releases name the dense-output switch `sparse_output`,
# older ones only accept `sparse`; an unsupported keyword raises TypeError,
# so try the new spelling first and fall back to the old one.
# handle_unknown='ignore' lets categories that appear only at predict time
# be encoded instead of raising an error.
try:
    categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Standardise numeric columns so they share a comparable scale
numeric_transformer = StandardScaler()

# Route each column group to its transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Hold out 20% of the rows once, so every model is scored on the same test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate regressors, keyed by the label used when reporting their scores
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
}

# Collects one (model name, RMSE, R²) tuple per evaluated model
results = []

for name, model in models.items():
    # Bundle preprocessing with the estimator so both are fitted on the training split
    pipe = Pipeline([('preprocessor', preprocessor), ('model', model)])

    # Pipeline.fit returns the pipeline itself, so fit and predict can be chained
    preds = pipe.fit(X_train, y_train).predict(X_test)

    # Score on the held-out split: root of the mean squared error, plus R²
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)

    print(f"{name} → RMSE: {rmse:.3f}, R²: {r2:.3f}")
    results.append((name, rmse, r2))

# Comparison table: one row per model, indexed by model name
results_df = pd.DataFrame(results, columns=["Model", "RMSE", "R2"])
results_df = results_df.set_index("Model")
results_df

Numeric Features: ['Longitude', 'Latitude', 'Average Cost for two', 'Has Table booking', 'Has Online delivery', 'Is delivering now', 'Price range', 'Votes']
Categorical Features: ['City', 'Cuisines', 'Currency', 'PrimaryCuisine']
Linear Regression → RMSE: 1.229, R²: 0.336
Decision Tree → RMSE: 0.401, R²: 0.929
Random Forest → RMSE: 0.294, R²: 0.962


Model,RMSE,R2
Linear Regression,1.229173,0.336207
Decision Tree,0.40126,0.929261
Random Forest,0.294416,0.961917
