# In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Step 1: Load and Explore Data
df = pd.read_csv('restaurant_data.csv')

# Peek at the leading rows and list the available column labels
print(df.head())
print(df.columns)

# Step 2: Data Preprocessing

# Report how many values are missing in each column
print(df.isnull().sum())

# NOTE: missing values are only counted above; nothing is imputed or dropped.
# Example: df.fillna(0, inplace=True) or df.dropna(inplace=True)

# Separate the target column from the remaining feature columns
y = df['aggregate_rating']  # Target variable
X = df.drop(columns='aggregate_rating')  # Features (every other column)

# Step 3: Split Data into Training and Testing Sets
# 20% of the rows are held out for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 4: Model Selection and Training
# fit() hands back the fitted estimator, so each model is built and trained
# in a single expression. All three are trained on the same training split.

# Linear Regression
model_lr = LinearRegression().fit(X_train, y_train)

# Decision Tree Regression (fixed seed)
model_dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Random Forest Regression (fixed seed)
model_rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Step 5: Model Evaluation

# Function to evaluate model and print metrics
def evaluate_model(model, X_test, y_test):
    """Print MSE, RMSE and R^2 of ``model`` on the given evaluation data.

    Args:
        model: Fitted estimator; only its ``predict`` method is used.
        X_test: Feature data passed to ``model.predict``.
        y_test: True target values compared against the predictions.

    Returns:
        None. The three metrics are written to stdout with two decimals.
    """
    predictions = model.predict(X_test)
    squared_error = mean_squared_error(y_test, predictions)

    # Compute every metric first, then print them in a fixed order.
    report = (
        ('Mean Squared Error', squared_error),
        ('Root Mean Squared Error', np.sqrt(squared_error)),
        ('R^2 Score', r2_score(y_test, predictions)),
    )
    for label, value in report:
        print(f'{label}: {value:.2f}')

# Report the metrics of each fitted model on the same test split, in the
# order Linear Regression, Decision Tree, Random Forest. The leading "\n"
# on the later headings separates the reports with a blank line.
for heading, fitted_model in (
    ("Linear Regression:", model_lr),
    ("\nDecision Tree Regression:", model_dt),
    ("\nRandom Forest Regression:", model_rf),
):
    print(heading)
    evaluate_model(fitted_model, X_test, y_test)

# Step 6: Model Comparison (Optional)

# Scatter the Random Forest predictions against the true test ratings
# (only the Random Forest model is plotted here).
y_pred_rf = model_rf.predict(X_test)
plt.scatter(y_test, y_pred_rf)

# Label the current axes in one call instead of separate pyplot helpers.
plt.gca().set(
    xlabel='Actual Ratings',
    ylabel='Predicted Ratings',
    title='Actual vs Predicted Ratings (Random Forest)',
)
plt.show()


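# Output: ModuleNotFoundError: No module named 'sklearn'
# (the `sklearn` module is provided by the scikit-learn package, which must be installed before running this cell)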