# Task: Predictive Modeling

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Load the dataset
file_path = r'C:\Users\91702\Dataset.csv'
data = pd.read_csv(file_path)

# Drop the columns that are not used as features for prediction
data_cleaned = data.drop(columns=['Restaurant ID', 'Restaurant Name', 'Address', 'Locality',
                                  'Locality Verbose', 'Currency', 'Rating color', 'Rating text'])

# Handle missing values by discarding every row that still contains one
data_cleaned = data_cleaned.dropna()

# Convert categorical columns to integer codes using LabelEncoder.
# A separate fitted encoder is stored per column in `label_encoders`, so each
# column's value -> code mapping stays available (e.g. for inverse_transform
# or for encoding new rows) instead of being overwritten by the next column.
# `label_enc` is left bound to the most recently fitted encoder.
categorical_cols = ['City', 'Cuisines', 'Has Table booking', 'Has Online delivery',
                    'Is delivering now', 'Switch to order menu']

label_encoders = {}
label_enc = LabelEncoder()

for col in categorical_cols:
    label_enc = LabelEncoder()
    data_cleaned[col] = label_enc.fit_transform(data_cleaned[col])
    label_encoders[col] = label_enc

# Separate the features (every remaining column) from the target variable
X = data_cleaned.drop(columns='Aggregate rating')
y = data_cleaned['Aggregate rating']

# Split the data into training and testing sets (80-20 split); the fixed
# random_state makes the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create each regressor and train it on the training split in one step
# (fit() returns the fitted estimator itself). The tree-based models are
# seeded so repeated runs give the same result.
linear_reg = LinearRegression().fit(X_train, y_train)
decision_tree = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
random_forest = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predict on the held-out test set, one prediction array per model
y_pred_lr, y_pred_dt, y_pred_rf = (
    fitted_model.predict(X_test)
    for fitted_model in (linear_reg, decision_tree, random_forest)
)

# Evaluate the models
# Map each display name to that model's test-set predictions so the name and
# both metric columns are always built in the same order.
predictions = {
    "Linear Regression": y_pred_lr,
    "Decision Tree": y_pred_dt,
    "Random Forest": y_pred_rf,
}

results = {
    "Model": list(predictions),
    "MAE": [mean_absolute_error(y_test, y_pred) for y_pred in predictions.values()],
    "R2 Score": [r2_score(y_test, y_pred) for y_pred in predictions.values()],
}

results_df = pd.DataFrame(results)

# Print detailed results
print("Performance Comparison of Different Regression Models:\n")
print(results_df.to_string(index=False))
print("\nExplanation of Metrics:")
print("-" * 80)
print("1. Mean Absolute Error (MAE):")
print("   - This metric measures the average magnitude of errors between predicted and actual values.")
print("   - A lower MAE value indicates better performance.")
print("\n2. R² Score (Coefficient of Determination):")
print("   - This metric indicates how well the predicted values explain the variability in the actual data.")
print("   - An R² score closer to 1.0 indicates that the model explains most of the variance in the data.")
print("\nSummary:")
print("-" * 80)

# Pick the winners from the computed metrics instead of assuming a fixed row,
# so the summary stays truthful if the data or the models change.
lowest_mae = results_df.loc[results_df["MAE"].idxmin()]
highest_r2 = results_df.loc[results_df["R2 Score"].idxmax()]
mae_model, mae_value = lowest_mae["Model"], lowest_mae["MAE"]
r2_model, r2_value = highest_r2["Model"], highest_r2["R2 Score"]

if mae_model == r2_model:
    print(f"The {mae_model} model has the lowest MAE ({mae_value:.3f}) and the highest R² score ({r2_value:.3f}),")
    print("making it the most accurate model for predicting the aggregate rating of a restaurant based on the available features.")
else:
    # The two metrics disagree, so report each winner rather than crowning one.
    print(f"The {mae_model} model has the lowest MAE ({mae_value:.3f}), while the {r2_model} model has the highest R² score ({r2_value:.3f}),")
    print("so no single model is the most accurate on both metrics for predicting the aggregate rating of a restaurant based on the available features.")


Performance Comparison of Different Regression Models:

            Model      MAE  R2 Score
Linear Regression 1.017931  0.335472
    Decision Tree 0.268989  0.922270
    Random Forest 0.193799  0.961704

Explanation of Metrics:
--------------------------------------------------------------------------------
1. Mean Absolute Error (MAE):
   - This metric measures the average magnitude of errors between predicted and actual values.
   - A lower MAE value indicates better performance.

2. R² Score (Coefficient of Determination):
   - This metric indicates how well the predicted values explain the variability in the actual data.
   - An R² score closer to 1.0 indicates that the model explains most of the variance in the data.

Summary:
--------------------------------------------------------------------------------
The Random Forest model has the lowest MAE (0.194) and the highest R² score (0.962),
making it the most accurate model for predicting the aggregate rating of a restaurant based