# Task: Predictive Modeling

In [22]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

import pandas as pd
df = pd.read_csv('Dataset .csv')

### Step 1: Preprocessing


In [23]:
# Drop columns that are irrelevant or that you won't use for prediction
df = df.drop(['Restaurant ID', 'Restaurant Name', 'City', 'Address', 'Locality', 'Locality Verbose'], axis=1)

In [24]:
# Handle missing values (filling with the mode or mean as necessary)
df['Cuisines'] = df['Cuisines'].fillna(df['Cuisines'].mode()[0])
missing_values = df.isnull().sum()
print(missing_values)

Country Code            0
Longitude               0
Latitude                0
Cuisines                0
Average Cost for two    0
Currency                0
Has Table booking       0
Has Online delivery     0
Is delivering now       0
Switch to order menu    0
Price range             0
Aggregate rating        0
Rating color            0
Rating text             0
Votes                   0
dtype: int64


In [25]:
# Encode categorical variables using one-hot encoding
df = pd.get_dummies(df, drop_first=True)

In [26]:
# Define features (X) and target variable (y)
X = df.drop(['Aggregate rating'], axis=1)  # Features
y = df['Aggregate rating']  # Target variable

### Step 2: Spliting data into training and testing sets

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Step 3: Feature Scaling (for models that require it, like Linear Regression)

In [28]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Step 4: Train different models

In [29]:
# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)

# Decision Tree Regressor
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

# Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

### Step 5: Make Predictions

In [30]:
# Predictions using Linear Regression
y_pred_lr = lr_model.predict(X_test_scaled)

# Predictions using Decision Tree Regressor
y_pred_dt = dt_model.predict(X_test)

# Predictions using Random Forest Regressor
y_pred_rf = rf_model.predict(X_test)

### Step 6: Evaluate the models

In [31]:
# Calculate metrics for Linear Regression
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_mae = mean_absolute_error(y_test, y_pred_lr)
lr_r2 = r2_score(y_test, y_pred_lr)

# Calculate metrics for Decision Tree Regressor
dt_mse = mean_squared_error(y_test, y_pred_dt)
dt_mae = mean_absolute_error(y_test, y_pred_dt)
dt_r2 = r2_score(y_test, y_pred_dt)

# Calculate metrics for Random Forest Regressor
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_mae = mean_absolute_error(y_test, y_pred_rf)
rf_r2 = r2_score(y_test, y_pred_rf)

### Step 7: Print the results

In [32]:
print("Linear Regression:")
print(f"Mean Squared Error: {lr_mse}")
print(f"Mean Absolute Error: {lr_mae}")
print(f"R-squared: {lr_r2}\n")

print("Decision Tree Regressor:")
print(f"Mean Squared Error: {dt_mse}")
print(f"Mean Absolute Error: {dt_mae}")
print(f"R-squared: {dt_r2}\n")

print("Random Forest Regressor:")
print(f"Mean Squared Error: {rf_mse}")
print(f"Mean Absolute Error: {rf_mae}")
print(f"R-squared: {rf_r2}\n")

Linear Regression:
Mean Squared Error: 1.4339619794957914e+23
Mean Absolute Error: 74594902286.35455
R-squared: -6.3000585015958086e+22

Decision Tree Regressor:
Mean Squared Error: 0.055960230245944514
Mean Absolute Error: 0.14610151753008896
R-squared: 0.9754140814503195

Random Forest Regressor:
Mean Squared Error: 0.029603042825745635
Mean Absolute Error: 0.11486457352171625
R-squared: 0.9869940134889057



### Step 8: Comparison

In [33]:
# Create a DataFrame to compare the model performance
comparison_df = pd.DataFrame({
    'Model': ['Linear Regression', 'Decision Tree', 'Random Forest'],
    'Mean Squared Error (MSE)': [lr_mse, dt_mse, rf_mse],
    'Mean Absolute Error (MAE)': [lr_mae, dt_mae, rf_mae],
    'R-squared': [lr_r2, dt_r2, rf_r2]
})

# Display the comparison table
print(comparison_df)

# Sort the comparison table by R-squared (descending) to identify the best model
comparison_df_sorted = comparison_df.sort_values(by='R-squared', ascending=False)

# Display the sorted comparison table
print("\nComparison Sorted by R-squared:")
print(comparison_df_sorted)

               Model  Mean Squared Error (MSE)  Mean Absolute Error (MAE)  \
0  Linear Regression              1.433962e+23               7.459490e+10   
1      Decision Tree              5.596023e-02               1.461015e-01   
2      Random Forest              2.960304e-02               1.148646e-01   

      R-squared  
0 -6.300059e+22  
1  9.754141e-01  
2  9.869940e-01  

Comparison Sorted by R-squared:
               Model  Mean Squared Error (MSE)  Mean Absolute Error (MAE)  \
2      Random Forest              2.960304e-02               1.148646e-01   
1      Decision Tree              5.596023e-02               1.461015e-01   
0  Linear Regression              1.433962e+23               7.459490e+10   

      R-squared  
2  9.869940e-01  
1  9.754141e-01  
0 -6.300059e+22  
