In [40]:
# Import necessary libraries for data manipulation and machine learning
import numpy as np                 # For numerical operations on arrays and matrices
import pandas as pd                # For data manipulation and analysis
import matplotlib.pyplot as plt    # For creating static visualizations
import seaborn as sns              # For statistical data visualization based on matplotlib

# Import modules for model evaluation and selection
from sklearn.model_selection import train_test_split, cross_val_score    # For splitting data and cross-validation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score    # For model evaluation metrics

# Import machine learning algorithms
from sklearn.linear_model import LinearRegression    # For linear regression
from sklearn.ensemble import RandomForestRegressor  # For random forest regression
from sklearn.svm import SVR                          # For support vector regression
import xgboost as xgb                                # For XGBoost regression
import lightgbm as lgb                                # For LightGBM regression
from catboost import CatBoostRegressor               # For CatBoost regression

# Import modules for advanced model stacking techniques
from sklearn.ensemble import StackingRegressor        # For stacking multiple regressors
from mlxtend.regressor import StackingCVRegressor     # For stacked generalization with cross-validation

# Import additional libraries for hyperparameter tuning
import optuna    # For hyperparameter optimization

# Import metrics for additional model evaluation
from sklearn import metrics

In [41]:
# Define the path to the CSV file containing the data
path = r'C:\Users\User\Desktop\Rashad\DATA\restaurant_data.csv'

# Read the CSV file into a pandas DataFrame
data = pd.read_csv(path)

# Display the DataFrame to view the loaded data
data

Unnamed: 0,Name,Location,Cuisine,Rating,Seating Capacity,Average Meal Price,Marketing Budget,Social Media Followers,Chef Experience Years,Number of Reviews,Avg Review Length,Ambience Score,Service Quality Score,Parking Availability,Weekend Reservations,Weekday Reservations,Revenue
0,Restaurant 0,Rural,Japanese,4.0,38,73.98,2224,23406,13,185,161.924906,1.3,7.0,Yes,13,4,638945.52
1,Restaurant 1,Downtown,Mexican,3.2,76,28.11,4416,42741,8,533,148.759717,2.6,3.4,Yes,48,6,490207.83
2,Restaurant 2,Rural,Italian,4.7,48,48.29,2796,37285,18,853,56.849189,5.3,6.7,No,27,14,541368.62
3,Restaurant 3,Rural,Italian,4.4,34,51.55,1167,15214,13,82,205.433265,4.6,2.8,Yes,9,17,404556.80
4,Restaurant 4,Downtown,Japanese,4.9,88,75.98,3639,40171,9,78,241.681584,8.6,2.1,No,37,26,1491046.35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8363,Restaurant 8363,Suburban,Indian,3.4,54,34.85,1102,11298,11,380,253.919515,9.5,5.0,Yes,37,0,434653.45
8364,Restaurant 8364,Rural,Indian,3.7,49,36.88,1988,20432,9,713,175.590195,2.7,2.6,No,37,21,414977.92
8365,Restaurant 8365,Downtown,Italian,4.7,88,46.87,5949,63945,6,436,222.953647,4.8,1.7,Yes,83,21,930395.87
8366,Restaurant 8366,Rural,American,3.1,31,44.53,707,7170,1,729,178.482851,6.1,2.1,No,6,21,311493.48


In [42]:
data['Location'].value_counts()

Location
Downtown    2821
Suburban    2785
Rural       2762
Name: count, dtype: int64

In [43]:
data['Cuisine'].value_counts()

Cuisine
French      1433
American    1416
Italian     1413
Mexican     1393
Indian      1369
Japanese    1344
Name: count, dtype: int64

In [44]:
df_encoded = pd.get_dummies(data, columns=["Location", "Cuisine"], dtype=int)

In [45]:
df_encoded['Parking Availability'] = df_encoded['Parking Availability'].map({'yes': 1, 'no': 0})

In [46]:
df_encoded

Unnamed: 0,Name,Rating,Seating Capacity,Average Meal Price,Marketing Budget,Social Media Followers,Chef Experience Years,Number of Reviews,Avg Review Length,Ambience Score,...,Revenue,Location_Downtown,Location_Rural,Location_Suburban,Cuisine_American,Cuisine_French,Cuisine_Indian,Cuisine_Italian,Cuisine_Japanese,Cuisine_Mexican
0,Restaurant 0,4.0,38,73.98,2224,23406,13,185,161.924906,1.3,...,638945.52,0,1,0,0,0,0,0,1,0
1,Restaurant 1,3.2,76,28.11,4416,42741,8,533,148.759717,2.6,...,490207.83,1,0,0,0,0,0,0,0,1
2,Restaurant 2,4.7,48,48.29,2796,37285,18,853,56.849189,5.3,...,541368.62,0,1,0,0,0,0,1,0,0
3,Restaurant 3,4.4,34,51.55,1167,15214,13,82,205.433265,4.6,...,404556.80,0,1,0,0,0,0,1,0,0
4,Restaurant 4,4.9,88,75.98,3639,40171,9,78,241.681584,8.6,...,1491046.35,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8363,Restaurant 8363,3.4,54,34.85,1102,11298,11,380,253.919515,9.5,...,434653.45,0,0,1,0,0,1,0,0,0
8364,Restaurant 8364,3.7,49,36.88,1988,20432,9,713,175.590195,2.7,...,414977.92,0,1,0,0,0,1,0,0,0
8365,Restaurant 8365,4.7,88,46.87,5949,63945,6,436,222.953647,4.8,...,930395.87,1,0,0,0,0,0,1,0,0
8366,Restaurant 8366,3.1,31,44.53,707,7170,1,729,178.482851,6.1,...,311493.48,0,1,0,1,0,0,0,0,0


In [47]:
df_encoded.columns

Index(['Name', 'Rating', 'Seating Capacity', 'Average Meal Price',
       'Marketing Budget', 'Social Media Followers', 'Chef Experience Years',
       'Number of Reviews', 'Avg Review Length', 'Ambience Score',
       'Service Quality Score', 'Parking Availability', 'Weekend Reservations',
       'Weekday Reservations', 'Revenue', 'Location_Downtown',
       'Location_Rural', 'Location_Suburban', 'Cuisine_American',
       'Cuisine_French', 'Cuisine_Indian', 'Cuisine_Italian',
       'Cuisine_Japanese', 'Cuisine_Mexican'],
      dtype='object')

In [48]:
df_encoded.drop('Name',axis=1,inplace=True)

In [49]:
# Separate the input features (independent variables) from the target variable (dependent variable)
inputs = df_encoded.drop('Revenue', axis=1)  # Input features (excluding 'Life expectancy' column)
output = df_encoded['Revenue']              # Target variable ('Life expectancy' column)

# Split the data into training and testing sets
# The training set will be used to train the model, and the testing set will be used to evaluate its performance
# The test_size parameter specifies the proportion of the dataset to include in the testing set
# The random_state parameter ensures reproducibility of the split
X_train, X_test, y_train, y_test = train_test_split(inputs, output, test_size=0.3, random_state=42)

In [50]:
models = []

# Define default models for each algorithm
xgb_model_def = xgb.XGBRegressor()
lgb_model_def = lgb.LGBMRegressor()
catboost_model_def = CatBoostRegressor()

# Define models for stacking
stacking_models = [('XGBoost', xgb_model_def),
                   ('LightGBM', lgb_model_def),
                   ('CatBoost', catboost_model_def)
                   ]

# Extend models list with default models
models.extend([
    ('XGBoost', xgb_model_def),
    ('LightGBM', lgb_model_def),
    ('CatBoost', catboost_model_def)
])

In [51]:
def train_and_evaluate_model(model_name, model, X_train, y_train, X_test, y_test):
    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Predict labels on the training data
    y_train_pred = model.predict(X_train)

    # Predict labels on the testing data
    y_test_pred = model.predict(X_test)

    # Calculate evaluation metrics for training data
    train_mae = mean_absolute_error(y_train, y_train_pred)
    train_mse = mean_squared_error(y_train, y_train_pred)
    train_r2 = r2_score(y_train, y_train_pred)

    # Calculate evaluation metrics for testing data
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    # Print model performance metrics
    print(f'Model Performance for {model_name}')
    print('Training Data:')
    print('Mean Absolute Error:', train_mae)
    print('Mean Squared Error:', train_mse)
    print('R-squared:', train_r2)
    print('Testing Data:')
    print('Mean Absolute Error:', test_mae)
    print('Mean Squared Error:', test_mse)
    print('R-squared:', test_r2)

    return train_r2, test_r2

In [52]:
# Initialize the DataFrame
r2_df = pd.DataFrame(columns=['Model', 'Train R2', 'Test R2'])

# Train and evaluate each model
for model_name, model in models:
    train_r2, test_r2 = train_and_evaluate_model(model_name, model, X_train, y_train, X_test, y_test)
        
    r2_df = pd.concat([r2_df, pd.DataFrame({'Model': [model_name], 'Train R2': [train_r2], 'Test R2': [test_r2]})], ignore_index=True)

r2_df_sorted = r2_df.sort_values(by='Test R2', ascending=False)


Model Performance for XGBoost
Training Data:
Mean Absolute Error: 2094.455318315691
Mean Squared Error: 7857959.6673647445
R-squared: 0.999890288081818
Testing Data:
Mean Absolute Error: 6329.524530067702
Mean Squared Error: 68518162.3732269
R-squared: 0.9990377369575844
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001424 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1758
[LightGBM] [Info] Number of data points in the train set: 5857, number of used features: 21
[LightGBM] [Info] Start training from score 657299.027232
Model Performance for LightGBM
Training Data:
Mean Absolute Error: 3414.758624065742
Mean Squared Error: 19210981.75178441
R-squared: 0.9997317785090065
Testing Data:
Mean Absolute Error: 5030.424944867468
Mean Squared Error: 42713341.98559954
R-squared: 0.99940013758415
Learning rate set to 0.054134
0:	learn: 254963

In [53]:
r2_df_sorted

Unnamed: 0,Model,Train R2,Test R2
2,CatBoost,0.99994,0.999872
1,LightGBM,0.999732,0.9994
0,XGBoost,0.99989,0.999038
