# Initial Steps

In [7]:
"""

This notebook contains the linear regression models 

"""

'\n\nThis notebook contains the linear regression models \n\n'

In [13]:
# Import Libraries

# Data Manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Models
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
import statsmodels.api as sm


# Metrics
from sklearn.metrics import r2_score


In [14]:
# Load Data
# Read the combined hotels / points-of-interest table from a CSV file located
# relative to the notebook's working directory.
df_hotels_POIs = pd.read_csv('hotels_and_POIs.csv')

In [15]:
# Check each column's dtype and non-null count; a count below the number of
# rows flags missing values in that column (numeric features must be complete
# before they are passed to the regressions below).
df_hotels_POIs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   hotel_id               200 non-null    object 
 1   hotel_name             200 non-null    object 
 2   hotel_review_count     200 non-null    int64  
 3   hotel_categories       200 non-null    object 
 4   hotel_rating           200 non-null    float64
 5   hotel_price            200 non-null    int64  
 6   hotel_distance         200 non-null    float64
 7   hotel_latitude         200 non-null    float64
 8   hotel_longitude        200 non-null    float64
 9   hotel_address1         199 non-null    object 
 10  hotel_city             200 non-null    object 
 11  hotel_zip_code         198 non-null    object 
 12  hotel_country          200 non-null    object 
 13  hotel_state            200 non-null    object 
 14  hotel_display_address  200 non-null    object 
 15  mean_p

# 1st Linear Regression Model - Predicting Hotel Rating

In [16]:
# Display the column labels of the loaded frame.
df_hotels_POIs.columns

Index(['hotel_id', 'hotel_name', 'hotel_review_count', 'hotel_categories',
       'hotel_rating', 'hotel_price', 'hotel_distance', 'hotel_latitude',
       'hotel_longitude', 'hotel_address1', 'hotel_city', 'hotel_zip_code',
       'hotel_country', 'hotel_state', 'hotel_display_address',
       'mean_poi_price', 'mean_poi_rating', 'mean_poi_review_count',
       'count_poi'],
      dtype='object')

In [26]:
# Hotel rating, full model: regress the rating on all 9 numeric predictors

dependent_column = 'hotel_rating'

independent_columns = [
    'hotel_review_count',
    'hotel_price',
    'hotel_distance',
    'hotel_latitude',
    'hotel_longitude',
    'mean_poi_price',
    'mean_poi_rating',
    'mean_poi_review_count',
    'count_poi',
]

# Response vector, and design matrix with a constant (intercept) column added
y = df_hotels_POIs[dependent_column]
X = sm.add_constant(df_hotels_POIs[independent_columns])

# Specify the ordinary least squares model, then estimate it
model = sm.OLS(endog=y, exog=X)
results = model.fit()

# Show the fit summary, a blank spacer line, and the largest p-value among the fitted terms
print(results.summary())
print()
print('Largest p-value: %s' % results.pvalues.max())

                            OLS Regression Results                            
Dep. Variable:           hotel_rating   R-squared:                       0.141
Model:                            OLS   Adj. R-squared:                  0.100
Method:                 Least Squares   F-statistic:                     3.460
Date:                Fri, 14 Jul 2023   Prob (F-statistic):           0.000564
Time:                        04:05:03   Log-Likelihood:                -272.56
No. Observations:                 200   AIC:                             565.1
Df Residuals:                     190   BIC:                             598.1
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                  -252.34

In [36]:
# Backward elimination: drop the predictor with the largest p-value
# (hotel_review_count on the first pass), refit, and repeat until every
# p-value is below the 0.05 significance level.
# Two predictors remain: hotel_price and count_poi.

dependent_column = 'hotel_rating'

independent_columns = ['hotel_price', 'count_poi']

# Response vector, and design matrix with a constant (intercept) column added
y = df_hotels_POIs[dependent_column]
X = sm.add_constant(df_hotels_POIs[independent_columns])

# Specify the ordinary least squares model, then estimate it
model = sm.OLS(endog=y, exog=X)
results = model.fit()

# Show the fit summary, a blank spacer line, and the largest p-value among the fitted terms
print(results.summary())
print()
print('Largest p-value: %s' % results.pvalues.max())

                            OLS Regression Results                            
Dep. Variable:           hotel_rating   R-squared:                       0.108
Model:                            OLS   Adj. R-squared:                  0.099
Method:                 Least Squares   F-statistic:                     11.90
Date:                Fri, 14 Jul 2023   Prob (F-statistic):           1.33e-05
Time:                        04:12:37   Log-Likelihood:                -276.34
No. Observations:                 200   AIC:                             558.7
Df Residuals:                     197   BIC:                             568.6
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const           2.1042      0.282      7.468      

# 2nd Linear Regression Model - Predicting Hotel Price

In [39]:
# Hotel price, full model: regress the price on all 9 numeric predictors

dependent_column = 'hotel_price'

independent_columns = [
    'hotel_review_count',
    'hotel_rating',
    'hotel_distance',
    'hotel_latitude',
    'hotel_longitude',
    'mean_poi_price',
    'mean_poi_rating',
    'mean_poi_review_count',
    'count_poi',
]

# Response vector, and design matrix with a constant (intercept) column added
y = df_hotels_POIs[dependent_column]
X = sm.add_constant(df_hotels_POIs[independent_columns])

# Specify the ordinary least squares model, then estimate it
model = sm.OLS(endog=y, exog=X)
results = model.fit()

# Show the fit summary, a blank spacer line, and the largest p-value among the fitted terms
print(results.summary())
print()
print('Largest p-value: %s' % results.pvalues.max())

                            OLS Regression Results                            
Dep. Variable:            hotel_price   R-squared:                       0.283
Model:                            OLS   Adj. R-squared:                  0.249
Method:                 Least Squares   F-statistic:                     8.342
Date:                Fri, 14 Jul 2023   Prob (F-statistic):           1.89e-10
Time:                        04:18:21   Log-Likelihood:                -128.20
No. Observations:                 200   AIC:                             276.4
Df Residuals:                     190   BIC:                             309.4
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                   -42.18

In [47]:
# Backward elimination: drop the predictor with the largest p-value, refit,
# and repeat until every p-value is below the 0.05 significance level.
# Four predictors remain: hotel_review_count, hotel_rating, mean_poi_price
# and count_poi.
# NOTE: hotel_review_count is retained in this model; an earlier version of
# this comment wrongly listed it as the feature that was removed.

dependent_column = 'hotel_price'

independent_columns = [
    'hotel_review_count',
    'hotel_rating',
    'mean_poi_price',
    'count_poi',
]

# Response vector, and design matrix with a constant (intercept) column added
y = df_hotels_POIs[dependent_column]
X = sm.add_constant(df_hotels_POIs[independent_columns])

# Specify the ordinary least squares model, then estimate it
model = sm.OLS(endog=y, exog=X)
results = model.fit()

# Show the fit summary, a blank spacer line, and the largest p-value among the fitted terms
print(results.summary())
print()
print('Largest p-value: %s' % results.pvalues.max())

                            OLS Regression Results                            
Dep. Variable:            hotel_price   R-squared:                       0.261
Model:                            OLS   Adj. R-squared:                  0.246
Method:                 Least Squares   F-statistic:                     17.25
Date:                Fri, 14 Jul 2023   Prob (F-statistic):           3.93e-12
Time:                        04:20:52   Log-Likelihood:                -131.21
No. Observations:                 200   AIC:                             272.4
Df Residuals:                     195   BIC:                             288.9
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                  0.9112      0

# 3rd Linear Regression Model - Predicting Hotel Review Count (as a measure of popularity)

In [48]:
# Hotel review count, full model: regress the review count on all 9 numeric predictors

dependent_column = 'hotel_review_count'

independent_columns = [
    'hotel_price',
    'hotel_rating',
    'hotel_distance',
    'hotel_latitude',
    'hotel_longitude',
    'mean_poi_price',
    'mean_poi_rating',
    'mean_poi_review_count',
    'count_poi',
]

# Response vector, and design matrix with a constant (intercept) column added
y = df_hotels_POIs[dependent_column]
X = sm.add_constant(df_hotels_POIs[independent_columns])

# Specify the ordinary least squares model, then estimate it
model = sm.OLS(endog=y, exog=X)
results = model.fit()

# Show the fit summary, a blank spacer line, and the largest p-value among the fitted terms
print(results.summary())
print()
print('Largest p-value: %s' % results.pvalues.max())

                            OLS Regression Results                            
Dep. Variable:     hotel_review_count   R-squared:                       0.250
Model:                            OLS   Adj. R-squared:                  0.214
Method:                 Least Squares   F-statistic:                     7.037
Date:                Fri, 14 Jul 2023   Prob (F-statistic):           9.20e-09
Time:                        04:22:48   Log-Likelihood:                -1073.9
No. Observations:                 200   AIC:                             2168.
Df Residuals:                     190   BIC:                             2201.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                  4560.93

In [57]:
# Backward elimination: drop the predictor with the largest p-value, refit,
# and repeat until every p-value is below the 0.05 significance level.
# Two predictors remain: hotel_price and count_poi.
# NOTE: hotel_review_count is the dependent variable here, so it was never a
# candidate for removal; an earlier version of this comment wrongly named it.

dependent_column = 'hotel_review_count'

independent_columns = ['hotel_price', 'count_poi']

# Response vector, and design matrix with a constant (intercept) column added
y = df_hotels_POIs[dependent_column]
X = sm.add_constant(df_hotels_POIs[independent_columns])

# Specify the ordinary least squares model, then estimate it
model = sm.OLS(endog=y, exog=X)
results = model.fit()

# Show the fit summary, a blank spacer line, and the largest p-value among the fitted terms
print(results.summary())
print()
print('Largest p-value: %s' % results.pvalues.max())

                            OLS Regression Results                            
Dep. Variable:     hotel_review_count   R-squared:                       0.244
Model:                            OLS   Adj. R-squared:                  0.236
Method:                 Least Squares   F-statistic:                     31.73
Date:                Fri, 14 Jul 2023   Prob (F-statistic):           1.14e-12
Time:                        04:27:49   Log-Likelihood:                -1074.8
No. Observations:                 200   AIC:                             2156.
Df Residuals:                     197   BIC:                             2165.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const         -54.7634     15.264     -3.588      

# Conclusions

In [None]:
"""
The 3 models for hotel_price, hotel_rating, and hotel_review_count all have very small F-statistic p-values but small R-squared values (roughly 0.11 to 0.26 for the reduced models).  This suggests that there is a statistically significant relationship
between the independent and dependent variables, but that most of the variance is explained by factors that have not been included in this analysis.  

The next steps for this project would be to get better data on the hotel features (number of rooms, free internet, cleanliness, walkability score, etc.) to see if they could be used to 
build a model that better explains the variance in the dependent variable.

"""