In [98]:
import numpy as np
import pandas as pd
from sklearn import linear_model
import statsmodels.api as sm

In [99]:
# Load the offline Yelp and FSQ dataframes from their CSV files
# (paths are relative to the notebook's working directory).
yelp_dataframe_offline, fsq_dataframe_offline = map(
    pd.read_csv,
    ("../data/yelp_dataframe.csv", "../data/fsq_dataframe.csv"),
)

Since our previous heatmap showed little correlation between any of the variables in the dataframes we created up to this point, we're going to create a new variable that works like a reverse radius search: instead of asking which bars are near a bike station, it counts how many bike stations are near each bar. The code below creates a new reference dataframe for FSQ and for Yelp listing the bar IDs that have multiple nearby bike station hits, i.e. more than one bike station within 1 km.

In [100]:
# Count the rows per venue ID and keep only the IDs that occur more than once.
# Each result is a one-column frame indexed by venue ID.
yelp_multi_hits = (
    yelp_dataframe_offline['yelp_id']
    .value_counts()
    .loc[lambda hits: hits > 1]
    .to_frame()
)
fsq_multi_hits = (
    fsq_dataframe_offline['fsq_id']
    .value_counts()
    .loc[lambda hits: hits > 1]
    .to_frame()
)

The next two cells merge each reference dataframe back into its FSQ or Yelp dataframe to produce a new dataframe with a 'Nearby Bike Stations' column. The bike station ID and distance columns are dropped so that duplicate rows can be removed. Null values are also replaced with zeroes (in the price column for Yelp, and in the price, rating, and total ratings columns for FSQ).

In [None]:
# Attach the per-venue hit count to every Yelp row, then collapse to one row
# per venue. Built as a single non-mutating chain so the cell is idempotent.
yelp_with_counts = (
    yelp_dataframe_offline
    # Default inner join: only venue IDs present in yelp_multi_hits survive.
    .merge(yelp_multi_hits, left_on='yelp_id', right_index=True)
    # These columns differ per bike station; dropping them lets the
    # repeated venue rows become exact duplicates.
    .drop(columns=['bike_station_id', 'distance'])
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'count': 'nearby_bike_stations'})
    # Fill at the frame level instead of `df['price'].fillna(0, inplace=True)`:
    # the in-place form acts on a column selection, which warns on recent
    # pandas and has no effect once Copy-on-Write is enabled.
    .fillna({'price': 0})
)

In [None]:
# Attach the per-venue hit count to every FSQ row, then collapse to one row
# per venue. Built as a single non-mutating chain so the cell is idempotent.
fsq_with_counts = (
    fsq_dataframe_offline
    # Default inner join: only venue IDs present in fsq_multi_hits survive.
    .merge(fsq_multi_hits, left_on='fsq_id', right_index=True)
    # These columns differ per bike station; dropping them lets the
    # repeated venue rows become exact duplicates.
    .drop(columns=['bike_station_id', 'distance'])
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'count': 'nearby_bike_stations'})
    # Fill all three columns at the frame level instead of mixing assignment
    # with `df[col].fillna(0, inplace=True)`: the in-place form acts on a
    # column selection, which warns on recent pandas and has no effect once
    # Copy-on-Write is enabled.
    .fillna({'total_ratings': 0, 'rating': 0, 'price': 0})
)

Since many of the values in the price, rating, and total ratings columns were null and have been replaced with zeroes, we create separate dataframes with any rows that contain zeroes removed, giving us a filtered dataset to work with. We acknowledge that this also drops some results that had only partial values filled in, in exchange for a more narrowly focused multivariate regression output.

In [103]:
# Keep only venues where price, rating AND total_ratings are all non-zero.
# The previous filter joined the rating/total_ratings checks with `|`, which
# kept rows where just one of the two was zero, contradicting the stated goal
# of removing every row that contains a zero.
fsq_with_counts_no_zeroes = fsq_with_counts[
    (fsq_with_counts[['price', 'rating', 'total_ratings']] != 0).all(axis=1)
]
fsq_with_counts_no_zeroes

Unnamed: 0,fsq_id,venue_name,category,address,price,rating,total_ratings,nearby_bike_stations
4,5a21c7e4ee712062bf95eab3,Kiitos Brewing,Bar,608 W 700 S,2.0,7.8,29.0,3
9,5df058d7c834950007db2cee,Bewilder Brewing Co.,Bar,445 S 400 W,2.0,7.5,8.0,10
11,58af7a6f34935540dc767706,Fisher Brewing Company,Bar,320 W 800 S,2.0,8.3,49.0,5
13,4c11aa8dd41e76b06ee9310d,EXTRA INNINGS - Sheraton Salt Lake City,Bar,150 W 500 S,2.0,5.4,12.0,2
23,4af1d93bf964a520bde321e3,Club Try-Angles,Gay Bar,251 W 900 S,1.0,6.8,63.0,4
...,...,...,...,...,...,...,...,...
184,4b330906f964a520341725e3,Bar Deluxe,Night Club,666 S State St,2.0,7.8,22.0,7
185,588c03967220e605c386ca91,Purgatory,Cocktail Bar,62 E 700 S,2.0,7.8,60.0,6
188,58704fd6e753561092637431,Water Witch,Cocktail Bar,163 W 900 S,3.0,8.9,29.0,4
192,5bd39846088158002c14ed17,Templin Family Brewing,Bar,936 S 300 W,2.0,8.9,38.0,3


In [104]:
# Keep only venues where price, rating AND total_ratings are all non-zero.
# The previous filter joined the rating/total_ratings checks with `|`, which
# kept rows where just one of the two was zero, contradicting the stated goal
# of removing every row that contains a zero.
yelp_with_counts_no_zeroes = yelp_with_counts[
    (yelp_with_counts[['price', 'rating', 'total_ratings']] != 0).all(axis=1)
]
yelp_with_counts_no_zeroes

Unnamed: 0,yelp_id,venue_name,category,address,price,rating,total_ratings,nearby_bike_stations
0,zLBmNC_uNqkeVZDvbGW7iw,Slackwater - Salt Lake City,Pizza,684 S 500 W,2.0,4.6,562,5
3,XoN1n1_LgWzPmpJGqPCO-Q,Club X,Dance Clubs,445 S 400 W,2.0,3.8,5,12
4,oR5m2Yn5lRX22MLyJnMnUQ,Kiitos Brewing,Breweries,608 W 700th S,2.0,4.6,141,5
5,sSLN1cetv8Fc5_UVihwjMg,Woodbine Food Hall And Bar,Pizza,545 W 700 S,2.0,4.5,43,6
9,o61YFtgc7uekFG_w7GFUAA,Level Crossing Brewing Company,Pizza,550 S 300 W,2.0,4.5,41,11
...,...,...,...,...,...,...,...,...
176,BUbYe351xCCNV9cS2J0AAg,Rio Grande Cafe,Mexican,258 S 1300 E,2.0,3.6,263,2
177,1YA6GbJ5BY6OFcVFZIqBPQ,Cucina,Delis,1026 E 2nd Ave,2.0,4.1,262,2
178,HzqgVQppjNHjTAIbBB2q1g,Wing Wah Restaurant,Chinese,465 12th St,2.0,3.3,103,2
179,f2TkM9bmBuSSo6N76E1Zog,Lucky 13,Burgers,135 W 1300th S,2.0,4.0,2171,3


Build a regression model.

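Both regression cells below store their fit in the same `results` variable, so if you are running the code yourself, re-run the cell for the model you want immediately before printing its summary.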

In [105]:
# Yelp model: regress the nearby-bike-station count on price, total ratings
# and rating using ordinary least squares.
y = yelp_with_counts_no_zeroes['nearby_bike_stations']
# add_constant adds a column of 1's so the model will contain an intercept
X = sm.add_constant(
    yelp_with_counts_no_zeroes[['price', 'total_ratings', 'rating']]
)
model = sm.OLS(endog=y, exog=X)
results = model.fit()

In [107]:
# FSQ model: regress the nearby-bike-station count on price, total ratings
# and rating using ordinary least squares.
y = fsq_with_counts_no_zeroes['nearby_bike_stations']
# add_constant adds a column of 1's so the model will contain an intercept
X = sm.add_constant(
    fsq_with_counts_no_zeroes[['price', 'total_ratings', 'rating']]
)
model = sm.OLS(endog=y, exog=X)
results = model.fit()

Provide model output and an interpretation of the results. 

(Two outputs are provided here, one for each of the two models produced.)

Below is the model for Yelp:

In [106]:
# `results` is rebound by the FSQ regression cell that precedes this one in
# notebook order, so on Restart & Run All it would hold the FSQ fit here and
# this "Yelp" output would silently show the wrong model. Refit the Yelp model
# locally (without touching `results`) so this cell always reports Yelp.
yelp_design = sm.add_constant(
    yelp_with_counts_no_zeroes[['price', 'total_ratings', 'rating']]
)
yelp_results = sm.OLS(
    yelp_with_counts_no_zeroes['nearby_bike_stations'], yelp_design
).fit()
print(yelp_results.summary())

                             OLS Regression Results                             
Dep. Variable:     nearby_bike_stations   R-squared:                       0.089
Model:                              OLS   Adj. R-squared:                  0.062
Method:                   Least Squares   F-statistic:                     3.339
Date:                  Mon, 16 Dec 2024   Prob (F-statistic):             0.0222
Time:                          14:32:17   Log-Likelihood:                -345.13
No. Observations:                   107   AIC:                             698.3
Df Residuals:                       103   BIC:                             708.9
Df Model:                             3                                         
Covariance Type:              nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            18.7979  

---

and below is the model for FSQ:

In [108]:
# Print the OLS summary table for the fit currently bound to `results`;
# in notebook order the most recent assignment is the FSQ regression cell.
print(results.summary())

                             OLS Regression Results                             
Dep. Variable:     nearby_bike_stations   R-squared:                       0.052
Model:                              OLS   Adj. R-squared:                  0.006
Method:                   Least Squares   F-statistic:                     1.141
Date:                  Mon, 16 Dec 2024   Prob (F-statistic):              0.339
Time:                          14:32:23   Log-Likelihood:                -215.11
No. Observations:                    67   AIC:                             438.2
Df Residuals:                        63   BIC:                             447.0
Df Model:                             3                                         
Covariance Type:              nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            15.9092  

Some of the takeaways we can interpret from these results:

    1. In the Yelp model, price and total ratings seem to have little relationship with the number of nearby bike stations, although the model suggests the actual rating of the bar is somewhat related to it. However, with an R-squared of only 0.089, the model explains very little of the variation, so this result should be treated with caution.

    2. In contrast, the FSQ model shows no meaningful relationship between the number of nearby bike stations and price, rating, or total ratings (Prob (F-statistic) = 0.339).
    
    3. Coincidentally, in both models the rating coefficient suggests that ratings go down as the number of nearby bike stations goes up. Maybe these pesky nearby bikers are to blame! But the likely answer is that it's just a minor coincidence.