Build a regression model.

In [18]:
#imports
import requests
import pandas as pd
import numpy as np
import json
import os
from dotenv import load_dotenv
import statsmodels.api as sm
#from sklearn.metrics import mean_absolute_error, r2_score
#from sklearn.preprocessing import PolynomialFeatures
#from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns

In [36]:
# Null hypothesis: there is no correlation between the number of bikes at a given station and the chosen Foursquare and Yelp metrics.

In [19]:
# Load the per-station table (bike counts plus aggregated Foursquare / Yelp POI metrics).
# The path is relative, so the CSV must sit in the notebook's working directory.
df_stations = pd.read_csv('df_stations.csv')
# Preview the first rows to confirm the columns loaded as expected.
df_stations.head()

Unnamed: 0,station_id,station_name,total_bikes,mean_fsq_POI_popularity,mean_fsq_POI_price,mean_fsq_POI_rating,count_fsq_POIs,mean_yelp_price,mean_yelp_rating,mean_yelp_review_count,count_yelp_POIs
0,03712bfd188583696010813718ea01e1,P155 Colter St / Central Ave,12,0.954243,1.636364,8.05,26,1.875,3.913793,492.551724,29
1,07a67f4f4e097e3e5b85fd2bf6ab0809,P175 9th St. / Van Buren St,13,0.958142,1.75,8.175,4,2.0,4.03,234.92,50
2,07aa8e1d89b85fdf5e0fba51043ba0ed,P101 5th St / Grant St,10,0.716138,1.333333,6.7,11,2.151515,3.825581,315.674419,43
3,0cab138d1a9b55f141311a4a90e96d28,P168 5th St. and Washington,15,0.913278,1.892857,7.488235,37,2.065217,4.0,384.18,50
4,0d985cfd2b567116f790a1a7e31d5ad7,T09 Washington St / Center Pkwy,9,0.994972,,7.3,2,2.0,4.0,100.4,5


In [35]:
# Correlation of every POI metric with total_bikes (pandas' default Pearson method).
# The correlations look weak, and the scatter plots in the 'joining_data' notebook also
# show very weak correlation, so it is safe to say it will be difficult to build a model
# that successfully predicts the number of bikes at a given station.  But I'll try anyway...
# NOTE(review): the execution counts suggest this cell was last run after the mean-imputation
# cell below; on a fresh top-to-bottom run corr() sees the un-imputed data (NaNs are excluded
# pairwise), so the figures may differ slightly from the saved output.

corr_columns = [
    'total_bikes',
    'mean_fsq_POI_popularity',
    'mean_fsq_POI_price',
    'mean_fsq_POI_rating',
    'count_fsq_POIs',
    'mean_yelp_price',
    'mean_yelp_rating',
    'mean_yelp_review_count',
    'count_yelp_POIs',
]
df_stations[corr_columns].corr().loc[:, ['total_bikes']]

Unnamed: 0,total_bikes
total_bikes,1.0
mean_fsq_POI_popularity,0.084491
mean_fsq_POI_price,-0.044155
mean_fsq_POI_rating,0.00382
count_fsq_POIs,0.028878
mean_yelp_price,-0.042323
mean_yelp_rating,0.023917
mean_yelp_review_count,-0.040234
count_yelp_POIs,0.055224


In [22]:
# Share of non-null entries in each column (1.0 means the column has no missing values)
non_null_share = df_stations.notna().mean()
print(non_null_share)

station_id                 1.000000
station_name               1.000000
total_bikes                1.000000
mean_fsq_POI_popularity    0.938776
mean_fsq_POI_price         0.928571
mean_fsq_POI_rating        0.918367
count_fsq_POIs             1.000000
mean_yelp_price            0.969388
mean_yelp_rating           0.989796
mean_yelp_review_count     0.989796
count_yelp_POIs            1.000000
dtype: float64


In [30]:
# Fill null values with the mean of their own column.
#
# All eight model inputs are imputed in one vectorised step instead of eight
# copy-pasted statements: DataFrame.fillna accepts a Series keyed by column name,
# so each column is filled with its own mean.  df_stations is updated in place
# (same name) because the modelling cells below read it directly.
# In the current data the two count_* columns have no nulls, so they are unaffected.

impute_columns = [
    'mean_fsq_POI_popularity',
    'mean_fsq_POI_price',
    'mean_fsq_POI_rating',
    'count_fsq_POIs',
    'mean_yelp_price',
    'mean_yelp_rating',
    'mean_yelp_review_count',
    'count_yelp_POIs',
]
df_stations[impute_columns] = df_stations[impute_columns].fillna(
    df_stations[impute_columns].mean()
)

In [31]:
# Set up the full multivariate linear regression: total_bikes explained by all eight POI metrics

predictor_columns = [
    'mean_fsq_POI_popularity',
    'mean_fsq_POI_price',
    'mean_fsq_POI_rating',
    'count_fsq_POIs',
    'mean_yelp_price',
    'mean_yelp_rating',
    'mean_yelp_review_count',
    'count_yelp_POIs',
]

y = df_stations['total_bikes']

# sm.OLS does not add an intercept on its own, so prepend the constant column explicitly
X = sm.add_constant(df_stations[predictor_columns])
lin_reg = sm.OLS(y, X)

In [32]:
# Fit the full model and print the OLS regression results.
# The p-values are all very high (R-squared 0.020, Prob (F-statistic) 0.984 in the saved
# output), so this is not a very effective model.
# Next step: refit using only 'mean_fsq_POI_price' and 'count_yelp_POIs', as they have
# the lowest p-values.

model = lin_reg.fit()
print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:            total_bikes   R-squared:                       0.020
Model:                            OLS   Adj. R-squared:                 -0.068
Method:                 Least Squares   F-statistic:                    0.2311
Date:                Sun, 09 Jul 2023   Prob (F-statistic):              0.984
Time:                        04:39:51   Log-Likelihood:                -285.33
No. Observations:                  98   AIC:                             588.7
Df Residuals:                      89   BIC:                             611.9
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [34]:
# Reduced multivariate linear regression using only the two most promising columns.
# There is a very slight improvement in p-values and adjusted R-squared compared with the
# full model, but not enough to call this an accurate model.

reduced_predictors = ['mean_fsq_POI_price', 'count_yelp_POIs']
y = df_stations['total_bikes']

# prepend the intercept column, then build and fit the model
X2 = sm.add_constant(df_stations[reduced_predictors])
lin_reg2 = sm.OLS(y, X2)
model2 = lin_reg2.fit()

print_model2 = model2.summary()
print(print_model2)

                            OLS Regression Results                            
Dep. Variable:            total_bikes   R-squared:                       0.010
Model:                            OLS   Adj. R-squared:                 -0.011
Method:                 Least Squares   F-statistic:                    0.4584
Date:                Sun, 09 Jul 2023   Prob (F-statistic):              0.634
Time:                        04:44:25   Log-Likelihood:                -285.87
No. Observations:                  98   AIC:                             577.7
Df Residuals:                      95   BIC:                             585.5
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 12.7053      2

Provide model output and an interpretation of the results. 

In [None]:
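# Interpretation: we fail to reject the null hypothesis.
# Prob (F-statistic) is 0.984 for the full model and 0.634 for the two-predictor model, both far
# above the usual 0.05 threshold, and R-squared is only 0.020 and 0.010 respectively.
# In this data the chosen Foursquare and Yelp metrics show no statistically significant linear
# relationship with the number of bikes at a station, so neither model is useful for prediction.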