In [10]:
import sqlite3 as sq3
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

Build a regression model.

In [11]:
conn = sq3.connect('statmoddel_python_poi.db')
model_df = pd.read_sql_query("SELECT * from poi_df", conn)
model_df

Unnamed: 0,city,name,latitude,longitude,slots,empty_slots,no_of_bikes,bike_station_coordinates,name_x,category1,...,rating_x,yelp_bike_station_coordinates,name_y,rating_y,price_y,distance_y,category_1,category_2,category_3,review_count
0,Vancouver,10th & Cambie,49.262,-123.114,36.0,34.0,2.0,,,,...,,,,,,,,,,
1,Vancouver,Yaletown-Roundhouse Station,49.275,-123.122,16.0,8.0,7.0,"49.274566,-123.121817",The Flying Pig Yaletown,Restaurant,...,8.7,,,,,,,,,
2,Vancouver,Yaletown-Roundhouse Station,49.275,-123.122,16.0,8.0,7.0,"49.274566,-123.121817",MeeT in Yaletown,Vegan and Vegetarian Restaurant,...,8.3,,,,,,,,,
3,Vancouver,Yaletown-Roundhouse Station,49.275,-123.122,16.0,8.0,7.0,"49.273471,-123.118016",The Flying Pig Yaletown,Restaurant,...,8.7,,,,,,,,,
4,Vancouver,Yaletown-Roundhouse Station,49.275,-123.122,16.0,8.0,7.0,"49.273663,-123.127075",The Flying Pig Yaletown,Restaurant,...,8.7,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35255,,,49.263,-123.120,,,,,,,...,,"49.2856698,-123.1125432",Lotus Seed Vegan,4.0,$$,341.774806,Vegan,,,129.0
35256,,,49.263,-123.120,,,,,,,...,,"49.25094,-123.101306",Lotus Seed Vegan,4.0,$$,341.774806,Vegan,,,129.0
35257,,,49.263,-123.120,,,,,,,...,,"49.280977,-123.035969",Lotus Seed Vegan,4.0,$$,341.774806,Vegan,,,129.0
35258,,,49.263,-123.120,,,,,,,...,,"49.270783,-123.141564",Lotus Seed Vegan,4.0,$$,341.774806,Vegan,,,129.0


In [12]:
model_df = model_df.replace(np.nan,0)
model_df

Unnamed: 0,city,name,latitude,longitude,slots,empty_slots,no_of_bikes,bike_station_coordinates,name_x,category1,...,rating_x,yelp_bike_station_coordinates,name_y,rating_y,price_y,distance_y,category_1,category_2,category_3,review_count
0,Vancouver,10th & Cambie,49.262,-123.114,36.0,34.0,2.0,0,0,0,...,0.0,0,0,0.0,0,0.000000,0,0,0,0.0
1,Vancouver,Yaletown-Roundhouse Station,49.275,-123.122,16.0,8.0,7.0,"49.274566,-123.121817",The Flying Pig Yaletown,Restaurant,...,8.7,0,0,0.0,0,0.000000,0,0,0,0.0
2,Vancouver,Yaletown-Roundhouse Station,49.275,-123.122,16.0,8.0,7.0,"49.274566,-123.121817",MeeT in Yaletown,Vegan and Vegetarian Restaurant,...,8.3,0,0,0.0,0,0.000000,0,0,0,0.0
3,Vancouver,Yaletown-Roundhouse Station,49.275,-123.122,16.0,8.0,7.0,"49.273471,-123.118016",The Flying Pig Yaletown,Restaurant,...,8.7,0,0,0.0,0,0.000000,0,0,0,0.0
4,Vancouver,Yaletown-Roundhouse Station,49.275,-123.122,16.0,8.0,7.0,"49.273663,-123.127075",The Flying Pig Yaletown,Restaurant,...,8.7,0,0,0.0,0,0.000000,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35255,0,0,49.263,-123.120,0.0,0.0,0.0,0,0,0,...,0.0,"49.2856698,-123.1125432",Lotus Seed Vegan,4.0,$$,341.774806,Vegan,0,0,129.0
35256,0,0,49.263,-123.120,0.0,0.0,0.0,0,0,0,...,0.0,"49.25094,-123.101306",Lotus Seed Vegan,4.0,$$,341.774806,Vegan,0,0,129.0
35257,0,0,49.263,-123.120,0.0,0.0,0.0,0,0,0,...,0.0,"49.280977,-123.035969",Lotus Seed Vegan,4.0,$$,341.774806,Vegan,0,0,129.0
35258,0,0,49.263,-123.120,0.0,0.0,0.0,0,0,0,...,0.0,"49.270783,-123.141564",Lotus Seed Vegan,4.0,$$,341.774806,Vegan,0,0,129.0


In [13]:
#set the dependant variables by loading the dataframe
poi_data = model_df

#Prepare feature matrix and target variable
X = poi_data[['slots', 'no_of_bikes']]
y = poi_data['empty_slots']

# Add a constant term to the feature matrix
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X)
results = model.fit()

# Print the model summary
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:            empty_slots   R-squared:                       0.996
Model:                            OLS   Adj. R-squared:                  0.996
Method:                 Least Squares   F-statistic:                 3.967e+06
Date:                Sun, 03 Sep 2023   Prob (F-statistic):               0.00
Time:                        09:32:20   Log-Likelihood:                 21521.
No. Observations:               35260   AIC:                        -4.304e+04
Df Residuals:                   35257   BIC:                        -4.301e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const           0.0003      0.001      0.473      

In [14]:
final_model_df = model_df.drop_duplicates(subset=['latitude', 'longitude'], keep='last')
final_model_df

Unnamed: 0,city,name,latitude,longitude,slots,empty_slots,no_of_bikes,bike_station_coordinates,name_x,category1,...,rating_x,yelp_bike_station_coordinates,name_y,rating_y,price_y,distance_y,category_1,category_2,category_3,review_count
0,Vancouver,10th & Cambie,49.262,-123.114,36.0,34.0,2.0,0,0,0,...,0.0,0,0,0.0,0,0.000000,0,0,0,0.0
11,Vancouver,Yaletown-Roundhouse Station,49.275,-123.122,16.0,8.0,7.0,"49.274232,-123.129998",The Flying Pig Yaletown,Restaurant,...,8.7,0,0,0.0,0,0.000000,0,0,0,0.0
32,Vancouver,Dunsmuir & Beatty,49.280,-123.110,26.0,11.0,14.0,"49.280787,-123.115271",Jam Cafe,Coffee Shop,...,8.8,0,0,0.0,0,0.000000,0,0,0,0.0
33,Vancouver,12th & Yukon (City Hall),49.261,-123.114,16.0,11.0,5.0,0,0,0,...,0.0,0,0,0.0,0,0.000000,0,0,0,0.0
34,Vancouver,8th & Ash,49.264,-123.118,16.0,9.0,6.0,0,0,0,...,0.0,0,0,0.0,0,0.000000,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34039,0,0,49.264,-123.114,0.0,0.0,0.0,0,0,0,...,0.0,"49.264019,-123.209176",Westcoast Poke,4.0,$$,194.291372,Hawaiian,Poke,Poke,172.0
34283,0,0,49.263,-123.112,0.0,0.0,0.0,0,0,0,...,0.0,"49.264019,-123.209176",Heritage Asian Eatery,4.0,$$,280.555404,Pan Asian,Chinese,Chinese,87.0
34771,0,0,49.263,-123.113,0.0,0.0,0.0,0,0,0,...,0.0,"49.264019,-123.209176",Sushi California,3.5,$$,263.365095,Japanese,Sushi Bars,Sushi Bars,317.0
35015,0,0,49.263,-123.122,0.0,0.0,0.0,0,0,0,...,0.0,"49.264019,-123.209176",Banana Leaf,4.0,$$,454.459866,Malaysian,Singaporean,Singaporean,300.0


In [15]:
#Running the model after dropping duplicates
poi_data = final_model_df

#Prepare feature matrix and target variable
X = poi_data[['slots', 'no_of_bikes']]
y = poi_data['empty_slots']

# Add a constant term to the feature matrix
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X)
results = model.fit()

# Print the model summary
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:            empty_slots   R-squared:                       0.982
Model:                            OLS   Adj. R-squared:                  0.982
Method:                 Least Squares   F-statistic:                 1.570e+04
Date:                Sun, 03 Sep 2023   Prob (F-statistic):               0.00
Time:                        09:32:44   Log-Likelihood:                -677.79
No. Observations:                 569   AIC:                             1362.
Df Residuals:                     566   BIC:                             1375.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const          -0.0156      0.043     -0.363      

In [16]:
# Running the OLS model for the X variable (Dependant) 
# as Distance and Independant as Rating

X = final_model_df['distance_y']
y = final_model_df['rating_y']

X = sm.add_constant(X) # adding a constant
lin_reg = sm.OLS(y,X)

model = lin_reg.fit()
print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:               rating_y   R-squared:                       0.783
Model:                            OLS   Adj. R-squared:                  0.782
Method:                 Least Squares   F-statistic:                     2040.
Date:                Sun, 03 Sep 2023   Prob (F-statistic):          5.47e-190
Time:                        09:33:14   Log-Likelihood:                -82.027
No. Observations:                 569   AIC:                             168.1
Df Residuals:                     567   BIC:                             176.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0220      0.012      1.859      0.0

Provide model output and an interpretation of the results. 

# Stretch

How can you turn the regression model into a classification model?