Build a regression model.

In [14]:
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
import sqlite3
import statsmodels.api as sm

In [2]:
conn = sqlite3.connect('sm_app.sqlite')

In [18]:
r_df = pd.read_sql("select * from restaurant",conn)
r_df

Unnamed: 0,name,popularity,price,rating,categories_name,latitude,longitude,distance,credit_card,beer,outdoor_seating,bikes
0,Rorschach Brewing,0.982673,2.0,8.5,Brewery,43.663542,-79.319882,194,1.0,1.0,1.0,19
1,The Sidekick,0.928866,,8.8,Coffee Shop,43.664580,-79.324956,440,,,,19
2,O Sushi,0.933508,2.0,7.9,Pizzeria,43.666623,-79.316861,300,,,,19
3,Hasting Snack Bar,0.924879,1.0,9.0,Lounge,43.663791,-79.328899,759,1.0,,,19
4,Chino Locos,0.924573,1.0,8.2,Chinese Restaurant,43.664540,-79.325510,470,1.0,,0.0,19
...,...,...,...,...,...,...,...,...,...,...,...,...
1506,Sushi Masaki Saito,0.908064,2.0,,Sushi Restaurant,43.672351,-79.396057,977,,,,2
1507,Sheena's Place,,,,Restaurant,43.670236,-79.405073,636,,,,2
1508,Pink Pearl Restaurant,,,,Diner,43.673676,-79.396362,885,,,,2
1509,Cicchetti Ristorante Inc,,,,Restaurant,43.672312,-79.395817,976,,,,2


In [22]:
r_df.isna().sum()

name               0
popularity         0
price              0
rating             0
categories_name    0
latitude           0
longitude          0
distance           0
credit_card        0
beer               0
outdoor_seating    0
bikes              0
dtype: int64

In [25]:
r_df = r_df.dropna()

In [45]:
new_r_df = r_df[['popularity', 'price', 'rating', 'latitude','longitude','distance','credit_card','beer','outdoor_seating','bikes']]

In [67]:
#run full model
y = new_r_df['bikes']
X = [sm.add_constant(new_r_df[column]) for column in new_r_df.columns[:-2]] 
X[1]

Unnamed: 0,const,price
0,1.0,2.0
7,1.0,2.0
50,1.0,2.0
57,1.0,3.0
59,1.0,4.0
...,...,...
1428,1.0,4.0
1434,1.0,1.0
1442,1.0,1.0
1459,1.0,2.0


In [54]:
Models = [sm.OLS(y,x) for x in X] #list of models
Results = [model.fit() for model in Models] #list of results
Adj_Rsquared = [results.rsquared_adj for results in Results] #list of rsquared
Pval = [results.pvalues for results in Results] #list of p-values

In [56]:
for i in range(len(Adj_Rsquared)):
     print(f'adj_R2: {Adj_Rsquared[i]:.3f}, P-values: {*Pval[i],}, column: {new_r_df.columns[i+1]}')

adj_R2: 0.051, P-values: (0.006587233230975364, 0.0011065456692957649), column: price
adj_R2: -0.004, P-values: (6.0390536315646e-07, 0.6494090771930097), column: rating
adj_R2: 0.000, P-values: (0.40444673207253445, 0.30175969977356415), column: latitude
adj_R2: 0.288, P-values: (1.1853097714077483e-15, 1.2345270480455424e-15), column: longitude
adj_R2: 0.024, P-values: (0.019540507595710927, 0.019903716103803287), column: distance
adj_R2: -0.004, P-values: (6.364642634934967e-08, 0.578040973029797), column: credit_card
adj_R2: 0.002, P-values: (0.0008030929184452153, 0.23990405081019392), column: beer
adj_R2: -0.005, P-values: (0.013757128331905273, 0.6934736654155105), column: outdoor_seating


In [59]:
remaining_var = new_r_df.drop(['bikes', 'price'], axis=1)
remaining_var.head()

Unnamed: 0,popularity,rating,latitude,longitude,distance,credit_card,beer,outdoor_seating
0,0.982673,8.5,43.663542,-79.319882,194,1.0,1.0,1.0
7,0.957766,8.3,43.672615,-79.319396,804,1.0,1.0,1.0
50,0.955445,9.2,43.669324,-79.439163,581,1.0,1.0,0.0
57,0.955383,8.3,43.666321,-79.449531,654,1.0,1.0,0.0
59,0.977401,8.1,43.677954,-79.444009,733,1.0,1.0,0.0


In [61]:
included_df = new_r_df[['price']]
included_df

Unnamed: 0,price
0,2.0
7,2.0
50,2.0
57,3.0
59,4.0
...,...
1428,4.0
1434,1.0
1442,1.0
1459,2.0


In [63]:
X = [sm.add_constant(pd.merge(included_df,remaining_var[column], right_index = True, left_index = True)) for column in remaining_var.columns] 
X[2]

Unnamed: 0,const,price,latitude
0,1.0,2.0,43.663542
7,1.0,2.0,43.672615
50,1.0,2.0,43.669324
57,1.0,3.0,43.666321
59,1.0,4.0,43.677954
...,...,...,...
1428,1.0,4.0,43.650226
1434,1.0,1.0,43.650690
1442,1.0,1.0,43.647591
1459,1.0,2.0,43.641764


In [64]:
Models = [sm.OLS(y,x) for x in X] #list of models
Results = [model.fit() for model in Models] #list of results
Adj_Rsquared = [results.rsquared_adj for results in Results] #list of rsquared
Pval = [results.pvalues for results in Results] #list of list of p-values

for i in range(len(Adj_Rsquared)):
     print(f'adj_R2: {Adj_Rsquared[i]:.3f}, P-values: {*Pval[i],}, column: {remaining_var.columns[i]}')

adj_R2: 0.046, P-values: (0.006802831731712723, 0.9811174223662988, 0.0012753380634433504), column: popularity
adj_R2: -0.004, P-values: (0.4508889304989063, 0.7403801275788523, 0.32595206563795187), column: rating
adj_R2: 0.286, P-values: (1.3042424041973479e-15, 0.5433861946232563, 1.3535103827097235e-15), column: latitude
adj_R2: 0.020, P-values: (0.01810910708370656, 0.5429706418030718, 0.01842132316908993), column: longitude
adj_R2: -0.008, P-values: (0.0004881619807475304, 0.6446343896455398, 0.5746617246100736), column: distance
adj_R2: -0.003, P-values: (0.010329032445256799, 0.7145868772854558, 0.25386308350802167), column: credit_card
adj_R2: -0.009, P-values: (0.03644405942928034, 0.6723205826498517, 0.7206394459573208), column: beer
adj_R2: 0.009, P-values: (1.1185806361095099e-07, 0.677432201061535, 0.06061739802494905), column: outdoor_seating


Provide model output and an interpretation of the results. 

With an independent variable price, we had an adjusted R^2 of 0.051. After choose price and a second independent variable to predict our bike model, our next best adjusted R^2 is 0.046 with the independent variable popularity. Therefore, the best model for bikes is just price.


# Stretch

How can you turn the regression model into a classification model?