#### Build a regression model.

In [2]:
# imports
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Save joined CityBikes and Yelp info to CSV file
citybikes_yelp = pd.read_csv("data/citybikes_yelp.csv" , encoding = "unicode_escape")
df=citybikes_yelp.iloc[:,4:]
print(df.shape)
df.head()

(307, 10)


Unnamed: 0,free_bikes,empty_slots,entertainment,education,dining,park,shopping,metro_station,library,financial
0,1,10,17,16,10,3,78,3,3,3
1,22,0,1,6,6,0,53,0,0,1
2,2,19,2,3,5,1,26,0,0,0
3,1,16,2,5,5,0,44,0,0,0
4,3,12,3,6,7,1,45,0,1,0


### Forward Selection

In [3]:
# Model Building - Forward selection 

y = df['free_bikes']
indep = df.drop('free_bikes', axis=1)

X = [sm.add_constant(indep[column]) for column in indep.columns] 


In [4]:
Models = [sm.OLS(y,x) for x in X] #list of models
Results = [model.fit() for model in Models] #list of results
Adj_Rsquared = [results.rsquared_adj for results in Results] #list of rsquared
Pval = [results.pvalues for results in Results] #list of p-values
Params = [results.params for results in Results] #list of params

In [5]:
for i in range(len(Adj_Rsquared)):
     print(f'adj_R2: {Adj_Rsquared[i]:.3f}, P-values: {*Pval[i],}, column: {indep.columns[i]}')

adj_R2: 0.464, P-values: (1.5329980444236643e-86, 1.8670259356396407e-43), column: empty_slots
adj_R2: 0.036, P-values: (9.047896661893485e-50, 0.00045696427065135823), column: entertainment
adj_R2: 0.023, P-values: (5.187187787350413e-35, 0.0047232191591658765), column: education
adj_R2: 0.005, P-values: (8.538145556934891e-34, 0.1179474097457588), column: dining
adj_R2: 0.003, P-values: (2.333375448690082e-41, 0.15246883602198455), column: park
adj_R2: 0.047, P-values: (1.882465923968005e-39, 7.673029600106789e-05), column: shopping
adj_R2: 0.036, P-values: (5.100045517073713e-57, 0.0004975128329753588), column: metro_station
adj_R2: 0.049, P-values: (5.449245334222141e-58, 5.001457013654126e-05), column: library
adj_R2: -0.003, P-values: (1.2691788736390074e-35, 0.770533706645758), column: financial


In first step, empty_slots was selected as the first independent variable for modeling because of the highest adjacent R2.

In [6]:
remaining_var = df.drop(['free_bikes', 'empty_slots'], axis=1)
remaining_var.head()

included_df = df[['empty_slots']]
included_df

X = [sm.add_constant(pd.merge(included_df,remaining_var[column], right_index = True, left_index = True)) for column in remaining_var.columns] 

Models = [sm.OLS(y,x) for x in X] #list of models
Results = [model.fit() for model in Models] #list of results
Adj_Rsquared = [results.rsquared_adj for results in Results] #list of rsquared
Pval = [results.pvalues for results in Results] #list of list of p-values

for i in range(len(Adj_Rsquared)):
     print(f'adj_R2: {Adj_Rsquared[i]:.3f}, P-values: {*Pval[i],}, column: {remaining_var.columns[i]}')

adj_R2: 0.464, P-values: (8.882617273698315e-85, 9.081910662201474e-41, 0.45258664790135184), column: entertainment
adj_R2: 0.465, P-values: (5.041697543091679e-75, 6.628826853513123e-42, 0.22332499644443435), column: education
adj_R2: 0.465, P-values: (1.8891664930331334e-73, 4.8738136712771886e-43, 0.28253973625779916), column: dining
adj_R2: 0.465, P-values: (2.5404595252225188e-80, 3.74813650741558e-43, 0.25760878988477165), column: park
adj_R2: 0.466, P-values: (6.538626486727351e-78, 2.158000194250122e-40, 0.13930457638478908), column: shopping
adj_R2: 0.467, P-values: (7.3450585497112615e-87, 2.7973936422380954e-41, 0.0973746003333334), column: metro_station
adj_R2: 0.468, P-values: (7.044018641479511e-87, 2.2476967829315954e-40, 0.08790472971286437), column: library
adj_R2: 0.466, P-values: (9.237651115981775e-77, 9.807350293869735e-44, 0.15804306009412267), column: financial


In second step, empty_slots and metro_station were selected as the second independent variable for modeling because of the highest adjacent R2.

In [7]:
remaining_var = df.drop(['free_bikes', 'empty_slots','metro_station'], axis=1)
remaining_var.head()

included_df = df[['empty_slots']]
included_df

X = [sm.add_constant(pd.merge(included_df,remaining_var[column], right_index = True, left_index = True)) for column in remaining_var.columns] 

Models = [sm.OLS(y,x) for x in X] #list of models
Results = [model.fit() for model in Models] #list of results
Adj_Rsquared = [results.rsquared_adj for results in Results] #list of rsquared
Pval = [results.pvalues for results in Results] #list of list of p-values

for i in range(len(Adj_Rsquared)):
     print(f'adj_R2: {Adj_Rsquared[i]:.3f}, P-values: {*Pval[i],}, column: {remaining_var.columns[i]}')

adj_R2: 0.464, P-values: (8.882617273698315e-85, 9.081910662201474e-41, 0.45258664790135184), column: entertainment
adj_R2: 0.465, P-values: (5.041697543091679e-75, 6.628826853513123e-42, 0.22332499644443435), column: education
adj_R2: 0.465, P-values: (1.8891664930331334e-73, 4.8738136712771886e-43, 0.28253973625779916), column: dining
adj_R2: 0.465, P-values: (2.5404595252225188e-80, 3.74813650741558e-43, 0.25760878988477165), column: park
adj_R2: 0.466, P-values: (6.538626486727351e-78, 2.158000194250122e-40, 0.13930457638478908), column: shopping
adj_R2: 0.468, P-values: (7.044018641479511e-87, 2.2476967829315954e-40, 0.08790472971286437), column: library
adj_R2: 0.466, P-values: (9.237651115981775e-77, 9.807350293869735e-44, 0.15804306009412267), column: financial


Because adjacent R2 increasing have stoped, modeling is stoped here.

### Backward Selection

In [8]:
# Model Building - Backward selection 
#run full model
y = df['free_bikes']
X = df.drop('free_bikes', axis=1)
X = sm.add_constant(X) #adds a column of 1's so the model will contain an intercept

model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:             free_bikes   R-squared:                       0.497
Model:                            OLS   Adj. R-squared:                  0.482
Method:                 Least Squares   F-statistic:                     32.60
Date:                Mon, 27 Feb 2023   Prob (F-statistic):           1.56e-39
Time:                        09:41:15   Log-Likelihood:                -829.83
No. Observations:                 307   AIC:                             1680.
Df Residuals:                     297   BIC:                             1717.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            11.5125      0.565     20.360

In first step, independent variables with p-value higher than 0.05 were omitted.

Provide model output and an interpretation of the results. 

In [9]:
y = df['free_bikes']
X = df.drop(['free_bikes','entertainment','education','dining','park','metro_station','library'], axis=1)
X = sm.add_constant(X) #adds a column of 1's so the model will contain an intercept

model = sm.OLS(y, X)
results = model.fit() #fit the model (this is where OLS is actually being run)
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:             free_bikes   R-squared:                       0.487
Model:                            OLS   Adj. R-squared:                  0.482
Method:                 Least Squares   F-statistic:                     95.93
Date:                Mon, 27 Feb 2023   Prob (F-statistic):           1.13e-43
Time:                        09:41:16   Log-Likelihood:                -832.80
No. Observations:                 307   AIC:                             1674.
Df Residuals:                     303   BIC:                             1689.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const          11.5321      0.451     25.545      

As adjacent R2 increasing stopped, we stop modeling. As we observed all p-values are less than 0.05. It means empty_slots, shopping, and financial are effective on the target.

# Stretch

#### How can you turn the regression model into a classification model?

For classification modeling, the target should be a categorical variable. so we can change free_bikes to three groups: 

1- Low:    If     free_bikes   <= 0.35*capacity   
2- Medium: If     0.35*capacity < free_bikes   <= 0.70  capacity     
3- High:   If     0.70*capacity < free_bikes  

When capacity = free_bikes + empty_slots