#### Build a regression model.

In [1]:
# imports
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Save joined CityBikes and Yelp info to CSV file
citybikes_yelp = pd.read_csv("data/citybikes_yelp.csv" , encoding = "unicode_escape")
df=citybikes_yelp.iloc[:,4:]
print(df.shape)
df.head()

(653, 10)


Unnamed: 0,free_bikes,empty_slots,entertainment,education,dining,park,shopping,metro_station,library,financial
0,17,1,7,6,48,8,36,0,1,2
1,14,1,2,8,46,4,55,0,2,5
2,3,20,7,5,42,8,37,0,1,1
3,11,14,72,88,434,29,392,9,11,128
4,7,8,34,58,369,22,331,3,12,62


### Forward Selection

In [2]:
# Model Building - Forward selection 

y = df['free_bikes']
indep = df.drop('free_bikes', axis=1)

X = [sm.add_constant(indep[column]) for column in indep.columns] 


In [3]:
Models = [sm.OLS(y,x) for x in X] #list of models
Results = [model.fit() for model in Models] #list of results
Adj_Rsquared = [results.rsquared_adj for results in Results] #list of rsquared
Pval = [results.pvalues for results in Results] #list of p-values
Params = [results.params for results in Results] #list of params

In [4]:
for i in range(len(Adj_Rsquared)):
     print(f'adj_R2: {Adj_Rsquared[i]:.3f}, P-values: {*Pval[i],}, column: {indep.columns[i]}')

adj_R2: 0.189, P-values: (7.250399732222864e-135, 1.2015051049185205e-31), column: empty_slots
adj_R2: 0.010, P-values: (8.594315508941139e-102, 0.006678565701212038), column: entertainment
adj_R2: -0.001, P-values: (5.4861782990624126e-114, 0.48801896126518796), column: education
adj_R2: 0.017, P-values: (7.260108754175347e-97, 0.0004807238279010314), column: dining
adj_R2: 0.025, P-values: (4.029670200870555e-84, 3.4022806756829385e-05), column: park
adj_R2: 0.005, P-values: (7.839039113386955e-93, 0.0341074329761797), column: shopping
adj_R2: 0.033, P-values: (6.901541680640921e-129, 1.892736312750246e-06), column: metro_station
adj_R2: -0.001, P-values: (2.7586060070045442e-121, 0.6615359612327425), column: library
adj_R2: 0.025, P-values: (2.6579628823317417e-111, 3.1023978532430104e-05), column: financial


In first step, empty_slots was selected as the first independent variable for modeling because of the highest adjacent R2.

In [5]:
remaining_var = df.drop(['free_bikes', 'empty_slots'], axis=1)
remaining_var.head()

included_df = df[['empty_slots']]
included_df

X = [sm.add_constant(pd.merge(included_df,remaining_var[column], right_index = True, left_index = True)) for column in remaining_var.columns] 

Models = [sm.OLS(y,x) for x in X] #list of models
Results = [model.fit() for model in Models] #list of results
Adj_Rsquared = [results.rsquared_adj for results in Results] #list of rsquared
Pval = [results.pvalues for results in Results] #list of list of p-values

for i in range(len(Adj_Rsquared)):
     print(f'adj_R2: {Adj_Rsquared[i]:.3f}, P-values: {*Pval[i],}, column: {remaining_var.columns[i]}')

adj_R2: 0.193, P-values: (6.046850979800018e-109, 6.074242925571167e-31, 0.037242514909466964), column: entertainment
adj_R2: 0.189, P-values: (1.1975266571379305e-119, 1.0159747949463244e-31, 0.3116165842820575), column: education
adj_R2: 0.196, P-values: (5.121388087864797e-105, 1.678875426152583e-30, 0.007516785925608776), column: dining
adj_R2: 0.203, P-values: (9.652850406191193e-99, 1.3094644133296531e-30, 0.00038154110925426953), column: park
adj_R2: 0.190, P-values: (2.9804360179540024e-105, 4.84018735563952e-31, 0.16514800092236473), column: shopping
adj_R2: 0.211, P-values: (5.728566792848702e-119, 7.689757026052558e-31, 1.1826937031285641e-05), column: metro_station
adj_R2: 0.188, P-values: (1.812390221713694e-122, 1.3124181434024066e-31, 0.6340169849371591), column: library
adj_R2: 0.204, P-values: (1.2782544683087102e-111, 1.1730625142812287e-30, 0.00030918558351811936), column: financial


In second step, metro_station was selected as the second independent variable for modeling because of the highest adjacent R2.

In [6]:
remaining_var = df.drop(['free_bikes', 'empty_slots','metro_station'], axis=1)
remaining_var.head()

included_df = df[['empty_slots']]
included_df

X = [sm.add_constant(pd.merge(included_df,remaining_var[column], right_index = True, left_index = True)) for column in remaining_var.columns] 

Models = [sm.OLS(y,x) for x in X] #list of models
Results = [model.fit() for model in Models] #list of results
Adj_Rsquared = [results.rsquared_adj for results in Results] #list of rsquared
Pval = [results.pvalues for results in Results] #list of list of p-values

for i in range(len(Adj_Rsquared)):
     print(f'adj_R2: {Adj_Rsquared[i]:.3f}, P-values: {*Pval[i],}, column: {remaining_var.columns[i]}')

adj_R2: 0.193, P-values: (6.046850979800018e-109, 6.074242925571167e-31, 0.037242514909466964), column: entertainment
adj_R2: 0.189, P-values: (1.1975266571379305e-119, 1.0159747949463244e-31, 0.3116165842820575), column: education
adj_R2: 0.196, P-values: (5.121388087864797e-105, 1.678875426152583e-30, 0.007516785925608776), column: dining
adj_R2: 0.203, P-values: (9.652850406191193e-99, 1.3094644133296531e-30, 0.00038154110925426953), column: park
adj_R2: 0.190, P-values: (2.9804360179540024e-105, 4.84018735563952e-31, 0.16514800092236473), column: shopping
adj_R2: 0.188, P-values: (1.812390221713694e-122, 1.3124181434024066e-31, 0.6340169849371591), column: library
adj_R2: 0.204, P-values: (1.2782544683087102e-111, 1.1730625142812287e-30, 0.00030918558351811936), column: financial


Because adjacent R2 increasing have stoped, modeling is stoped here.

### Backward Selection

In [7]:
# Model Building - Backward selection 
#run full model
y = df['free_bikes']
X = df.drop('free_bikes', axis=1)
X = sm.add_constant(X) #adds a column of 1's so the model will contain an intercept

model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:             free_bikes   R-squared:                       0.256
Model:                            OLS   Adj. R-squared:                  0.246
Method:                 Least Squares   F-statistic:                     24.59
Date:                Mon, 27 Feb 2023   Prob (F-statistic):           2.30e-36
Time:                        09:58:42   Log-Likelihood:                -1985.7
No. Observations:                 653   AIC:                             3991.
Df Residuals:                     643   BIC:                             4036.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            12.8124      0.498     25.731

In first step, independent variables with p-value higher than 0.05 were omitted.

Provide model output and an interpretation of the results. 

In [9]:
y = df['free_bikes']
X = df.drop(['free_bikes','entertainment','dining','park','shopping','metro_station','library','financial'], axis=1)
X = sm.add_constant(X) #adds a column of 1's so the model will contain an intercept

model = sm.OLS(y, X)
results = model.fit() #fit the model (this is where OLS is actually being run)
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:             free_bikes   R-squared:                       0.191
Model:                            OLS   Adj. R-squared:                  0.189
Method:                 Least Squares   F-statistic:                     76.83
Date:                Mon, 27 Feb 2023   Prob (F-statistic):           1.12e-30
Time:                        10:01:16   Log-Likelihood:                -2013.0
No. Observations:                 653   AIC:                             4032.
Df Residuals:                     650   BIC:                             4045.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const          13.1692      0.453     29.065      

As adjacent R2 increasing stopped, we stop modeling. As we observed all p-values are less than 0.05. It means empty_slots, education, and financial are effective on the target.

# Stretch

#### How can you turn the regression model into a classification model?

For classification modeling, the target should be a categorical variable. so we can change free_bikes to three groups: 

1- Low:    If     free_bikes   <= 0.35*capacity   
2- Medium: If     0.35*capacity < free_bikes   <= 0.70  capacity     
3- High:   If     0.70*capacity < free_bikes  

When capacity = free_bikes + empty_slots