In [1]:
from typing import List
from typing import Tuple
from typing import Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf

from tqdm import tqdm

sns.set(font_scale=1.5)
sns.set_style("whitegrid", {'grid.linestyle':'--'})

In [2]:
data = pd.read_csv("regression_new_2.csv")
data

Unnamed: 0,zipcode,distance_km,number_of_stations,Income,changes,2019_citibike,2020_citibike,station_all
0,10001,3.734991,17,101409,22,272,146,14
1,10002,6.552647,4,37093,2,27,18,26
2,10003,5.668975,13,137533,27,71,121,22
3,10004,8.138366,3,216017,3,6,2,7
4,10005,9.064966,3,197188,1,2,4,4
5,10007,8.486694,7,250001,1,10,11,8
6,10009,5.61492,8,77551,7,43,23,17
7,10010,4.513504,11,131635,3,78,74,8
8,10011,4.760938,18,139343,0,140,133,21
9,10012,6.474959,8,117581,1,33,37,9


In [3]:
data.rename(columns={'2019_citibike': 'citibike_2019', '2020_citibike': 'citibike_2020'}, inplace=True)
data.to_csv('regression_modified_file.csv', index=False)
data

Unnamed: 0,zipcode,distance_km,number_of_stations,Income,changes,citibike_2019,citibike_2020,station_all
0,10001,3.734991,17,101409,22,272,146,14
1,10002,6.552647,4,37093,2,27,18,26
2,10003,5.668975,13,137533,27,71,121,22
3,10004,8.138366,3,216017,3,6,2,7
4,10005,9.064966,3,197188,1,2,4,4
5,10007,8.486694,7,250001,1,10,11,8
6,10009,5.61492,8,77551,7,43,23,17
7,10010,4.513504,11,131635,3,78,74,8
8,10011,4.760938,18,139343,0,140,133,21
9,10012,6.474959,8,117581,1,33,37,9


In [4]:
# Retrieve data as a numpy array
Y = data[["citibike_2019"]].values
print(Y.shape)
Y[:5]

(35, 1)


array([[70],
       [ 3],
       [16],
       [ 3],
       [ 2]])

In [5]:
X = data[["distance_km", "number_of_stations","Income"]].values
X = np.append(np.ones((X.shape[0], 1)), X, axis=1)
print(X.shape)
X[:5]

(35, 4)


array([[1.000000e+00, 3.734991e+00, 1.700000e+01, 1.014090e+05],
       [1.000000e+00, 6.552647e+00, 4.000000e+00, 3.709300e+04],
       [1.000000e+00, 5.668975e+00, 1.300000e+01, 1.375330e+05],
       [1.000000e+00, 8.138366e+00, 3.000000e+00, 2.160170e+05],
       [1.000000e+00, 9.064966e+00, 3.000000e+00, 1.971880e+05]])

In [6]:
# Calculates coefficients
betas = np.linalg.inv(X.T @ X) @ X.T @ Y
betas

array([[-1.13246595e+01],
       [-6.94073462e+00],
       [ 7.16902476e+00],
       [ 1.41306619e-04]])

In [7]:
# Calculates the standard error
Y_hat = X @ betas
residual = Y - Y_hat
var = np.var(residual, ddof=X.shape[1])

se = np.sqrt(var * np.linalg.inv(X.T @ X))
se

  se = np.sqrt(var * np.linalg.inv(X.T @ X))


array([[5.27825545e+01,            nan,            nan,            nan],
       [           nan, 6.08282443e+00, 3.11511289e+00,            nan],
       [           nan, 3.11511289e+00, 2.37841423e+00, 2.62623577e-03],
       [           nan,            nan, 2.62623577e-03, 2.18484632e-04]])

In [8]:
# Calculates R2
r2 = np.power(Y_hat - np.mean(Y), 2).sum() / np.power(Y - np.mean(Y), 2).sum()
r2

0.4912450468628188

In [9]:
# Multicollinearity 2019
model_3 = smf.ols(formula=' citibike_2019 ~ number_of_stations + distance_km + Income ', data=data)
result_3 = model_3.fit()
print(result_3.summary())

                            OLS Regression Results                            
Dep. Variable:          citibike_2019   R-squared:                       0.491
Model:                            OLS   Adj. R-squared:                  0.442
Method:                 Least Squares   F-statistic:                     9.978
Date:                Mon, 22 Apr 2024   Prob (F-statistic):           9.28e-05
Time:                        12:47:57   Log-Likelihood:                -190.96
No. Observations:                  35   AIC:                             389.9
Df Residuals:                      31   BIC:                             396.1
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept            -11.3247     52

In [10]:
# 2020
model_4 = smf.ols(formula=' citibike_2020 ~ number_of_stations + distance_km + Income ', data=data)
result_4 = model_4.fit()
print(result_4.summary())

                            OLS Regression Results                            
Dep. Variable:          citibike_2020   R-squared:                       0.535
Model:                            OLS   Adj. R-squared:                  0.490
Method:                 Least Squares   F-statistic:                     11.87
Date:                Mon, 22 Apr 2024   Prob (F-statistic):           2.42e-05
Time:                        12:47:57   Log-Likelihood:                -173.56
No. Observations:                  35   AIC:                             355.1
Df Residuals:                      31   BIC:                             361.3
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept             25.8383     32

In [11]:
data['ln_distance_km'] = np.log(data['distance_km'])

In [12]:
## 2019
model_6 = smf.ols(formula=' citibike_2019 ~number_of_stations + ln_distance_km ', data=data)
result_6 = model_6.fit()
print(result_6.summary())

                            OLS Regression Results                            
Dep. Variable:          citibike_2019   R-squared:                       0.519
Model:                            OLS   Adj. R-squared:                  0.489
Method:                 Least Squares   F-statistic:                     17.26
Date:                Mon, 22 Apr 2024   Prob (F-statistic):           8.23e-06
Time:                        12:47:57   Log-Likelihood:                -189.98
No. Observations:                  35   AIC:                             386.0
Df Residuals:                      32   BIC:                             390.6
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept             38.9946     48

In [13]:
## 2020
model_5 = smf.ols(formula=' citibike_2020 ~number_of_stations + ln_distance_km ', data=data)
result_5 = model_5.fit()
print(result_5.summary())

                            OLS Regression Results                            
Dep. Variable:          citibike_2020   R-squared:                       0.580
Model:                            OLS   Adj. R-squared:                  0.553
Method:                 Least Squares   F-statistic:                     22.06
Date:                Mon, 22 Apr 2024   Prob (F-statistic):           9.53e-07
Time:                        12:47:57   Log-Likelihood:                -171.78
No. Observations:                  35   AIC:                             349.6
Df Residuals:                      32   BIC:                             354.2
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept             44.7288     29

In [16]:
#2019 with new station
model_7 = smf.ols(formula=' citibike_2019 ~station_all + distance_km ', data=data)
result_7 = model_7.fit()
print(result_7.summary())

                            OLS Regression Results                            
Dep. Variable:          citibike_2019   R-squared:                       0.365
Model:                            OLS   Adj. R-squared:                  0.326
Method:                 Least Squares   F-statistic:                     9.215
Date:                Mon, 22 Apr 2024   Prob (F-statistic):           0.000691
Time:                        12:50:58   Log-Likelihood:                -194.83
No. Observations:                  35   AIC:                             395.7
Df Residuals:                      32   BIC:                             400.3
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept     107.0631     32.370      3.307      

In [19]:
#2020 with new station
model_8 = smf.ols(formula=' citibike_2020 ~station_all + distance_km ', data=data)
result_8 = model_8.fit()
print(result_8.summary())

                            OLS Regression Results                            
Dep. Variable:          citibike_2020   R-squared:                       0.441
Model:                            OLS   Adj. R-squared:                  0.406
Method:                 Least Squares   F-statistic:                     12.61
Date:                Mon, 22 Apr 2024   Prob (F-statistic):           9.18e-05
Time:                        12:52:04   Log-Likelihood:                -176.78
No. Observations:                  35   AIC:                             359.6
Df Residuals:                      32   BIC:                             364.2
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      76.8826     19.326      3.978      

In [6]:
##2024/04/26
model_9 = smf.ols(formula=' citibike_2019 ~station_all + distance_km + Income ', data=data)
result_9 = model_9.fit()
print(result_9.summary())

                            OLS Regression Results                            
Dep. Variable:          citibike_2019   R-squared:                       0.398
Model:                            OLS   Adj. R-squared:                  0.340
Method:                 Least Squares   F-statistic:                     6.831
Date:                Fri, 26 Apr 2024   Prob (F-statistic):            0.00115
Time:                        22:00:10   Log-Likelihood:                -234.18
No. Observations:                  35   AIC:                             476.4
Df Residuals:                      31   BIC:                             482.6
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept     182.0936    153.686      1.185      

In [7]:
##2024/04/26
model_10 = smf.ols(formula=' citibike_2020 ~station_all + distance_km + Income ', data=data)
result_10 = model_10.fit()
print(result_10.summary())

                            OLS Regression Results                            
Dep. Variable:          citibike_2020   R-squared:                       0.617
Model:                            OLS   Adj. R-squared:                  0.580
Method:                 Least Squares   F-statistic:                     16.64
Date:                Fri, 26 Apr 2024   Prob (F-statistic):           1.27e-06
Time:                        22:00:17   Log-Likelihood:                -202.79
No. Observations:                  35   AIC:                             413.6
Df Residuals:                      31   BIC:                             419.8
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept     155.1654     62.670      2.476      