In [74]:
# importing libraries
import pandas as pd
from sklearn import linear_model
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
import statsmodels.api as sm
from scipy import stats

In [75]:
# Loading the train and test data
train_df = pd.read_csv('../data/BikeRentalDaily_train.csv', delimiter=';')
test_df = pd.read_csv('../data/BikeRentalDaily_test.csv', delimiter=';')

In [76]:
# Minimal Preprocessing steps
# removing the 'dteday' feature
train_df = train_df.drop('dteday', axis=1).drop('instant', axis=1).drop('casual', axis=1).drop('registered', axis=1)
test_df = test_df.drop('dteday', axis=1).drop('instant', axis=1).drop('casual', axis=1).drop('registered', axis=1)

In [77]:
#calculate correlations between all features of your data frame (see Exploration lab)
train_corr = train_df.corr()
print(train_corr)

# Displaying correlations as a heatmap / correlation matrix 
train_corr.style.background_gradient(cmap='coolwarm')

                   season        yr      mnth   holiday   weekday  workingday  \
season           1.000000 -0.030884  0.801076 -0.013694 -0.050261   -0.005155   
yr              -0.030884  1.000000 -0.028635  0.020713  0.058332    0.002573   
mnth             0.801076 -0.028635  1.000000  0.035361 -0.033016   -0.023682   
holiday         -0.013694  0.020713  0.035361  1.000000 -0.130308   -0.256362   
weekday         -0.050261  0.058332 -0.033016 -0.130308  1.000000    0.039966   
workingday      -0.005155  0.002573 -0.023682 -0.256362  0.039966    1.000000   
weathersit       0.026903 -0.041193  0.069314 -0.022176 -0.001078    0.073316   
temp             0.349176  0.034556  0.207133 -0.024204  0.013061    0.047687   
atemp            0.353616  0.033477  0.212783 -0.028545  0.004824    0.045879   
hum              0.222561 -0.114149  0.246345 -0.027229 -0.057230    0.034867   
windspeed       -0.165162  0.050415 -0.162857  0.019939 -0.057124    0.024044   
leaflets        -0.045326 -0

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,leaflets,price reduction,cnt
season,1.0,-0.030884,0.801076,-0.013694,-0.050261,-0.005155,0.026903,0.349176,0.353616,0.222561,-0.165162,-0.045326,-0.015886,0.211559
yr,-0.030884,1.0,-0.028635,0.020713,0.058332,0.002573,-0.041193,0.034556,0.033477,-0.114149,0.050415,-0.075728,-0.006535,0.35805
mnth,0.801076,-0.028635,1.0,0.035361,-0.033016,-0.023682,0.069314,0.207133,0.212783,0.246345,-0.162857,-0.072537,-0.03087,0.140144
holiday,-0.013694,0.020713,0.035361,1.0,-0.130308,-0.256362,-0.022176,-0.024204,-0.028545,-0.027229,0.019939,-0.048536,-0.046515,-0.048029
weekday,-0.050261,0.058332,-0.033016,-0.130308,1.0,0.039966,-0.001078,0.013061,0.004824,-0.05723,-0.057124,0.06699,-0.059647,0.129789
workingday,-0.005155,0.002573,-0.023682,-0.256362,0.039966,1.0,0.073316,0.047687,0.045879,0.034867,0.024044,0.001663,0.038023,-0.040338
weathersit,0.026903,-0.041193,0.069314,-0.022176,-0.001078,0.073316,1.0,-0.096125,-0.096554,0.634391,0.077404,-0.027512,0.041774,-0.184529
temp,0.349176,0.034556,0.207133,-0.024204,0.013061,0.047687,-0.096125,1.0,0.990357,0.140084,-0.177135,0.014549,-0.019561,0.380473
atemp,0.353616,0.033477,0.212783,-0.028545,0.004824,0.045879,-0.096554,0.990357,1.0,0.151166,-0.192152,0.011986,-0.015664,0.383553
hum,0.222561,-0.114149,0.246345,-0.027229,-0.05723,0.034867,0.634391,0.140084,0.151166,1.0,-0.111199,-0.048554,-0.030044,-0.078469


In [78]:
#calculate correlations of features to label
train_df_corr_label = train_df.iloc[:, :].corr()["cnt"]
train_df_corr_label

season             0.211559
yr                 0.358050
mnth               0.140144
holiday           -0.048029
weekday            0.129789
workingday        -0.040338
weathersit        -0.184529
temp               0.380473
atemp              0.383553
hum               -0.078469
windspeed         -0.100107
leaflets           0.031955
price reduction    0.008591
cnt                1.000000
Name: cnt, dtype: float64

In [79]:
train_df_1 = train_df[train_df_corr_label.abs().sort_values(ascending=False).index[:7]]
train_df_1

Unnamed: 0,cnt,atemp,temp,yr,season,weathersit,mnth
0,5312,0.587133,24.8000,0,2.0,1,6
1,5445,0.324492,12.8667,1,4.0,2,11
2,2236,0.126275,6.0000,1,1.0,1,1
3,6370,0.614925,26.5667,1,2.0,1,4
4,7836,0.505046,20.5667,1,1.0,2,3
...,...,...,...,...,...,...,...
595,3117,0.466525,18.9000,0,1.0,1,3
596,2703,0.440642,17.6667,0,,1,3
597,4097,0.381938,15.3000,1,1.0,2,1
598,3894,0.472846,19.3667,0,4.0,2,10


In [80]:
#calculate correlations between all features of your data frame (see Exploration lab)
test_corr = test_df.corr()
print(train_corr)

# Displaying correlations as a heatmap / correlation matrix 
test_corr.style.background_gradient(cmap='coolwarm')

                   season        yr      mnth   holiday   weekday  workingday  \
season           1.000000 -0.030884  0.801076 -0.013694 -0.050261   -0.005155   
yr              -0.030884  1.000000 -0.028635  0.020713  0.058332    0.002573   
mnth             0.801076 -0.028635  1.000000  0.035361 -0.033016   -0.023682   
holiday         -0.013694  0.020713  0.035361  1.000000 -0.130308   -0.256362   
weekday         -0.050261  0.058332 -0.033016 -0.130308  1.000000    0.039966   
workingday      -0.005155  0.002573 -0.023682 -0.256362  0.039966    1.000000   
weathersit       0.026903 -0.041193  0.069314 -0.022176 -0.001078    0.073316   
temp             0.349176  0.034556  0.207133 -0.024204  0.013061    0.047687   
atemp            0.353616  0.033477  0.212783 -0.028545  0.004824    0.045879   
hum              0.222561 -0.114149  0.246345 -0.027229 -0.057230    0.034867   
windspeed       -0.165162  0.050415 -0.162857  0.019939 -0.057124    0.024044   
leaflets        -0.045326 -0

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,leaflets,price reduction,cnt
season,1.0,0.091982,0.889156,-0.045548,0.030311,0.133775,-0.024979,0.379806,0.396954,0.186322,-0.159082,0.000453,0.098297,0.38994
yr,0.091982,1.0,0.108755,-0.05548,-0.050729,-0.030204,-0.088395,0.102669,0.098564,-0.049687,0.032343,-0.04992,0.067791,0.454139
mnth,0.889156,0.108755,1.0,-0.061999,-0.018742,0.077434,-0.057673,0.27378,0.286962,0.191686,-0.068427,0.011789,0.078755,0.298596
holiday,-0.045548,-0.05548,-0.061999,1.0,-0.018304,-0.235491,-0.10181,-0.04976,-0.051267,0.069668,-0.043476,-0.078229,0.068199,-0.047333
weekday,0.030311,-0.050729,-0.018742,-0.018304,1.0,0.085463,0.070126,0.106106,0.107546,-0.009734,0.050285,0.182021,0.085252,0.148657
workingday,0.133775,-0.030204,0.077434,-0.235491,0.085463,1.0,0.014253,0.073685,0.078391,-0.056504,-0.124132,-0.062136,0.155941,-0.00743
weathersit,-0.024979,-0.088395,-0.057673,-0.10181,0.070126,0.014253,1.0,-0.220772,-0.222938,0.421146,-0.017105,0.094081,-0.11577,-0.226241
temp,0.379806,0.102669,0.27378,-0.04976,0.106106,0.073685,-0.220772,1.0,0.997057,0.073308,-0.052187,-0.024265,0.155577,0.40431
atemp,0.396954,0.098564,0.286962,-0.051267,0.107546,0.078391,-0.222938,0.997057,1.0,0.090191,-0.071875,-0.024323,0.162372,0.408662
hum,0.186322,-0.049687,0.191686,0.069668,-0.009734,-0.056504,0.421146,0.073308,0.090191,1.0,-0.25634,0.066082,0.019044,-0.045908


In [81]:
#calculate correlations of features to label
test_df_corr_label = train_df.iloc[:, :].corr()["cnt"]
test_df_corr_label

season             0.211559
yr                 0.358050
mnth               0.140144
holiday           -0.048029
weekday            0.129789
workingday        -0.040338
weathersit        -0.184529
temp               0.380473
atemp              0.383553
hum               -0.078469
windspeed         -0.100107
leaflets           0.031955
price reduction    0.008591
cnt                1.000000
Name: cnt, dtype: float64

In [82]:
test_df_1 = test_df[test_df_corr_label.abs().sort_values(ascending=False).index[:7]]
test_df_1

Unnamed: 0,cnt,atemp,temp,yr,season,weathersit,mnth
0,3894,0.472846,19.3667,0,4.0,2,10
1,5936,0.427513,17.3565,1,2.0,1,4
2,5629,0.326383,13.0000,1,4.0,1,11
3,3310,0.270196,9.5333,0,4.0,1,12
4,4748,0.522721,21.3000,0,4.0,2,10
...,...,...,...,...,...,...,...
127,4990,0.475371,19.4333,1,1.0,1,3
128,5687,0.687508,30.6000,1,3.0,1,6
129,3907,0.575158,25.4000,0,4.0,2,9
130,5225,0.654688,29.1333,0,3.0,1,6


In [83]:
# Replacing all MissingValues with the most frequent values
imp_most_freq = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

train_df_1[:] = imp_most_freq.fit_transform(train_df_1)
test_df_1[:] = imp_most_freq.fit_transform(test_df_1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_1[:] = imp_most_freq.fit_transform(train_df_1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_1[:] = imp_most_freq.fit_transform(test_df_1)


In [84]:
# train test aplit

train_features = train_df_1.iloc[:, 1:]
train_labels = train_df_1.iloc[:, :1]

test_features = test_df_1.iloc[:, 1:]
test_labels = test_df_1.iloc[:, :1]

In [85]:
# Model building
reg_model_before_pp = linear_model.LinearRegression()

reg_model_before_pp.fit(train_features, train_labels)

print(reg_model_before_pp.coef_)

[[ 7.06848983e+03  3.17670316e+00  2.47496949e+03  2.96340462e+02
  -9.52774594e+02  2.82119368e+01]]


In [86]:
# Predict all test examples
ypred = reg_model_before_pp.predict(test_features)
ypred

array([[3505.50991795],
       [5844.47980062],
       [5905.96859876],
       [3051.04112903],
       [3864.1923684 ],
       [1277.86527256],
       [7977.17576614],
       [6261.89801325],
       [5484.15717911],
       [6050.2110409 ],
       [5159.69517143],
       [7606.29537727],
       [2118.70746289],
       [8442.02141818],
       [7010.26207478],
       [7984.69146087],
       [7949.3907678 ],
       [3179.51729579],
       [5443.46898288],
       [8210.12181981],
       [6285.20848893],
       [4327.74363122],
       [4389.75049337],
       [1304.54368466],
       [5290.79222507],
       [5499.56985529],
       [4530.94308369],
       [5031.20902191],
       [7317.77267668],
       [5566.95652338],
       [6865.13619596],
       [8083.13749153],
       [7417.62386439],
       [5939.11378247],
       [1263.76311936],
       [5696.82163705],
       [2430.01832627],
       [2272.93659979],
       [2877.0361149 ],
       [4949.04912879],
       [5171.12732183],
       [1220.823

In [97]:
print(f"Predict label for following example:\n{test_features.iloc[0,:]}")

#Predict a single value
predicted_value=reg_model_before_pp.predict([test_features.iloc[0,:]])[0]

print("\nPredicted label: ",predicted_value)
print("Actual label:", test_labels.iloc[0])
print("Deviation predicted from actual value: ",predicted_value-test_labels.iloc[0])

Predict label for following example:
atemp          0.472846
temp          19.366700
yr             0.000000
season         4.000000
weathersit     2.000000
mnth          10.000000
Name: 0, dtype: float64

Predicted label:  [3505.50991795]
Actual label: cnt    3894
Name: 0, dtype: int64
Deviation predicted from actual value:  cnt   -388.490082
Name: 0, dtype: float64




In [101]:
print(round(ypred[:10]-test_labels.iloc[0:10])*100000, 2)

           cnt
0  -38800000.0
1   -9200000.0
2   27700000.0
3  -25900000.0
4  -88400000.0
5  -17200000.0
6   93700000.0
7  -51600000.0
8   36500000.0
9  171800000.0 2


In [89]:
# Evaluate model

#Return Mean Absolute Error
mae = mean_absolute_error(test_labels, ypred)
print('MAE: %.3f' % mae)

#Return the coefficient of determination R^2 of the prediction.
print("R^2 value of the model: ",reg_model_before_pp.score(test_features, test_labels))

MAE: 1035.306
R^2 value of the model:  0.367551008576638


In [90]:
#Check p-values using statsmodels
X = train_features
y = train_labels

X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                    cnt   R-squared:                       0.296
Model:                            OLS   Adj. R-squared:                  0.289
Method:                 Least Squares   F-statistic:                     41.62
Date:                Mon, 11 Dec 2023   Prob (F-statistic):           2.21e-42
Time:                        16:22:47   Log-Likelihood:                -5656.8
No. Observations:                 600   AIC:                         1.133e+04
Df Residuals:                     593   BIC:                         1.136e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        539.7485    577.795      0.934      0.3