In [0]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [2]:
# Read the hourly dataset from the CSV file in the working directory.
df_hour = pd.read_csv('hour.csv')

# List the available columns, followed by blank separator lines.
print(df_hour.columns, end='\n\n\n')

# Feature selection: the predictor columns go into X, and the `cnt` column
# is kept separately as y (it is what the regressors below are fitted to).
df_hour_X = df_hour[[
    'season',
    'mnth',
    'hr',
    'workingday',
    'weathersit',
    'temp',
    'hum',
    'windspeed',
]]
df_hour_y = df_hour['cnt']

# Preview the first rows of the selected features, again followed by
# blank separator lines.
print(df_hour_X.head(), end='\n\n\n')

Index(['instant', 'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'registered', 'cnt'],
      dtype='object')


   season  mnth  hr  workingday  weathersit  temp   hum  windspeed
0       1     1   0           0           1  0.24  0.81        0.0
1       1     1   1           0           1  0.22  0.80        0.0
2       1     1   2           0           1  0.22  0.80        0.0
3       1     1   3           0           1  0.24  0.75        0.0
4       1     1   4           0           1  0.24  0.75        0.0




In [0]:
# Splitting the hour data into training and testing sets.
# train_test_split shuffles the rows before splitting; passing a fixed
# random_state makes that shuffle deterministic, so the split (and every
# metric computed from it) is reproducible after a kernel restart.
# The split proportions are left at the library default.
hour_X_train, hour_X_test, hour_y_train, hour_y_test = train_test_split(
    df_hour_X, df_hour_y, random_state=42
)

In [4]:
# Hour dataset: single train/test split evaluation (no K-fold cross validation).

# Lasso regression (alpha=0.1): fit on the training split, then score the
# predictions for the held-out split with mean squared error.
l_reg = linear_model.Lasso(alpha=0.1).fit(hour_X_train, hour_y_train)
hour_l_predictions = l_reg.predict(hour_X_test)
lasso_mse = mean_squared_error(hour_y_test, hour_l_predictions)
print("Lasso Regression MSE on hour dataset: %f" % lasso_mse)

# Ridge regression (alpha=0.5): same procedure on the same split.
r_reg = linear_model.Ridge(alpha=0.5).fit(hour_X_train, hour_y_train)
hour_r_predictions = r_reg.predict(hour_X_test)
ridge_mse = mean_squared_error(hour_y_test, hour_r_predictions)
print("Ridge Regression MSE on hour dataset: %f" % ridge_mse)

# Fitted Ridge coefficients, one per feature column of the training frame.
print("Ridge coefficients: ", r_reg.coef_)
print('\n')

Lasso Regression MSE on hour dataset: 22606.560105
Ridge Regression MSE on hour dataset: 22597.456105
ridge coefficients:  [ 1.93280298e+01  3.48284584e-02  7.56623692e+00  5.08977421e+00
  1.15437031e-01  2.96375084e+02 -2.17946784e+02  7.44291574e+00]




In [7]:
# Applying KFold cross validation with K=10
kf = KFold(n_splits=10)

# Test MSE of each fold, per model, collected so that a single
# cross-validated estimate (the mean over folds) can be reported at the end.
lasso_fold_mse = []
ridge_fold_mse = []

for train, test in kf.split(df_hour_X):
  # `train` / `test` are positional row indices, hence .iloc.
  # Fold-local names are used on purpose: rebinding hour_X_train,
  # hour_X_test, l_reg, r_reg, ... here would overwrite the hold-out split
  # and the models fitted on it, so re-running the hold-out cell afterwards
  # would silently evaluate on the last fold instead.
  fold_X_train, fold_X_test = df_hour_X.iloc[train], df_hour_X.iloc[test]
  fold_y_train, fold_y_test = df_hour_y.iloc[train], df_hour_y.iloc[test]

  # Lasso regression
  fold_l_reg = linear_model.Lasso(alpha=0.1)
  fold_l_reg.fit(fold_X_train, fold_y_train)

  fold_l_predictions = fold_l_reg.predict(fold_X_test)
  fold_l_mse = mean_squared_error(fold_y_test, fold_l_predictions)
  lasso_fold_mse.append(fold_l_mse)
  print("Lasso Regression MSE on hour dataset: %f" % fold_l_mse)

  # Ridge regression
  fold_r_reg = linear_model.Ridge(alpha=0.5)
  fold_r_reg.fit(fold_X_train, fold_y_train)

  fold_r_predictions = fold_r_reg.predict(fold_X_test)
  fold_r_mse = mean_squared_error(fold_y_test, fold_r_predictions)
  ridge_fold_mse.append(fold_r_mse)
  print("Ridge Regression MSE on hour dataset: %f" % fold_r_mse)
  print('\n')

# Cross-validated estimate: average of the per-fold test MSEs.
print("Lasso Regression mean MSE over %d folds: %f"
      % (len(lasso_fold_mse), sum(lasso_fold_mse) / len(lasso_fold_mse)))
print("Ridge Regression mean MSE over %d folds: %f"
      % (len(ridge_fold_mse), sum(ridge_fold_mse) / len(ridge_fold_mse)))

Lasso Regression MSE on hour dataset: 11183.715527
Ridge Regression MSE on hour dataset: 11174.003943


Lasso Regression MSE on hour dataset: 11589.548894
Ridge Regression MSE on hour dataset: 11604.182375


Lasso Regression MSE on hour dataset: 24992.733207
Ridge Regression MSE on hour dataset: 25171.491325


Lasso Regression MSE on hour dataset: 16254.678310
Ridge Regression MSE on hour dataset: 16258.456204


Lasso Regression MSE on hour dataset: 12859.541993
Ridge Regression MSE on hour dataset: 12816.065516


Lasso Regression MSE on hour dataset: 17338.029614
Ridge Regression MSE on hour dataset: 17412.233133


Lasso Regression MSE on hour dataset: 35951.613328
Ridge Regression MSE on hour dataset: 35925.332524


Lasso Regression MSE on hour dataset: 33695.769239
Ridge Regression MSE on hour dataset: 33633.476928


Lasso Regression MSE on hour dataset: 45870.249547
Ridge Regression MSE on hour dataset: 45766.028528


Lasso Regression MSE on hour dataset: 28029.165784
Ridge Regress