In [1]:
# import packages
import pandas as pd
from datetime import datetime
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

# Notebook-wide matplotlib defaults: wide figures and thicker axes borders.
plt.rcParams.update({
    'figure.figsize': (20, 8),
    'axes.linewidth': 2,
})



In [2]:
# Load the training data. The first CSV column is used as the index and
# parsed as dates, so `bikes.index` is a DatetimeIndex.
bikes = pd.read_csv(
    'train.csv',
    index_col=0,
    parse_dates=True,
)
bikes.head(2)

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40


In [3]:
# Expose the information contained in the timestamp as explicit columns so it
# can be used as model input.
#
# `DatetimeIndex.week` is deprecated (FutureWarning) and no longer exists in
# pandas 2.x; `isocalendar().week` returns the same ISO week numbers. The cast
# keeps the plain int64 dtype the old accessor produced.
bikes['week'] = bikes.index.isocalendar().week.astype('int64')
bikes['month'] = bikes.index.month
bikes['day'] = bikes.index.day
bikes['hour'] = bikes.index.hour
bikes['year'] = bikes.index.year
# Running month counter: 1 for January 2011, 13 for January 2012, and so on.
bikes['monthPERyear'] = bikes.index.month + (bikes['year'] - 2011) * 12
bikes.head(2)

  


Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,week,month,day,hour,year,monthPERyear
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,52,1,1,0,2011,1
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,52,1,1,1,2011,1


In [4]:
# Model the target on a base-10 log scale; predictions made on this scale
# have to be mapped back with 10 ** y (not with the natural exponential).
log_count = np.log10(bikes['count'])
bikes['countLog'] = log_count

In [5]:
# Columns that are not passed on to the modelling frame: the raw target and
# its two components, plus features that are left out of this model.
columns_not_used = [
    'registered', 'casual', 'temp', 'year',
    'month', 'count', 'humidity', 'season',
]
df = bikes.drop(columns=columns_not_used)

In [6]:
# Sanity check: show the first row of the reduced modelling frame.
df.head(1)

Unnamed: 0_level_0,holiday,workingday,weather,atemp,windspeed,week,day,hour,monthPERyear,countLog
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2011-01-01,0,0,1,14.395,0.0,52,1,0,1,1.20412


In [7]:
# Weather category 4 is (per the original note) present only once, so it is
# merged into the neighbouring category 3.
is_weather_4 = df['weather'] == 4
df.loc[is_weather_4, 'weather'] = 3

# $\color{yellow}{\text{Exploratory Data Analysis + Feature Engineering }}$

- $\color{red}{\text{Import packages}}$

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import Lasso 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
import pandas as pd
import os
# NOTE(review): debugging aid - this writes the absolute working directory
# (a local user path) into the saved notebook output.
print(os.getcwd())

/Users/alexandros.samartzis/Spiced_Academy/stationary-sriracha-student-code/week_3/spiced_projects


- $\color{blue}{\text{Separate notebooks were created for the exploration of the data. }}$


- $\color{red}{\text{Split Data }}$


In [9]:
# Separate the predictors from the log-scaled target.
# `df.drop` returns a new frame, so `df` is never mutated and the cell can be
# re-run safely (no `inplace=True`, no extra `pd.DataFrame(...)` wrapper).
X = df.drop(columns=['countLog'])
y = df['countLog']

print(X.shape)
print(y.shape)

# Hold out 33 % of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

(10886, 9)
(10886,)


- $\color{red}{\text{Define the pipelines }}$


In [10]:
# Plain polynomial expansions without scaling. Each pipeline has a single
# step and emits the bias column plus all powers up to the given degree.
pipe13 = Pipeline(steps=[
    (
        'polynomial_13',
        PolynomialFeatures(degree=13, interaction_only=False, include_bias=True),
    ),
])
pipe6 = Pipeline(steps=[
    (
        'polynomial_6',
        PolynomialFeatures(degree=6, interaction_only=False, include_bias=True),
    ),
])


In [11]:

# Min-max scaling followed by a polynomial expansion: degree 1 (bias + linear
# term) and degree 2 (bias + linear + squared term).
pipe1s = Pipeline(steps=[
    ('num_scaler', MinMaxScaler()),
    (
        'polynomial',
        PolynomialFeatures(degree=1, interaction_only=False, include_bias=True),
    ),
])
pipe2s = Pipeline(steps=[
    ('num_scaler', MinMaxScaler()),
    (
        'polynomial',
        PolynomialFeatures(degree=2, interaction_only=False, include_bias=True),
    ),
])

In [12]:
# Dense one-hot encoding that drops the first level of every feature.
# Categories not seen during fit are encoded as all zeros instead of raising,
# which makes them indistinguishable from the dropped first level.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
# - adjust the keyword when upgrading.
one_hot_encoder = OneHotEncoder(
    sparse=False,
    drop='first',
    handle_unknown='ignore',
)
OHE_pipeline = Pipeline(steps=[('OHE_pipe', one_hot_encoder)])

In [13]:
# Route every feature through its own preprocessing pipeline.
# Columns that are not listed here (e.g. holiday, workingday) are handled by
# ColumnTransformer's `remainder` setting, which is left at its default.
one_hot_columns = ['day', 'week', 'weather']
column_transformer = ColumnTransformer(transformers=[
    ('OHE_pipe', OHE_pipeline, one_hot_columns),      # categorical calendar/weather codes
    ('poly_hour', pipe13, ["hour"]),                  # degree-13 polynomial of the hour
    ('poly_monthPERyear', pipe6, ["monthPERyear"]),   # degree-6 polynomial of the month counter
    ('atemp_pipe', pipe1s, ["atemp"]),                # scaled, linear
    ('wind_pipe', pipe2s, ["windspeed"]),             # scaled, quadratic
])

In [14]:
# Fit the preprocessing on the training split only ...
X_train_no_int = column_transformer.fit_transform(X_train) 
# ... and reuse the already fitted transformer for the hold-out split.
X_test_no_int = column_transformer.transform(X_test)

In [15]:
# Wrap the transformed training matrix in a DataFrame so that the generated
# feature names can be inspected.
feature_names = column_transformer.get_feature_names_out()
df_no_int = pd.DataFrame(X_train_no_int, columns=feature_names)
df_no_int.columns

Index(['OHE_pipe__day_2', 'OHE_pipe__day_3', 'OHE_pipe__day_4',
       'OHE_pipe__day_5', 'OHE_pipe__day_6', 'OHE_pipe__day_7',
       'OHE_pipe__day_8', 'OHE_pipe__day_9', 'OHE_pipe__day_10',
       'OHE_pipe__day_11', 'OHE_pipe__day_12', 'OHE_pipe__day_13',
       'OHE_pipe__day_14', 'OHE_pipe__day_15', 'OHE_pipe__day_16',
       'OHE_pipe__day_17', 'OHE_pipe__day_18', 'OHE_pipe__day_19',
       'OHE_pipe__week_2', 'OHE_pipe__week_3', 'OHE_pipe__week_5',
       'OHE_pipe__week_6', 'OHE_pipe__week_7', 'OHE_pipe__week_9',
       'OHE_pipe__week_10', 'OHE_pipe__week_11', 'OHE_pipe__week_12',
       'OHE_pipe__week_13', 'OHE_pipe__week_14', 'OHE_pipe__week_15',
       'OHE_pipe__week_16', 'OHE_pipe__week_17', 'OHE_pipe__week_18',
       'OHE_pipe__week_19', 'OHE_pipe__week_20', 'OHE_pipe__week_22',
       'OHE_pipe__week_23', 'OHE_pipe__week_24', 'OHE_pipe__week_25',
       'OHE_pipe__week_26', 'OHE_pipe__week_27', 'OHE_pipe__week_28',
       'OHE_pipe__week_29', 'OHE_pipe__week_31', 'OH

- $\color{red}{\text{Use Linear Regression as model }}$


In [16]:
# Fit an ordinary least-squares model on the engineered training matrix and
# report its score on the same data (computed once and reused).
m = LinearRegression()
m.fit(X_train_no_int, y_train)
train_r2 = m.score(X_train_no_int, y_train)
f' Train scores: LinReg {round(train_r2, 3)}'

' Train scores: LinReg 0.708'

In [17]:
# Score of the fitted model on the hold-out split (computed once and reused).
test_r2 = m.score(X_test_no_int, y_test)
f' Test scores: LinReg {round(test_r2, 3)}'

' Test scores: LinReg 0.698'

In [18]:
# Predictions for the hold-out split; the model was trained on `countLog`,
# so these values are on the log10 scale.
y_pred = m.predict(X_test_no_int)

In [19]:
# Map the predictions back to bike counts. The target `countLog` was created
# with np.log10, so the inverse is 10 ** y; np.exp would invert a natural log
# and yield wrong counts.
ypred_exp = 10 ** y_pred

- $\color{red}{\text{Cross-validation}}$


In [20]:
from sklearn.model_selection import cross_val_score  #Run cross-validation for single metric evaluation.

- - $\color{blue}{\text{Cross-validation for Train data}}$

In [21]:
# 5-fold cross-validated R² of the linear model on the preprocessed
# training matrix.
cross_r2_lin = cross_val_score(
    estimator=m,          # model to evaluate
    X=X_train_no_int,     # preprocessed training features
    y=y_train,            # training target (log10 scale)
    scoring='r2',         # evaluation metric
    cv=5,                 # number of folds
)

In [22]:
# Show the per-fold scores followed by their rounded mean.
print(cross_r2_lin)
mean_cv_r2 = round(cross_r2_lin.mean(), 3)
print(f' Mean Cross validation score of train data: LinReg {mean_cv_r2}')

[0.68939739 0.72627404 0.67616337 0.72805105 0.70501467]
 Mean Cross validation score of train data: LinReg 0.705


- - $\color{blue}{\text{Cross-validation for Test data}}$

In [23]:
# 5-fold cross-validated R² computed on the preprocessed hold-out matrix.
# NOTE(review): this overwrites the `cross_r2_lin` result obtained on the
# training split.
cross_r2_lin = cross_val_score(
    estimator=m,          # model to evaluate
    X=X_test_no_int,      # preprocessed hold-out features
    y=y_test,             # hold-out target (log10 scale)
    scoring='r2',         # evaluation metric
    cv=5,                 # number of folds
)

In [24]:
# Show the per-fold scores followed by their rounded mean. These scores were
# computed on the hold-out split, so the label says "test data".
print(cross_r2_lin)
print(f' Mean Cross validation score of test data: LinReg {round(cross_r2_lin.mean(),3)}')

[0.70335086 0.70800427 0.67487045 0.67957276 0.69088601]
 Mean Cross validation score of train data: LinReg 0.691


# -  $\color{red}{\text{Kaggle}}$

In [25]:
# Load the Kaggle test set the same way as the training data: first column
# as index, parsed as dates.
df_k = pd.read_csv(
    'test.csv',
    index_col=0,
    parse_dates=True,
)
df_k.shape

(6493, 8)

In [26]:
# Preview the first rows of the Kaggle test set.
df_k.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


# Feature engineering for the Kaggle test data

In [27]:
# Build the same calendar features as for the training frame so the fitted
# column transformer finds identically named columns.
#
# `DatetimeIndex.week` is deprecated (FutureWarning) and no longer exists in
# pandas 2.x; `isocalendar().week` returns the same ISO week numbers. The cast
# keeps the plain int64 dtype the old accessor produced.
df_k['week'] = df_k.index.isocalendar().week.astype('int64')
df_k['month'] = df_k.index.month
df_k['day'] = df_k.index.day
df_k['hour'] = df_k.index.hour
df_k['year'] = df_k.index.year
# Running month counter: 1 for January 2011, 13 for January 2012, and so on.
df_k['monthPERyear'] = df_k.index.month + (df_k['year'] - 2011) * 12
df_k.head(2)

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,week,month,day,hour,year,monthPERyear
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,3,1,20,0,2011,1
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,3,1,20,1,2011,1


In [28]:
# Mirror the training preparation: remove the columns the model does not
# use and merge weather category 4 into category 3.
kaggle_columns_not_used = ['temp', 'year', 'month', 'humidity', 'season']
dfk = df_k.drop(columns=kaggle_columns_not_used)
is_weather_4 = dfk['weather'] == 4
dfk.loc[is_weather_4, 'weather'] = 3

In [31]:
# Apply the already fitted preprocessing and model to the Kaggle test set.
X_kag = column_transformer.transform(dfk)
kag_pred = m.predict(X_kag)
# The model predicts log10(count) (the target was built with np.log10), so the
# inverse transform is 10 ** y; np.exp would invert a natural log and yield
# wrong counts.
y_Kaggle = 10 ** kag_pred



In [32]:
# Put the predictions into the sample submission's `count` column and write
# the result to disk. `final` is already a DataFrame, so it is saved directly.
# NOTE(review): the predictions are assigned by position - this presumably
# relies on sampleSubmission.csv having the same row order as test.csv.
# NOTE(review): the output file name has no ".csv" extension.
final = pd.read_csv(
    'sampleSubmission.csv',
    index_col=0,
    parse_dates=True,
)
final['count'] = y_Kaggle
final.to_csv("bikes_predictions")