In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Time Series

## Live Demos

In [3]:
# Load the bike-sharing demand training CSV (path is relative to the working directory).
bike_sharing_data = pd.read_csv("data/bike-sharing-demand/train.csv")

In [4]:
# Display the raw frame; "datetime" is still an ordinary column at this point.
bike_sharing_data

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
10881,2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336
10882,2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241
10883,2012-12-19 21:00:00,4,0,1,1,13.94,15.910,61,15.0013,4,164,168
10884,2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129


In [5]:
# Check the column dtypes before any conversion ("datetime" is read in as object).
bike_sharing_data.dtypes

datetime       object
season          int64
holiday         int64
workingday      int64
weather         int64
temp          float64
atemp         float64
humidity        int64
windspeed     float64
casual          int64
registered      int64
count           int64
dtype: object

In [6]:
# Convert the "datetime" column from strings to proper timestamps.
bike_sharing_data["datetime"] = pd.to_datetime(bike_sharing_data["datetime"])

In [7]:
# Use the timestamp as the row index, so "datetime" is no longer a regular column.
bike_sharing_data = bike_sharing_data.set_index("datetime")

In [8]:
# One-hot encode the categorical columns. They are stored as integer codes, so they are
# cast to str first; get_dummies only expands object/string/category columns by default.
categorical_feature_names = ["season", "holiday", "workingday", "weather"]
categorical_features = pd.get_dummies(
    bike_sharing_data[categorical_feature_names].astype(str),
    drop_first = True,  # drop the first level of each feature; it becomes the baseline
    dtype = int,  # pandas >= 2.0 defaults to bool, which describe() leaves out; keep 0/1 integers
)
# Both frames share the datetime index. validate makes the merge fail loudly if timestamps
# are duplicated, instead of silently multiplying the matching rows.
bike_sharing_data = bike_sharing_data.merge(
    categorical_features, left_index = True, right_index = True, validate = "one_to_one"
)
# The indicator columns replace the original integer-coded columns.
bike_sharing_data = bike_sharing_data.drop(columns = categorical_feature_names)

In [9]:
# Inspect the frame after encoding: indicator columns replace the categorical ones.
bike_sharing_data

Unnamed: 0_level_0,temp,atemp,humidity,windspeed,casual,registered,count,season_2,season_3,season_4,holiday_1,workingday_1,weather_2,weather_3,weather_4
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2011-01-01 00:00:00,9.84,14.395,81,0.0000,3,13,16,0,0,0,0,0,0,0,0
2011-01-01 01:00:00,9.02,13.635,80,0.0000,8,32,40,0,0,0,0,0,0,0,0
2011-01-01 02:00:00,9.02,13.635,80,0.0000,5,27,32,0,0,0,0,0,0,0,0
2011-01-01 03:00:00,9.84,14.395,75,0.0000,3,10,13,0,0,0,0,0,0,0,0
2011-01-01 04:00:00,9.84,14.395,75,0.0000,0,1,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-19 19:00:00,15.58,19.695,50,26.0027,7,329,336,0,0,1,0,1,0,0,0
2012-12-19 20:00:00,14.76,17.425,57,15.0013,10,231,241,0,0,1,0,1,0,0,0
2012-12-19 21:00:00,13.94,15.910,61,15.0013,4,164,168,0,0,1,0,1,0,0,0
2012-12-19 22:00:00,13.94,17.425,61,6.0032,12,117,129,0,0,1,0,1,0,0,0


In [10]:
# Summary statistics, transposed so that each column of the data becomes one row.
bike_sharing_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
temp,10886.0,20.23086,7.79159,0.82,13.94,20.5,26.24,41.0
atemp,10886.0,23.655084,8.474601,0.76,16.665,24.24,31.06,45.455
humidity,10886.0,61.88646,19.245033,0.0,47.0,62.0,77.0,100.0
windspeed,10886.0,12.799395,8.164537,0.0,7.0015,12.998,16.9979,56.9969
casual,10886.0,36.021955,49.960477,0.0,4.0,17.0,49.0,367.0
registered,10886.0,155.552177,151.039033,0.0,36.0,118.0,222.0,886.0
count,10886.0,191.574132,181.144454,1.0,42.0,145.0,284.0,977.0
season_2,10886.0,0.251056,0.433641,0.0,0.0,0.0,1.0,1.0
season_3,10886.0,0.251056,0.433641,0.0,0.0,0.0,1.0,1.0
season_4,10886.0,0.251148,0.433694,0.0,0.0,0.0,1.0,1.0


In [11]:
# Features: every column except the rental counters (casual + registered equals count
# in the rows displayed above, so keeping them would leak the target).
bike_sharing_attributes = bike_sharing_data.drop(columns = ["casual", "registered", "count"])
# Target: the total rental count for each timestamp.
bike_sharing_target = bike_sharing_data["count"]

In [12]:
# Linear regression with scikit-learn's default settings.
model = LinearRegression()

In [13]:
# Fit on the full data set; nothing is held out for validation.
model.fit(bike_sharing_attributes, bike_sharing_target)

In [14]:
# R^2 computed on the same rows the model was fitted on, i.e. an in-sample score.
model.score(bike_sharing_attributes, bike_sharing_target)

0.2767549620228168

In [15]:
# In-sample predictions: the model is applied to the data it was trained on.
predictions = model.predict(bike_sharing_attributes)

In [16]:
# Inspect the predicted counts (NumPy abbreviates the array when displaying it).
predictions

array([ 13.10928117,   7.16917332,   7.16917332, ..., 178.11332088,
       176.95943452, 155.94285748])

In [17]:
# Root mean squared error of the in-sample predictions, in the same units as "count".
np.sqrt(mean_squared_error(bike_sharing_target, predictions))

154.04507737677713