# Regression project

## 0. Load packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

%matplotlib inline

## 1. Read the data

In [2]:
# Load the raw records from data/bikeshare.csv into a DataFrame
# (the path is relative to the notebook's working directory)
data = pd.read_csv('data/bikeshare.csv')

In [3]:
data

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0,1,1
5,2011-01-01 05:00:00,1,0,0,2,9.84,12.880,75,6.0032,0,1,1
6,2011-01-01 06:00:00,1,0,0,1,9.02,13.635,80,0.0000,2,0,2
7,2011-01-01 07:00:00,1,0,0,1,8.20,12.880,86,0.0000,1,2,3
8,2011-01-01 08:00:00,1,0,0,1,9.84,14.395,75,0.0000,1,7,8
9,2011-01-01 09:00:00,1,0,0,1,13.12,17.425,76,0.0000,8,6,14


# 2. Data preprocessing

## 2.1. Handle a datetime variable

In [4]:
# Parse the timestamp column once so its calendar parts can be reused below
datetime = pd.DatetimeIndex(data['datetime'])

# Swap the raw timestamp for numeric 'year', 'month' and 'hour' features
data = data.assign(
    year=datetime.year,
    month=datetime.month,
    hour=datetime.hour,
).drop(columns='datetime')

In [5]:
data

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,hour
0,1,0,0,1,9.84,14.395,81,0.0000,3,13,16,2011,1,0
1,1,0,0,1,9.02,13.635,80,0.0000,8,32,40,2011,1,1
2,1,0,0,1,9.02,13.635,80,0.0000,5,27,32,2011,1,2
3,1,0,0,1,9.84,14.395,75,0.0000,3,10,13,2011,1,3
4,1,0,0,1,9.84,14.395,75,0.0000,0,1,1,2011,1,4
5,1,0,0,2,9.84,12.880,75,6.0032,0,1,1,2011,1,5
6,1,0,0,1,9.02,13.635,80,0.0000,2,0,2,2011,1,6
7,1,0,0,1,8.20,12.880,86,0.0000,1,2,3,2011,1,7
8,1,0,0,1,9.84,14.395,75,0.0000,1,7,8,2011,1,8
9,1,0,0,1,13.12,17.425,76,0.0000,8,6,14,2011,1,9


## 2.2. Rename the variable 'count'

In [6]:
# "count" is a method, so it's best to name that column something else
# Rebind the renamed frame rather than mutating it in place
data = data.rename(columns={'count': 'total'})

In [7]:
data

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,total,year,month,hour
0,1,0,0,1,9.84,14.395,81,0.0000,3,13,16,2011,1,0
1,1,0,0,1,9.02,13.635,80,0.0000,8,32,40,2011,1,1
2,1,0,0,1,9.02,13.635,80,0.0000,5,27,32,2011,1,2
3,1,0,0,1,9.84,14.395,75,0.0000,3,10,13,2011,1,3
4,1,0,0,1,9.84,14.395,75,0.0000,0,1,1,2011,1,4
5,1,0,0,2,9.84,12.880,75,6.0032,0,1,1,2011,1,5
6,1,0,0,1,9.02,13.635,80,0.0000,2,0,2,2011,1,6
7,1,0,0,1,8.20,12.880,86,0.0000,1,2,3,2011,1,7
8,1,0,0,1,9.84,14.395,75,0.0000,1,7,8,2011,1,8
9,1,0,0,1,13.12,17.425,76,0.0000,8,6,14,2011,1,9


## 2.3. Drop the variables named 'casual' and 'registered'

In [8]:
# In the rows previewed above, casual + registered adds up to the target,
# so keeping either column as a feature would leak the answer
data = data.drop(columns=['casual', 'registered'])

In [9]:
data

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,total,year,month,hour
0,1,0,0,1,9.84,14.395,81,0.0000,16,2011,1,0
1,1,0,0,1,9.02,13.635,80,0.0000,40,2011,1,1
2,1,0,0,1,9.02,13.635,80,0.0000,32,2011,1,2
3,1,0,0,1,9.84,14.395,75,0.0000,13,2011,1,3
4,1,0,0,1,9.84,14.395,75,0.0000,1,2011,1,4
5,1,0,0,2,9.84,12.880,75,6.0032,1,2011,1,5
6,1,0,0,1,9.02,13.635,80,0.0000,2,2011,1,6
7,1,0,0,1,8.20,12.880,86,0.0000,3,2011,1,7
8,1,0,0,1,9.84,14.395,75,0.0000,8,2011,1,8
9,1,0,0,1,13.12,17.425,76,0.0000,14,2011,1,9


## 2.4. Add dummy variables

In [10]:
# One-hot encode 'season'; the first level is dropped and acts as the reference category
season_dummies = pd.get_dummies(data['season'], prefix='season', drop_first=True)

# Replace the original column with its indicator columns (appended at the end)
data = pd.concat([data.drop(columns='season'), season_dummies], axis=1)

In [11]:
# # Handling 'hour' variable
# hour_dummies = pd.get_dummies(data.hour, prefix='hour')
# hour_dummies.drop(hour_dummies.columns[0], axis=1, inplace=True)
# data = pd.concat([data, hour_dummies], axis=1)

# data.drop('hour', axis=1, inplace=True)

In [12]:
data

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,total,year,month,hour,season_2,season_3,season_4
0,0,0,1,9.84,14.395,81,0.0000,16,2011,1,0,0,0,0
1,0,0,1,9.02,13.635,80,0.0000,40,2011,1,1,0,0,0
2,0,0,1,9.02,13.635,80,0.0000,32,2011,1,2,0,0,0
3,0,0,1,9.84,14.395,75,0.0000,13,2011,1,3,0,0,0
4,0,0,1,9.84,14.395,75,0.0000,1,2011,1,4,0,0,0
5,0,0,2,9.84,12.880,75,6.0032,1,2011,1,5,0,0,0
6,0,0,1,9.02,13.635,80,0.0000,2,2011,1,6,0,0,0
7,0,0,1,8.20,12.880,86,0.0000,3,2011,1,7,0,0,0
8,0,0,1,9.84,14.395,75,0.0000,8,2011,1,8,0,0,0
9,0,0,1,13.12,17.425,76,0.0000,14,2011,1,9,0,0,0


In [13]:
data.columns

Index(['holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity',
       'windspeed', 'total', 'year', 'month', 'hour', 'season_2', 'season_3',
       'season_4'],
      dtype='object')

## 2.5. Data preparation

In [14]:
# Separate the target column 'total' from the remaining predictor columns
Y = data['total']
X = data.drop(columns='total')

In [15]:
X.head(5)

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,hour,season_2,season_3,season_4
0,0,0,1,9.84,14.395,81,0.0,2011,1,0,0,0,0
1,0,0,1,9.02,13.635,80,0.0,2011,1,1,0,0,0
2,0,0,1,9.02,13.635,80,0.0,2011,1,2,0,0,0
3,0,0,1,9.84,14.395,75,0.0,2011,1,3,0,0,0
4,0,0,1,9.84,14.395,75,0.0,2011,1,4,0,0,0


In [16]:
Y.head(5)

0    16
1    40
2    32
3    13
4     1
Name: total, dtype: int64

In [17]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1234,
)

# 3. Train and validate models
- `sklearn.linear_model.Ridge`: [link](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html)
- `sklearn.linear_model.Lasso`: [link](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html)
- `sklearn.linear_model.ElasticNet`: [link](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html#sklearn.linear_model.ElasticNet)
- `sklearn.linear_model.HuberRegressor`: [link](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.HuberRegressor.html#sklearn.linear_model.HuberRegressor)
- `sklearn.linear_model.PassiveAggressiveRegressor`: [link](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveRegressor.html#sklearn.linear_model.PassiveAggressiveRegressor)
- `sklearn.tree.DecisionTreeRegressor`: [link](http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor)
- `sklearn.tree.ExtraTreeRegressor`: [link](http://scikit-learn.org/stable/modules/generated/sklearn.tree.ExtraTreeRegressor.html#sklearn.tree.ExtraTreeRegressor)
- `sklearn.ensemble.RandomForestRegressor`: [link](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor)
- `sklearn.neighbors.KNeighborsRegressor`: [link](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html#sklearn.neighbors.KNeighborsRegressor)
- `sklearn.kernel_ridge.KernelRidge`: [link](http://scikit-learn.org/stable/modules/generated/sklearn.kernel_ridge.KernelRidge.html#sklearn.kernel_ridge.KernelRidge)
- `sklearn.gaussian_process.GaussianProcessRegressor`: [link](http://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessRegressor.html#sklearn.gaussian_process.GaussianProcessRegressor)
- `sklearn.neural_network.MLPRegressor`: [link](http://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html#sklearn.neural_network.MLPRegressor)

In [18]:
# Import models
from sklearn.linear_model import Ridge, Lasso, ElasticNet, HuberRegressor, PassiveAggressiveRegressor
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neural_network import MLPRegressor

In [19]:
# Parallel registries filled by the cells below: `models` collects
# (name, estimator) pairs and `params` the matching parameter candidates
models, params = [], []

In [20]:
# Ridge: candidate values for its `alpha` hyperparameter
param = dict(alpha=[0.1, 0.3, 0.5, 1.0, 3.0, 5.0, 10.0])
model = ('Ridge', Ridge())

# Register the grid and the estimator at the same position in both lists
params.append(param)
models.append(model)

In [21]:
# Lasso: candidate values for its `alpha` hyperparameter
param = dict(alpha=[0.1, 0.3, 0.5, 1.0, 3.0, 5.0, 10.0])
model = ('Lasso', Lasso())

# Register the grid and the estimator at the same position in both lists
params.append(param)
models.append(model)

In [22]:
# ElasticNet: candidate values for `alpha` combined with candidate `l1_ratio` values
param = dict(
    alpha=[0.1, 0.3, 0.5, 1.0, 3.0, 5.0, 10.0],
    l1_ratio=[0.3, 0.5, 0.7],
)
model = ('ElasticNet', ElasticNet())

# Register the grid and the estimator at the same position in both lists
params.append(param)
models.append(model)

In [None]:
# HuberRegressor: give it a parameter grid and register it like the other
# models; with an empty grid and no append the estimator was never part of
# `models` / `params`.
model = ('HuberReg', HuberRegressor())
param = {
    # epsilon must be >= 1.0; 1.35 is the scikit-learn default
    'epsilon': [1.1, 1.35, 1.5, 1.75],
    # 0.0001 is the scikit-learn default
    'alpha': [0.0001, 0.001, 0.01, 0.1]
}
# NOTE(review): the candidate values are a starting point around the
# library defaults -- adjust to taste.

models.append(model)
params.append(param)

In [None]:
# Register the remaining candidates with default hyperparameters only.
#
# Estimator classes that already have an entry in `models` (e.g. Ridge, Lasso
# and ElasticNet, registered with their own grids in earlier cells) are
# skipped so they are not added a second time, and every newly added model
# gets an empty grid so `models` and `params` keep the same length and can be
# iterated in lockstep.
default_candidates = [
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('HR', HuberRegressor()),
    ('PAR', PassiveAggressiveRegressor()),
    ('DT', DecisionTreeRegressor()),
    ('ExtraTree', ExtraTreeRegressor()),
    ('RF', RandomForestRegressor()),
    ('KNN', KNeighborsRegressor()),
    ('KernelRidge', KernelRidge()),
    ('GP', GaussianProcessRegressor()),
    ('MLP', MLPRegressor()),
]

# Compare by estimator class rather than by label, so a model registered
# under a different display name is still recognised as already present.
registered_types = {type(estimator) for _, estimator in models}

for name, estimator in default_candidates:
    if type(estimator) in registered_types:
        continue
    models.append((name, estimator))
    # An empty grid means "default parameters only" in scikit-learn's ParameterGrid
    params.append({})

In [None]:
# Pretty-print the registered (name, estimator) pairs for a readable overview
from pprint import pprint
pprint(models)