In [1]:
import pandas as pd
import numpy as np

# Daily bike-share dataset, read straight from the MicrosoftDocs ml-basics repository.
data_url = 'https://raw.githubusercontent.com/MicrosoftDocs/ml-basics/master/data/daily-bike-share.csv'
data = pd.read_csv(data_url)

In [2]:
# Inspect the column dtypes to see which columns are numeric and which are not.
data.dtypes

instant         int64
dteday         object
season          int64
yr              int64
mnth            int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
rentals         int64
dtype: object

In [3]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
data.isna().sum()

instant       0
dteday        0
season        0
yr            0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
rentals       0
dtype: int64

In [4]:
# Preview the first rows of the raw frame before any column selection.
data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,rentals
0,1,1/1/2011,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331
1,2,1/2/2011,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131
2,3,1/3/2011,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120
3,4,1/4/2011,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108
4,5,1/5/2011,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82


In [5]:
# Keep only the model inputs and the target ('rentals'); every other column is dropped.
model_columns = [
    'season', 'mnth', 'holiday', 'weekday', 'workingday',
    'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
    'rentals',
]
data = data[model_columns]

In [6]:
# Preview the frame again to confirm which columns remain after the selection.
data.head()

Unnamed: 0,season,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,rentals
0,1,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331
1,1,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131
2,1,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120
3,1,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108
4,1,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82


In [7]:
# Separate the predictors from the target, then hold out 20% of the rows for testing.
from sklearn.model_selection import train_test_split

y = data['rentals']
X = data.drop(columns=['rentals'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is repeatable
)


In [8]:
# Let's create the pipeline

In [9]:
# Let's import some packages
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [10]:
# Numerical and categorical columns need different transformations!

In [11]:
# Let's define the preprocessing sub-pipelines, one per feature type.

# Numeric features: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])


# Categorical features (integer-coded in this dataset):
# - Impute with the most frequent category, so an imputed value is always a
#   category the encoder saw during fit. The previous strategy='constant'
#   fills numeric data with 0, which is not a known code for columns that
#   appear to be 1-based (e.g. season, mnth) and made the encoder raise.
# - Encode categories never seen during fit as -1 instead of raising a
#   ValueError at predict time (requires scikit-learn >= 0.24).
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value',
                               unknown_value=-1))
])

In [12]:
# Declare which columns are numeric and which are categorical; the
# ColumnTransformer routes each group to its matching transformer.
numeric_features = ['temp', 'atemp', 'hum', 'windspeed']
categorical_features = [
    'season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit',
]

column_transformers = [
    ('numeric', numeric_transformer, numeric_features),
    ('categorical', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

#### Estimator

In [13]:
from sklearn.ensemble import RandomForestRegressor

# This is our overall model: preprocessing and estimator are bundled in a single
# Pipeline, so the same transformations are applied at fit and at predict time.
# The forest is seeded (same value as the train/test split) so that a fresh
# Restart & Run All reproduces the fitted model, its score and the saved file.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

In [14]:
# Let's train the model: preprocessing and the forest are fitted together on
# the training split only. Pipeline.fit returns the pipeline itself, so
# rf_model is simply another name for the fitted pipeline.
pipeline.fit(X_train, y_train)
rf_model = pipeline
rf_model

In [15]:
# Let's evaluate the fitted pipeline on the held-out split using R².
from sklearn.metrics import r2_score

predictions = rf_model.predict(X_test)

# Argument order matters: ground truth first, predictions second.
r2_score(y_true=y_test, y_pred=predictions)

0.772519515099845

In [16]:
# Persist the fitted pipeline (preprocessing + regressor) so it can be reloaded later.
import joblib

model_path = './rf_model.pkl'
joblib.dump(rf_model, model_path)

['./rf_model.pkl']

In [17]:
# Usage from another notebook (only load pickle files you trust):
# import joblib
# rf_model = joblib.load('PATH/TO/rf_model.pkl')
# new_prediction = rf_model.predict(new_data)