<a href="https://colab.research.google.com/github/Otobi1/Bike-Demand-Prediction/blob/master/Bike_Demand_Prediction_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')

In [2]:
!wget http://archive.ics.uci.edu/ml/machine-learning-databases/00560/SeoulBikeData.csv

--2021-03-29 17:53:27--  http://archive.ics.uci.edu/ml/machine-learning-databases/00560/SeoulBikeData.csv
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 604166 (590K) [application/x-httpd-php]
Saving to: ‘SeoulBikeData.csv.2’


2021-03-29 17:53:28 (9.37 MB/s) - ‘SeoulBikeData.csv.2’ saved [604166/604166]



In [3]:
# Load the raw CSV; the Date column becomes the index (kept as text, not parsed).
df = pd.read_csv(
  './SeoulBikeData.csv',
  index_col='Date',
  encoding='unicode_escape',
)

# Work on a copy so the frame exactly as loaded stays available in `df`.
data = df.copy()

In [4]:
# Single 80/20 shuffle split, stratified on 'Seasons' so both parts keep the
# same seasonal mix; random_state pins the shuffle for reproducibility.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1 yields exactly one (train, test) pair of positional indices.
train_index, test_index = next(split.split(data, data['Seasons']))
strat_train_set = data.iloc[train_index]
strat_test_set = data.iloc[test_index]

In [5]:
# Size check of the training part of the split (rows, columns).
strat_train_set.shape

(7008, 13)

In [6]:
# Size check of the held-out test part of the split (rows, columns).
strat_test_set.shape

(1752, 13)

In [7]:
# Target column of the training split, copied so it is independent of the split frame.
data_labels = strat_train_set['Rented Bike Count'].copy()

# NOTE: from this cell on `data` holds only the training predictors; it no
# longer refers to the full copy of `df` created when the file was loaded.
data = strat_train_set.drop(columns='Rented Bike Count')
data

Unnamed: 0_level_0,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
29/06/2018,9,22.9,86,1.7,538,20.4,0.76,0.0,0.0,Summer,No Holiday,Yes
13/01/2018,18,-2.6,73,1.0,684,-6.7,0.01,0.0,0.0,Winter,No Holiday,Yes
26/11/2018,22,6.2,70,0.4,474,1.1,0.00,0.0,0.0,Autumn,No Holiday,Yes
05/06/2018,8,21.5,58,1.1,1021,12.8,1.21,0.0,0.0,Summer,No Holiday,Yes
01/06/2018,21,23.5,44,1.6,2000,10.5,0.00,0.0,0.0,Summer,No Holiday,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...
30/11/2018,3,-0.1,69,0.9,1338,-5.1,0.00,0.0,0.0,Autumn,No Holiday,Yes
21/06/2018,13,27.3,33,3.4,1921,9.6,3.42,0.0,0.0,Summer,No Holiday,Yes
22/02/2018,4,-2.4,58,0.3,2000,-9.5,0.00,0.0,0.0,Winter,No Holiday,Yes
30/11/2018,20,3.4,37,2.3,2000,-9.9,0.00,0.0,0.0,Autumn,No Holiday,Yes


In [8]:
def num_pipeline_transformer(data):
  """
  Pick out the numerical columns and build the pipeline that scales them.

  Argument:
    data: original dataframe
  Returns:
    num_attrs: dataframe holding only the float64 / int64 columns of data
    num_pipeline: unfitted Pipeline with a single StandardScaler step
  """
  # The pipeline is only constructed here; fitting is left to the caller.
  scaling_steps = [('std_scaler', StandardScaler())]
  num_pipeline = Pipeline(scaling_steps)

  num_attrs = data.select_dtypes(include=['float64', 'int64'])
  return num_attrs, num_pipeline

def pipeline_transformer(data, full_pipeline=None, return_pipeline=False):
  """
  Complete transformation pipeline for both
  numerical and categorical data.

  Numerical columns are standardised and the categorical columns
  'Seasons', 'Holiday' and 'Functioning Day' are one-hot encoded.

  By default a new pipeline is fitted on `data`. Statistics and categories
  learned that way are only meaningful for the frame they were fitted on,
  so to prepare other rows (a sample, the test set) pass the pipeline that
  was fitted on the training data via `full_pipeline`; it is then applied
  with `transform` only and never refitted.

  Argument:
    data: original dataframe
    full_pipeline: optional, already fitted ColumnTransformer to reuse;
      when None (default) a new one is built and fitted on `data`
    return_pipeline: when True, also return the pipeline that was used
  Returns:
    prepared_data: transformed, ready to use
    (prepared_data, full_pipeline) when return_pipeline is True
  """

  if full_pipeline is None:
    cat_attrs = ['Seasons', 'Holiday', 'Functioning Day']
    num_attrs, num_pipeline = num_pipeline_transformer(data)
    full_pipeline = ColumnTransformer([
         ('num', num_pipeline, list(num_attrs)),
         ('cat', OneHotEncoder(), cat_attrs),
         ])
    prepared_data = full_pipeline.fit_transform(data)
  else:
    # Reuse the fitted scaler/encoder so the output columns and scaling
    # match the data the pipeline was originally fitted on.
    prepared_data = full_pipeline.transform(data)

  if return_pipeline:
    return prepared_data, full_pipeline
  return prepared_data


In [9]:
# Fit the preprocessing on the training features and transform them in one step.
prepared_data = pipeline_transformer(data)
prepared_data

array([[-0.35968275,  0.83664277,  1.36677375, ...,  1.        ,
         0.        ,  1.        ],
       [ 0.93947602, -1.29688457,  0.72763035, ...,  1.        ,
         0.        ,  1.        ],
       [ 1.51687991, -0.56060847,  0.58013572, ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [-1.08143762, -1.28015103, -0.0098428 , ...,  1.        ,
         0.        ,  1.        ],
       [ 1.22817796, -0.79487814, -1.04230521, ...,  1.        ,
         0.        ,  1.        ],
       [-1.08143762, -0.20920397,  0.13765183, ...,  1.        ,
         0.        ,  1.        ]])

In [10]:
# First transformed row: scaled numerical values followed by the one-hot indicator columns.
prepared_data[0]

array([-0.35968275,  0.83664277,  1.36677375, -0.02323267, -1.4890478 ,
        1.24640775,  0.22153702, -0.13145357, -0.1724833 ,  0.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  1.        ])

In [11]:
# Shape check of the transformed training matrix (rows, feature columns).
prepared_data.shape

(7008, 17)

In [12]:
from sklearn.linear_model import LinearRegression

# Linear regression fitted on the preprocessed training matrix.
# `fit` returns the estimator itself, so a single chained call trains and binds it.
lin_reg = LinearRegression().fit(prepared_data, data_labels)
lin_reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [None]:
sample_data = data.iloc[:5]
sample_labels = data_labels.iloc[:5]

# Fit the preprocessing on the full training frame and only *transform* the
# sample. Calling pipeline_transformer(sample_data) would refit the scaler and
# the one-hot encoder on just these five rows: the values would be scaled with
# the wrong mean/variance, and because the sample contains only three seasons,
# one Holiday value and one Functioning Day value the encoder would emit 14
# columns instead of the 17 that lin_reg was trained on, so predict() fails.
train_num_attrs, train_num_pipeline = num_pipeline_transformer(data)
train_fitted_pipeline = ColumnTransformer([
     ('num', train_num_pipeline, list(train_num_attrs)),
     ('cat', OneHotEncoder(), ['Seasons', 'Holiday', 'Functioning Day']),
     ]).fit(data)

sample_data_prepared = train_fitted_pipeline.transform(sample_data)

print ("Prediction of samples:", lin_reg.predict(sample_data_prepared))