In [14]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.covariance import EllipticEnvelope
from sklearn.decomposition import PCA

### Load the data set for Bike Rentals

In [45]:
bike = pd.read_csv('https://github.com/Nishant2415/Machine-Learning/raw/master/Dataset/bike_updated.csv')
bike.head(20)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,1,0,0,1,9.84,14.395,75,0.0,0,1,1
5,1,0,0,2,9.84,12.88,75,6.0032,0,1,1
6,1,0,0,1,9.02,13.635,80,0.0,2,0,2
7,1,0,0,1,8.2,12.88,86,0.0,1,2,3
8,1,0,0,1,9.84,14.395,75,0.0,1,7,8
9,1,0,0,1,13.12,17.425,76,0.0,8,6,14


### For season, convert the categorial data into numeric value using following:season (1:springer, 2:summer, 3:fall, 4:winter)

In [46]:
temp = {"season":{1:'springer',2:'summer',3:'fall',4:'winter'}}
print(temp)
bike.replace(temp,inplace=True)
bike

{'season': {1: 'springer', 2: 'summer', 3: 'fall', 4: 'winter'}}


Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,springer,0,0,1,9.84,14.395,81,0.0000,3,13,16
1,springer,0,0,1,9.02,13.635,80,0.0000,8,32,40
2,springer,0,0,1,9.02,13.635,80,0.0000,5,27,32
3,springer,0,0,1,9.84,14.395,75,0.0000,3,10,13
4,springer,0,0,1,9.84,14.395,75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
10881,winter,0,1,1,15.58,19.695,50,26.0027,7,329,336
10882,winter,0,1,1,14.76,17.425,57,15.0013,10,231,241
10883,winter,0,1,1,13.94,15.910,61,15.0013,4,164,168
10884,winter,0,1,1,13.94,17.425,61,6.0032,12,117,129


### Scale the count of casual users in the range of 1 to 5

In [47]:
bike_casual = np.array(bike['casual']).reshape(-1,1)
minmax_scale = preprocessing.MinMaxScaler(feature_range=[1,5])
scaled_feature = minmax_scale.fit_transform(bike_casual)
scaled_feature

array([[1.03269755],
       [1.08719346],
       [1.05449591],
       ...,
       [1.04359673],
       [1.13079019],
       [1.04359673]])

### Scale the count of registered users with variance as 1 and mean as 0


In [48]:
bike_registered = np.array(bike['registered']).reshape(-1,1)
std_scaler = preprocessing.StandardScaler()
standarized = std_scaler.fit_transform(bike_registered)
standarized

array([[-0.94385353],
       [-0.81805246],
       [-0.851158  ],
       ...,
       [ 0.05593396],
       [-0.25525818],
       [-0.47375478]])

### Find the outliers with the contamination as 0.2

In [49]:
outlier_detector = EllipticEnvelope(contamination = 0.2)
outlier_detector.fit(np.array(bike['registered']).reshape(-1,1))
pre = outlier_detector.predict(np.array(bike['registered']).reshape(-1,1))
np.where(pre==-1)

(array([  813,   822,   846, ..., 10879, 10880, 10881], dtype=int64),)

### Convert the humidity into binary from the discrete value. If the humidity is below 0.5, it should be 0 else 1

In [50]:
humidity = np.where(bike['humidity']<0.5,0,1)
humidity

array([1, 1, 1, ..., 1, 1, 1])

### Classify the column of windspeed intodifferent classes based on the value. The bins should be 0.25,0.50,0.75

In [51]:
bike['windspeed_bins'] = pd.cut(bike['windspeed'], [0,25,50,75])
bike.head(20)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,windspeed_bins
0,springer,0,0,1,9.84,14.395,81,0.0,3,13,16,
1,springer,0,0,1,9.02,13.635,80,0.0,8,32,40,
2,springer,0,0,1,9.02,13.635,80,0.0,5,27,32,
3,springer,0,0,1,9.84,14.395,75,0.0,3,10,13,
4,springer,0,0,1,9.84,14.395,75,0.0,0,1,1,
5,springer,0,0,2,9.84,12.88,75,6.0032,0,1,1,"(0.0, 25.0]"
6,springer,0,0,1,9.02,13.635,80,0.0,2,0,2,
7,springer,0,0,1,8.2,12.88,86,0.0,1,2,3,
8,springer,0,0,1,9.84,14.395,75,0.0,1,7,8,
9,springer,0,0,1,13.12,17.425,76,0.0,8,6,14,


### Apply PCA to last 9 attributes and engineer it to 2 attributes

In [53]:
bike_pca = bike.drop(['season', 'holiday', 'windspeed_bins'],axis=1)
bike_pca

Unnamed: 0,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,0,1,9.84,14.395,81,0.0000,3,13,16
1,0,1,9.02,13.635,80,0.0000,8,32,40
2,0,1,9.02,13.635,80,0.0000,5,27,32
3,0,1,9.84,14.395,75,0.0000,3,10,13
4,0,1,9.84,14.395,75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...
10881,1,1,15.58,19.695,50,26.0027,7,329,336
10882,1,1,14.76,17.425,57,15.0013,10,231,241
10883,1,1,13.94,15.910,61,15.0013,4,164,168
10884,1,1,13.94,17.425,61,6.0032,12,117,129


In [54]:
min_max_scaler = preprocessing.MinMaxScaler(feature_range=[0, 1])
scaler = min_max_scaler.fit_transform(bike_pca)
pca = PCA(n_components=0.99)
features_pca = pca.fit_transform(scaler)

### Compare the total number of attributes before PCA and after PCA

In [56]:
print('Original features: ',bike_pca.shape[1])
print('Reduced features: ',features_pca.shape[1])

Original features:  9
Reduced features:  7
