In [1]:
from sklearn import preprocessing
import pandas as pd
import numpy as np

In [2]:
# Read the three CSV exports from ./dataset in one pass; the paths are listed
# in the same order as the names they are unpacked into.
complete_df, training_df, testing_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './dataset/data_complete.csv',
        './dataset/training.csv',
        './dataset/testing.csv',
    )
)

In [4]:
# List each column's dtype to see which columns are numeric (int64/float64)
# and which are stored as generic objects.
complete_df.dtypes

date            object
Appliances       int64
lights           int64
T1             float64
RH_1           float64
T2             float64
RH_2           float64
T3             float64
RH_3           float64
T4             float64
RH_4           float64
T5             float64
RH_5           float64
T6             float64
RH_6           float64
T7             float64
RH_7           float64
T8             float64
RH_8           float64
T9             float64
RH_9           float64
T_out          float64
Press_mm_hg    float64
RH_out         float64
Windspeed      float64
Visibility     float64
Tdewpoint      float64
rv1            float64
rv2            float64
dtype: object

In [5]:
# Column dtypes of the training split; per the output it carries NSM,
# WeekStatus and Day_of_week in addition to the columns of complete_df.
training_df.dtypes

date            object
Appliances       int64
lights           int64
T1             float64
RH_1           float64
T2             float64
RH_2           float64
T3             float64
RH_3           float64
T4             float64
RH_4           float64
T5             float64
RH_5           float64
T6             float64
RH_6           float64
T7             float64
RH_7           float64
T8             float64
RH_8           float64
T9             float64
RH_9           float64
T_out          float64
Press_mm_hg    float64
RH_out         float64
Windspeed      float64
Visibility     float64
Tdewpoint      float64
rv1            float64
rv2            float64
NSM              int64
WeekStatus      object
Day_of_week     object
dtype: object

In [6]:
# Column dtypes of the testing split, to check its schema against the
# training split.
testing_df.dtypes

date            object
Appliances       int64
lights           int64
T1             float64
RH_1           float64
T2             float64
RH_2           float64
T3             float64
RH_3           float64
T4             float64
RH_4           float64
T5             float64
RH_5           float64
T6             float64
RH_6           float64
T7             float64
RH_7           float64
T8             float64
RH_8           float64
T9             float64
RH_9           float64
T_out          float64
Press_mm_hg    float64
RH_out         float64
Windspeed      float64
Visibility     float64
Tdewpoint      float64
rv1            float64
rv2            float64
NSM              int64
WeekStatus      object
Day_of_week     object
dtype: object

In [16]:
# Partition complete_df's columns by dtype. select_dtypes returns DataFrames,
# so despite its name categorical_columns_complete holds the object-typed
# columns themselves (only `date` in the dtype listing), not just their labels.
categorical_columns_complete = complete_df.select_dtypes(include=['object'])
numeric_data_complete = complete_df.select_dtypes(
    include=['float64', 'int64'],
)

### Standardization

In [37]:
# Method 1: standardize every numeric column to zero mean and unit variance,
# first by hand with NumPy and then with sklearn's preprocessing.scale.
complete = np.array(numeric_data_complete)

# Per-column statistics (axis=0 reduces over the rows).
complete_mean = complete.mean(axis=0)
# Population standard deviation (NumPy's default ddof=0) -- not the variance.
complete_std = complete.std(axis=0)

# A constant column has std == 0 and dividing by it would produce NaN.
# Divide such columns by 1 instead so they come out as all zeros; scikit-learn
# documents the same scaling factor of 1 for zero-variance features.
complete_std_safe = np.where(complete_std == 0, 1.0, complete_std)
complete1 = (complete - complete_mean) / complete_std_safe

# Library equivalent of the manual computation above.
complete_scale = preprocessing.scale(complete)

In [38]:
# Display the array returned by preprocessing.scale.
complete_scale

array([[-0.36767572,  3.30126384, -1.11864475, ...,  0.3669753 ,
        -0.80797358, -0.80797358],
       [-0.36767572,  3.30126384, -1.11864475, ...,  0.34313479,
        -0.44024015, -0.44024015],
       [-0.46521548,  3.30126384, -1.11864475, ...,  0.31929428,
         0.25210868,  0.25210868],
       ...,
       [ 1.68065927,  0.78103476,  2.37445166, ...,  2.26626907,
         0.29049435,  0.29049435],
       [ 3.14375569,  0.78103476,  2.37445166, ...,  2.25832223,
        -1.28759013, -1.28759013],
       [ 3.24129545,  0.78103476,  2.37445166, ...,  2.2503754 ,
         0.6298737 ,  0.6298737 ]])

In [40]:
#Method_2
X = np.array(numeric_data_complete)  
scaler = preprocessing.StandardScaler()  
X_scaled = scaler.fit_transform(X)

In [41]:
# Display the StandardScaler result so it can be compared with complete_scale.
X_scaled

array([[-0.36767572,  3.30126384, -1.11864475, ...,  0.3669753 ,
        -0.80797358, -0.80797358],
       [-0.36767572,  3.30126384, -1.11864475, ...,  0.34313479,
        -0.44024015, -0.44024015],
       [-0.46521548,  3.30126384, -1.11864475, ...,  0.31929428,
         0.25210868,  0.25210868],
       ...,
       [ 1.68065927,  0.78103476,  2.37445166, ...,  2.26626907,
         0.29049435,  0.29049435],
       [ 3.14375569,  0.78103476,  2.37445166, ...,  2.25832223,
        -1.28759013, -1.28759013],
       [ 3.24129545,  0.78103476,  2.37445166, ...,  2.2503754 ,
         0.6298737 ,  0.6298737 ]])

### Scaling features to a range

In [45]:
# Rescale every column onto the range [-1, 1]: fit learns the scaling from
# `complete`, transform then applies it to the same data.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
min_max_scaler.fit(complete)
complete_minmax = min_max_scaler.transform(complete)

In [46]:
# Display the data after rescaling to the (-1, 1) feature range.
complete_minmax

array([[-0.90654206, -0.14285714, -0.34530095, ...,  0.07692308,
        -0.46910219, -0.46910219],
       [-0.90654206, -0.14285714, -0.34530095, ...,  0.0678733 ,
        -0.25583421, -0.25583421],
       [-0.92523364, -0.14285714, -0.34530095, ...,  0.05882353,
         0.14569532,  0.14569532],
       ...,
       [-0.51401869, -0.71428571,  0.83949314, ...,  0.79788839,
         0.16795719,  0.16795719],
       [-0.23364486, -0.71428571,  0.83949314, ...,  0.79487179,
        -0.74725708, -0.74725708],
       [-0.21495327, -0.71428571,  0.83949314, ...,  0.7918552 ,
         0.36478114,  0.36478114]])

### Normalization

In [48]:
#Method_1
complete_normalized = preprocessing.normalize(complete, norm='l2')

In [49]:
# Display the L2-normalized data produced by preprocessing.normalize.
complete_normalized

array([[0.07858774, 0.03929387, 0.02605184, ..., 0.00694192, 0.01738811,
        0.01738811],
       [0.07859288, 0.03929644, 0.02605354, ..., 0.00681138, 0.02437191,
        0.02437191],
       [0.06553308, 0.03931985, 0.02606906, ..., 0.00668437, 0.03754084,
        0.03754084],
       ...,
       [0.32931037, 0.01219668, 0.03110154, ..., 0.01618093, 0.03561323,
        0.03561323],
       [0.47736441, 0.01136582, 0.02898284, ..., 0.01504077, 0.00718636,
        0.00718636],
       [0.48538035, 0.01128792, 0.02878418, ..., 0.01490005, 0.03851307,
        0.03851307]])

In [52]:
#Method_2
normalizer = preprocessing.Normalizer().fit(complete)  # fit does nothing 

In [53]:
# Display the Normalizer's repr to inspect its configured parameters.
normalizer

Normalizer(copy=True, norm='l2')

In [54]:
# Apply the normalizer to the same data; the result is only displayed, not
# stored in a variable.
normalizer.transform(complete)

array([[0.07858774, 0.03929387, 0.02605184, ..., 0.00694192, 0.01738811,
        0.01738811],
       [0.07859288, 0.03929644, 0.02605354, ..., 0.00681138, 0.02437191,
        0.02437191],
       [0.06553308, 0.03931985, 0.02606906, ..., 0.00668437, 0.03754084,
        0.03754084],
       ...,
       [0.32931037, 0.01219668, 0.03110154, ..., 0.01618093, 0.03561323,
        0.03561323],
       [0.47736441, 0.01136582, 0.02898284, ..., 0.01504077, 0.00718636,
        0.00718636],
       [0.48538035, 0.01128792, 0.02878418, ..., 0.01490005, 0.03851307,
        0.03851307]])

### Binarization

In [77]:
# Add a placeholder column of empty strings intended to hold the binarized
# WeekStatus. Note that this mutates training_df in place.
training_df['WeekStatus_Binarization'] = ''

In [78]:
# Preview only the first ten rows instead of dumping the whole frame.
training_df.head(10)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2,NSM,WeekStatus,Day_of_week,WeekStatus_Binarization
0,2016-01-11 17:00:00,60,30,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.000000,...,92.000000,7.000000,63.000000,5.300000,13.275433,13.275433,61200,Weekday,Monday,
1,2016-01-11 17:10:00,60,30,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.000000,...,92.000000,6.666667,59.166667,5.200000,18.606195,18.606195,61800,Weekday,Monday,
2,2016-01-11 17:20:00,50,30,19.890000,46.300000,19.200000,44.626667,19.790000,44.933333,18.926667,...,92.000000,6.333333,55.333333,5.100000,28.642668,28.642668,62400,Weekday,Monday,
3,2016-01-11 17:40:00,60,40,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.890000,...,92.000000,5.666667,47.666667,4.900000,10.084097,10.084097,63600,Weekday,Monday,
4,2016-01-11 17:50:00,50,40,19.890000,46.026667,19.200000,44.500000,19.790000,44.933333,18.890000,...,92.000000,5.333333,43.833333,4.800000,44.919484,44.919484,64200,Weekday,Monday,
5,2016-01-11 18:10:00,60,50,19.856667,45.560000,19.200000,44.500000,19.730000,44.900000,18.890000,...,91.833333,5.166667,40.000000,4.683333,33.039890,33.039890,65400,Weekday,Monday,
6,2016-01-11 18:20:00,60,40,19.790000,45.597500,19.200000,44.433333,19.730000,44.790000,18.890000,...,91.666667,5.333333,40.000000,4.666667,31.455702,31.455702,66000,Weekday,Monday,
7,2016-01-11 18:30:00,70,40,19.856667,46.090000,19.230000,44.400000,19.790000,44.863333,18.890000,...,91.500000,5.500000,40.000000,4.650000,3.089314,3.089314,66600,Weekday,Monday,
8,2016-01-11 19:00:00,430,50,20.133333,48.000000,19.566667,44.400000,19.890000,44.900000,19.000000,...,91.000000,6.000000,40.000000,4.600000,34.351142,34.351142,68400,Weekday,Monday,
9,2016-01-11 19:10:00,250,40,20.260000,52.726667,19.730000,45.100000,19.890000,45.493333,19.000000,...,90.500000,6.000000,40.000000,4.516667,19.205186,19.205186,69000,Weekday,Monday,


In [None]:
# Binarize WeekStatus without a Python-level row loop: compare the whole
# column at once and cast the boolean result to 0/1.
# NOTE(review): 'Weekday' -> 1 and anything else -> 0 is an assumed encoding;
# confirm the desired polarity.
training_df['WeekStatus_Binarization'] = (
    training_df['WeekStatus'] == 'Weekday'
).astype(int)

In [84]:
# DataFrame has no `.rows` attribute; preview the first rows with head().
# Use training_df.iterrows() or training_df.itertuples() when the rows
# actually need to be iterated.
training_df.head()