In [1]:
import pandas as pd

## Dealing with Messy Data

In [2]:
data = pd.read_csv("energydata_complete.csv")
data = data.drop(columns=["date"])
data.head()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [3]:
data.iloc[0,:]

Appliances      60.000000
lights          30.000000
T1              19.890000
RH_1            47.596667
T2              19.200000
RH_2            44.790000
T3              19.790000
RH_3            44.730000
T4              19.000000
RH_4            45.566667
T5              17.166667
RH_5            55.200000
T6               7.026667
RH_6            84.256667
T7              17.200000
RH_7            41.626667
T8              18.200000
RH_8            48.900000
T9              17.033333
RH_9            45.530000
T_out            6.600000
Press_mm_hg    733.500000
RH_out          92.000000
Windspeed        7.000000
Visibility      63.000000
Tdewpoint        5.300000
rv1             13.275433
rv2             13.275433
Name: 0, dtype: float64

In [4]:
data.isnull().sum()

Appliances     0
lights         0
T1             0
RH_1           0
T2             0
RH_2           0
T3             0
RH_3           0
T4             0
RH_4           0
T5             0
RH_5           0
T6             0
RH_6           0
T7             0
RH_7           0
T8             0
RH_8           0
T9             0
RH_9           0
T_out          0
Press_mm_hg    0
RH_out         0
Windspeed      0
Visibility     0
Tdewpoint      0
rv1            0
rv2            0
dtype: int64

In [5]:
outliers = {}
for i in range(data.shape[1]):
    min_t = data[data.columns[i]].mean() - (3 * data[data.columns[i]].std())
    max_t = data[data.columns[i]].mean() + (3 * data[data.columns[i]].std())
    count = 0
    for j in data[data.columns[i]]:
        if j < min_t or j > max_t:
            count += 1
    percentage = count/data.shape[0]
    outliers[data.columns[i]] = "%.3f" % percentage

outliers

{'Appliances': '0.027',
 'lights': '0.033',
 'T1': '0.001',
 'RH_1': '0.006',
 'T2': '0.010',
 'RH_2': '0.007',
 'T3': '0.003',
 'RH_3': '0.001',
 'T4': '0.000',
 'RH_4': '0.000',
 'T5': '0.001',
 'RH_5': '0.029',
 'T6': '0.005',
 'RH_6': '0.000',
 'T7': '0.000',
 'RH_7': '0.001',
 'T8': '0.000',
 'RH_8': '0.000',
 'T9': '0.000',
 'RH_9': '0.000',
 'T_out': '0.005',
 'Press_mm_hg': '0.005',
 'RH_out': '0.008',
 'Windspeed': '0.005',
 'Visibility': '0.002',
 'Tdewpoint': '0.000',
 'rv1': '0.000',
 'rv2': '0.000'}

## Data Rescaling

In [6]:
X = data.iloc[:,1:]
Y = data.iloc[:,0]

In [7]:
X = (X - X.min())/(X.max() - X.min())
X.head()

Unnamed: 0,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,0.428571,0.32735,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,...,0.223032,0.67729,0.37299,0.097674,0.894737,0.5,0.953846,0.538462,0.265449,0.265449
1,0.428571,0.32735,0.541326,0.225345,0.68214,0.215188,0.748871,0.351351,0.782437,0.175506,...,0.2265,0.678532,0.369239,0.1,0.894737,0.47619,0.894872,0.533937,0.372083,0.372083
2,0.428571,0.32735,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,...,0.219563,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848
3,0.571429,0.32735,0.52408,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,0.175506,...,0.219563,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261
4,0.571429,0.32735,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,0.178691,...,0.219563,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611


## Splitting the Dataset

In [8]:
X.shape

(19735, 27)

In [9]:
train_end = int(len(X) * 0.6)
dev_end = int(len(X) * 0.8)

In [10]:
x_train = X.iloc[:train_end,:]
y_train = Y.iloc[:train_end]
x_dev = X.iloc[train_end:dev_end,:]
y_dev = Y.iloc[train_end:dev_end]
x_test = X.iloc[dev_end:,:]
y_test = Y.iloc[dev_end:]

In [11]:
print(x_train.shape, y_train.shape)
print(x_dev.shape, y_dev.shape)
print(x_test.shape, y_test.shape)

(11841, 27) (11841,)
(3947, 27) (3947,)
(3947, 27) (3947,)


## Building a Deep Neural Network

In [12]:
import torch
import torch.nn as nn

In [13]:
x_train = torch.tensor(x_train.values).float()
y_train = torch.tensor(y_train.values).float()

x_dev = torch.tensor(x_dev.values).float()
y_dev = torch.tensor(y_dev.values).float()

x_test = torch.tensor(x_test.values).float()
y_test = torch.tensor(y_test.values).float()

In [21]:
model = nn.Sequential(nn.Linear(x_train.shape[1],100),
                      nn.ReLU(),
                      
                      nn.Linear(100,50),
                      nn.ReLU(),
                      
                      nn.Linear(50,25),
                      nn.ReLU(),
                     
                      nn.Linear(25,1))

In [22]:
loss_function = torch.nn.MSELoss()

In [23]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [24]:
for i in range(100):
    y_pred = model(x_train)
    loss = loss_function(y_pred, y_train)
    print(i, loss.item())
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 20684.015625
1 20652.1796875
2 20599.068359375
3 20495.044921875
4 20304.099609375
5 19984.173828125
6 19476.806640625
7 18735.81640625
8 17700.69140625
9 16371.310546875
10 14760.0263671875
11 13077.4111328125
12 11783.3447265625
13 11756.220703125
14 13276.3583984375
15 13794.2900390625
16 13108.7158203125
17 12141.033203125
18 11608.7724609375
19 11567.6572265625
20 11802.2734375
21 12093.052734375
22 12309.8681640625
23 12395.875
24 12357.9814453125
25 12203.8076171875
26 11979.634765625
27 11753.5205078125
28 11584.068359375
29 11539.02734375
30 11617.890625
31 11763.0439453125
32 11867.220703125
33 11841.7685546875
34 11731.5859375
35 11608.091796875
36 11539.0849609375
37 11528.7880859375
38 11560.19140625
39 11620.7734375
40 11646.23828125
41 11658.3671875
42 11631.09375
43 11581.7958984375
44 11534.3955078125
45 11507.6044921875
46 11516.150390625
47 11537.7685546875
48 11560.3125
49 11561.4384765625
50 11553.5126953125
51 11530.1884765625
52 11514.4794921875
53 11496.824218

In [25]:
pred = model(x_test[0])
print(y_test[0],pred)

tensor(370.) tensor([74.0591], grad_fn=<AddBackward0>)
