In [1]:
import pandas as pd

## Dealing with Messy Data

In [2]:
data = pd.read_csv("energydata_complete.csv")
data = data.drop(columns=["date"])
data.head()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [3]:
cols = data.columns

num_cols = data._get_numeric_data().columns

list(set(cols) - set(num_cols))

[]

In [4]:
data.isnull().sum()

Appliances     0
lights         0
T1             0
RH_1           0
T2             0
RH_2           0
T3             0
RH_3           0
T4             0
RH_4           0
T5             0
RH_5           0
T6             0
RH_6           0
T7             0
RH_7           0
T8             0
RH_8           0
T9             0
RH_9           0
T_out          0
Press_mm_hg    0
RH_out         0
Windspeed      0
Visibility     0
Tdewpoint      0
rv1            0
rv2            0
dtype: int64

In [5]:
outliers = {}
for i in range(data.shape[1]):
    min_t = data[data.columns[i]].mean() - (3 * data[data.columns[i]].std())
    max_t = data[data.columns[i]].mean() + (3 * data[data.columns[i]].std())
    count = 0
    for j in data[data.columns[i]]:
        if j < min_t or j > max_t:
            count += 1
    percentage = count/data.shape[0]
    outliers[data.columns[i]] = "%.3f" % percentage

outliers

{'Appliances': '0.027',
 'lights': '0.033',
 'T1': '0.001',
 'RH_1': '0.006',
 'T2': '0.010',
 'RH_2': '0.007',
 'T3': '0.003',
 'RH_3': '0.001',
 'T4': '0.000',
 'RH_4': '0.000',
 'T5': '0.001',
 'RH_5': '0.029',
 'T6': '0.005',
 'RH_6': '0.000',
 'T7': '0.000',
 'RH_7': '0.001',
 'T8': '0.000',
 'RH_8': '0.000',
 'T9': '0.000',
 'RH_9': '0.000',
 'T_out': '0.005',
 'Press_mm_hg': '0.005',
 'RH_out': '0.008',
 'Windspeed': '0.005',
 'Visibility': '0.002',
 'Tdewpoint': '0.000',
 'rv1': '0.000',
 'rv2': '0.000'}

## Data Rescaling

In [6]:
X = data.iloc[:,1:]
Y = data.iloc[:,0]

In [7]:
X = (X - X.min())/(X.max() - X.min())
X.head()

Unnamed: 0,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,0.428571,0.32735,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,...,0.223032,0.67729,0.37299,0.097674,0.894737,0.5,0.953846,0.538462,0.265449,0.265449
1,0.428571,0.32735,0.541326,0.225345,0.68214,0.215188,0.748871,0.351351,0.782437,0.175506,...,0.2265,0.678532,0.369239,0.1,0.894737,0.47619,0.894872,0.533937,0.372083,0.372083
2,0.428571,0.32735,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,...,0.219563,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848
3,0.571429,0.32735,0.52408,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,0.175506,...,0.219563,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261
4,0.571429,0.32735,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,0.178691,...,0.219563,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611


## Splitting the Dataset

In [8]:
X.shape

(19735, 27)

In [9]:
train_end = int(len(X) * 0.6)
dev_end = int(len(X) * 0.8)

In [10]:
X_shuffle = X.sample(frac=1)
Y_shuffle = Y.sample(frac=1)

In [11]:
x_train = X_shuffle.iloc[:train_end,:]
y_train = Y_shuffle.iloc[:train_end]
x_dev = X_shuffle.iloc[train_end:dev_end,:]
y_dev = Y_shuffle.iloc[train_end:dev_end]
x_test = X_shuffle.iloc[dev_end:,:]
y_test = Y_shuffle.iloc[dev_end:]

In [12]:
print(x_train.shape, y_train.shape)
print(x_dev.shape, y_dev.shape)
print(x_test.shape, y_test.shape)

(11841, 27) (11841,)
(3947, 27) (3947,)
(3947, 27) (3947,)


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
x_new, x_test_2, y_new, y_test_2 = train_test_split(X_shuffle, Y_shuffle, test_size=0.2, random_state=0)
dev_per = x_test_2.shape[0]/x_new.shape[0]
x_train_2, x_dev_2, y_train_2, y_dev_2 = train_test_split(x_new, y_new, test_size=dev_per, random_state=0)

In [15]:
print(x_train_2.shape, y_train_2.shape)
print(x_dev_2.shape, y_dev_2.shape)
print(x_test_2.shape, y_test_2.shape)

(11841, 27) (11841,)
(3947, 27) (3947,)
(3947, 27) (3947,)


## Building a Deep Neural Network

In [16]:
import torch
import torch.nn as nn

In [17]:
x_train = torch.tensor(x_train.values).float()
y_train = torch.tensor(y_train.values).float()

x_dev = torch.tensor(x_dev.values).float()
y_dev = torch.tensor(y_dev.values).float()

x_test = torch.tensor(x_test.values).float()
y_test = torch.tensor(y_test.values).float()

In [18]:
model = nn.Sequential(nn.Linear(x_train.shape[1],100),
                      nn.ReLU(),
                      
                      nn.Linear(100,50),
                      nn.ReLU(),
                      
                      nn.Linear(50,25),
                      nn.ReLU(),
                     
                      nn.Linear(25,1))

In [19]:
loss_function = torch.nn.MSELoss()

In [20]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [21]:
for i in range(100):
    y_pred = model(x_train)
    loss = loss_function(y_pred, y_train)
    print(i, loss.item())
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 21120.103515625
1 21070.6875
2 21028.091796875
3 20949.494140625
4 20811.826171875
5 20590.697265625
6 20254.576171875
7 19769.837890625
8 19102.986328125
9 18225.1171875
10 17120.0
11 15799.228515625
12 14327.5625
13 12865.1787109375
14 11733.8076171875
15 11469.779296875
16 12446.345703125
17 13476.6708984375
18 13431.90625
19 12710.1611328125
20 11950.2138671875
21 11503.6875
22 11417.8603515625
23 11570.599609375
24 11809.2587890625
25 12022.1298828125
26 12149.9248046875
27 12173.0478515625
28 12097.6181640625
29 11946.8369140625
30 11756.5546875
31 11571.5556640625
32 11438.6083984375
33 11393.193359375
34 11441.2666015625
35 11546.8857421875
36 11643.6494140625
37 11673.4541015625
38 11623.5087890625
39 11527.9111328125
40 11437.2060546875
41 11386.501953125
42 11383.021484375
43 11412.2451171875
44 11451.2646484375
45 11479.9814453125
46 11486.814453125
47 11470.1884765625
48 11437.3037109375
49 11400.7275390625
50 11373.6455078125
51 11364.7109375
52 11373.853515625
53 11391

In [22]:
pred = model(x_test[0])
print(y_test[0],pred)

tensor(120.) tensor([85.7024], grad_fn=<AddBackward0>)
