In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv('./datasets/household_power_consumption.txt', sep=';', 
                 usecols=[2],na_values=['?','nan'], low_memory = False)
l = df[df['Global_active_power'].isnull()].index.tolist()

In [3]:
df.head()

Unnamed: 0,Global_active_power
0,4.216
1,5.36
2,5.374
3,5.388
4,3.666


In [4]:
data = df.values
data

array([[4.216],
       [5.36 ],
       [5.374],
       ...,
       [0.938],
       [0.934],
       [0.932]])

In [5]:
data = [j for sub in data for j in sub]
data = [i.tolist() for i in data]
np.shape(data)

(2075259,)

In [6]:
trainX = list()
trainY = list()
for i in range(len(data)-60):
    data_check = np.isnan(data[i:i+61])
    if not np.sum(data_check):
        trainX.append(data[i:i+60])
        trainY.append(data[i+60])
trainX = np.array(trainX, dtype='float32')
trainY = np.array(trainY,  dtype='float32')

In [7]:
train_sample , test_sample = train_test_split(np.array(range(trainX.shape[0])))
testX,testY = trainX[test_sample],trainY[test_sample]
trainX, trainY = trainX[train_sample],trainY[train_sample]
testX.shape, trainX.shape, testY.shape, trainY.shape

((511278, 60), (1533832, 60), (511278,), (1533832,))

In [8]:
reg = LinearRegression()
reg.fit(trainX, trainY)
preds = reg.predict(testX)

In [9]:
r2_score(testY, preds)

0.9391456298120375

In [10]:
mean_squared_error(testY, preds)

0.068067916

In [11]:
def percentage_error(actual, predicted):
    res = np.empty(actual.shape)
    for j in range(actual.shape[0]):
        if actual[j] != 0:
            res[j] = (actual[j] - predicted[j]) / actual[j]
        else:
            res[j] = predicted[j] / np.mean(actual)
    return res

def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs(percentage_error(np.asarray(y_true), np.asarray(y_pred)))) * 100
print("Percentage Error : ", percentage_error(testY, preds))
print("Mean Absolute Percentage Error : ", mean_absolute_percentage_error(testY, preds))

Percentage Error :  [ 0.0512917   0.6312843  -0.02688237 ... -0.20801824 -0.09994718
 -0.07425408]
Mean Absolute Percentage Error :  10.776718316035689


In [12]:
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras import Sequential
import matplotlib.pyplot as plt
import math

look_back = 60
model = Sequential()
model.add(Dense(200, input_dim=look_back, activation='relu'))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='SGD')
model.fit(trainX, trainY, epochs=1, batch_size=128, verbose=2)
trainScore = model.evaluate(trainX, trainY, verbose=0)
print('Train Score: %.2f MSE (%.2f RMSE)' % (trainScore, math.sqrt(trainScore)))
testScore = model.evaluate(testX, testY, verbose=0)
print('Test Score: %.2f MSE (%.2f RMSE)' % (testScore, math.sqrt(testScore)))
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)

Train on 1533832 samples
1533832/1533832 - 16s - loss: 0.0803
Train Score: 0.07 MSE (0.26 RMSE)
Test Score: 0.07 MSE (0.26 RMSE)


In [14]:
testPredict = testPredict.ravel()
testPredict.shape

(511278,)

In [15]:

r2_score(testY, testPredict)

0.9390608630952686

In [16]:
mean_squared_error(testY, testPredict)

0.06816274

In [17]:
def percentage_error(actual, predicted):
    res = np.empty(actual.shape)
    for j in range(actual.shape[0]):
        if actual[j] != 0:
            res[j] = (actual[j] - predicted[j]) / actual[j]
        else:
            res[j] = predicted[j] / np.mean(actual)
    return res

def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs(percentage_error(np.asarray(y_true), np.asarray(y_pred)))) * 100
print("Percentage Error : ", percentage_error(testY, testPredict))
print("Mean Absolute Percentage Error : ", mean_absolute_percentage_error(testY, testPredict))

Percentage Error :  [ 0.06105394  0.61942768 -0.01856849 ... -0.08559883 -0.08511136
 -0.06958736]
Mean Absolute Percentage Error :  10.723691565797026


In [18]:
data=np.array(data)
nan_vals = list()
for i in l:
    testX = data[i-60:i]
    data[i] = model(testX.reshape(1,-1))[0][0]
    nan_vals.append(data[i])



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



In [19]:
len(nan_vals)

25979

In [20]:
nan_vals

[0.2669679522514343,
 0.2673649787902832,
 6.032180309295654,
 6.037687301635742,
 3.193481206893921,
 2.123649835586548,
 2.963982105255127,
 2.857618570327759,
 0.36710473895072937,
 0.47474685311317444,
 0.4845164120197296,
 0.4893534481525421,
 0.4894837737083435,
 0.499148428440094,
 0.4834160804748535,
 0.49710458517074585,
 0.5046551823616028,
 0.4816213548183441,
 0.47759127616882324,
 0.47975707054138184,
 0.4741268754005432,
 0.4718574285507202,
 0.4632280468940735,
 0.4498071074485779,
 0.4568777084350586,
 0.46614551544189453,
 0.46877557039260864,
 0.4509735405445099,
 0.4315040707588196,
 0.4169512093067169,
 0.40898391604423523,
 0.39818426966667175,
 0.39058494567871094,
 0.38095539808273315,
 0.37782448530197144,
 0.39711496233940125,
 0.39563822746276855,
 0.399202436208725,
 0.3898947536945343,
 0.39402472972869873,
 0.3936547636985779,
 0.3916507959365845,
 0.41008949279785156,
 0.4359184801578522,
 0.45591747760772705,
 0.47075188159942627,
 0.4689137935638428,
 0.

In [19]:
!jupyter nbconvert --to html q4.ipynb

[NbConvertApp] Converting notebook q4.ipynb to html
[NbConvertApp] Writing 325763 bytes to q4.html
