In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import pandas as pd
import numpy as np
import random
from sklearn.metrics import mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

## We read the data from the given csv file

In [0]:
data = pd.read_csv("/content/drive/My Drive/SMAI_Assignment3_Dataset/dataset_q4/household_power_consumption.txt", sep=';')
data
X = pd.DataFrame()
X['row_no'] = np.arange(len(data))
X['data'] = data['Global_active_power']

# Window size = 60

## We train our dataset for 50,000 samples using a window size of 60. In this phase, we ignore the rows having missing values.

In [0]:
window_size = 60
X2 = np.array([0] * (window_size + 1))

count = 1
while(count<=50000):
  row = random.randrange(len(X) - window_size - 1)
  temp = X[row:row+window_size+1]['data'].tolist()
  if('?' not in temp):
    temp = [float(item) for item in temp]
    X2 = np.vstack((X2,temp)) 
    count = count + 1

X2 = X2[1:,:]
print(X2)
print(X2.shape)

  

[[0.4   0.4   0.4   ... 0.428 0.43  0.428]
 [0.292 0.29  0.288 ... 0.43  0.434 0.528]
 [0.302 0.242 0.232 ... 0.212 0.212 0.212]
 ...
 [0.264 0.266 0.266 ... 1.656 1.76  2.688]
 [2.26  2.266 2.448 ... 2.412 2.338 2.31 ]
 [0.32  0.314 0.316 ... 0.422 0.42  0.418]]
(50000, 61)


## We split our data into train and test sets.

In [0]:
X_train = X2[0:35000,0:60]
y_train = X2[0:35000,60]

X_test = X2[35000:,0:60]
y_test = X2[35000:,60]
print(X_train)
print(y_train)

[[0.4   0.4   0.4   ... 0.432 0.428 0.43 ]
 [0.292 0.29  0.288 ... 0.424 0.43  0.434]
 [0.302 0.242 0.232 ... 0.212 0.212 0.212]
 ...
 [2.024 2.282 2.282 ... 1.88  1.888 1.876]
 [0.358 2.434 2.57  ... 2.902 3.882 4.952]
 [1.384 1.384 1.384 ... 0.292 0.252 0.22 ]]
[0.428 0.528 0.212 ... 1.876 4.946 0.22 ]


## Linear Regression model 1

In [0]:
from sklearn.linear_model import LinearRegression

model_reg = LinearRegression()
model_reg.fit(X_train, y_train)
y_pred_reg = model_reg.predict(X_test)

mse_reg = mean_squared_error(y_test, y_pred_reg)
print('Mean squared error: ', mse_reg)

r2_reg = r2_score(y_test, y_pred_reg)
print('r2 score: ', r2_reg)

Mean squared error:  0.0653094713737575
r2 score:  0.9429255892493377


## Multi Layer Perceptron (MLP)

## MLP model 1
## In this model, we use MLP having one hidden layer of 100 nodes using activation function 'relu'

In [0]:
model_nn = Sequential()
model_nn.add(Dense(100, input_dim=60, activation='relu'))
model_nn.add(Dense(1))
model_nn.compile(optimizer='adam', loss='mse', metrics=['mse'])
model_nn.fit(X_train, y_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f27259bc320>

In [0]:
y_pred_nn1 = model_nn.predict(X_test)
mse_nn1 = mean_squared_error(y_test, y_pred_nn1)
print('Mean squared error: ', mse_nn1)

r2_nn1 = r2_score(y_test, y_pred_nn1)
print('r2 score: ', r2_nn1)

Mean squared error:  0.06743341996900627
r2 score:  0.941069455491268


## MLP model 2
### In this model we add another hidden layer in our model and test the data. 

In [0]:
model_nn1 = Sequential()
model_nn1.add(Dense(100, input_dim=60, activation='relu'))
model_nn1.add(Dense(100, input_dim=60, activation='relu'))
model_nn1.add(Dense(1))
model_nn1.compile(optimizer='adam', loss='mse', metrics=['mse'])
model_nn1.fit(X_train, y_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f273bcccd68>

In [0]:
y_pred_nn2 = model_nn1.predict(X_test)
mse_nn2 = mean_squared_error(y_test, y_pred_nn2)
print('Mean squared error: ', mse_nn2)

r2_nn2 = r2_score(y_test, y_pred_nn2)
print('r2 score: ', r2_nn2)

Mean squared error:  0.069224194734125
r2 score:  0.9395044847089845


## MLP model 3
### In this model we change the activation function to 'sigmoid'

In [0]:
model_nn3 = Sequential()
model_nn3.add(Dense(100, input_dim=60, activation='sigmoid'))
model_nn3.add(Dense(1))
model_nn3.compile(optimizer='adam', loss='mse', metrics=['mse'])
model_nn3.fit(X_train, y_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f272cd9f400>

In [0]:
y_pred_nn3 = model_nn3.predict(X_test)
mse_nn3 = mean_squared_error(y_test, y_pred_nn3)
print('Mean squared error: ', mse_nn3)

r2_nn3 = r2_score(y_test, y_pred_nn3)
print('r2 score: ', r2_nn3)

Mean squared error:  0.076219229660009
r2 score:  0.9314961160221193


# Summary of the three models

In [0]:
lr_l1 = ["Linear Regression model 1", mse_reg, r2_reg]
mlp_l2 = ["MLP model 1", mse_nn1, r2_nn1]
mlp_l3 = ["MLP model 2", mse_nn2, r2_nn2]
mlp_l4 = ["MLP model 3", mse_nn3, r2_nn3]
data = [lr_l1, mlp_l2, mlp_l3, mlp_l4]
df = pd.DataFrame(data, columns = ['Model', 'Mean squared error', 'r2 score'])
df

Unnamed: 0,Model,Mean squared error,r2 score
0,Linear Regression model 1,0.065309,0.942926
1,MLP model 1,0.067433,0.941069
2,MLP model 2,0.069224,0.939504
3,MLP model 3,0.076219,0.931496


# Window size = 120

## We train our dataset for 50,000 samples using a window size of 120. In this phase, we ignore the rows having missing values.

In [0]:
window_size = 120
X2 = np.array([0] * (window_size + 1))

count = 1
while(count<=30000):
  row = random.randrange(len(X) - window_size - 1)
  temp = X[row:row+window_size+1]['data'].tolist()
  if('?' not in temp):
    temp = [float(item) for item in temp]
    X2 = np.vstack((X2,temp)) 
    count = count + 1

X2 = X2[1:,:]
print(X2)
print(X2.shape)

[[1.396 1.394 1.396 ... 1.188 1.186 1.184]
 [1.64  1.636 1.64  ... 1.414 1.41  1.412]
 [0.966 0.984 1.018 ... 0.236 0.236 0.236]
 ...
 [0.222 0.23  0.21  ... 1.234 1.24  1.238]
 [0.4   0.328 0.306 ... 0.39  0.388 0.43 ]
 [0.516 1.702 2.514 ... 1.502 1.422 1.412]]
(30000, 121)


## We split our data into train and test sets.

In [0]:
X_train = X2[0:21000,0:120]
y_train = X2[0:21000,120]

X_test = X2[21000:,0:120]
y_test = X2[21000:,120]
print(X_train)
print(y_train)

[[1.396 1.394 1.396 ... 1.188 1.188 1.186]
 [1.64  1.636 1.64  ... 1.406 1.414 1.41 ]
 [0.966 0.984 1.018 ... 0.254 0.236 0.236]
 ...
 [3.542 3.522 3.522 ... 1.814 1.79  1.806]
 [0.37  0.386 0.348 ... 0.326 0.306 0.322]
 [0.746 0.752 0.756 ... 1.772 1.882 1.936]]
[1.184 1.412 0.236 ... 1.792 0.302 1.932]


## Linear Regression model 2

In [0]:
from sklearn.linear_model import LinearRegression

model_reg2 = LinearRegression()
model_reg2.fit(X_train, y_train)
y_pred_reg2 = model_reg2.predict(X_test)

mse_reg2 = mean_squared_error(y_test, y_pred_reg2)
print('Mean squared error: ', mse_reg2)

r2_reg2 = r2_score(y_test, y_pred_reg2)
print('r2 score: ', r2_reg2)

Mean squared error:  0.0737459290811239
r2 score:  0.9337122808870182


## Multi Layer Perceptron (MLP)

## MLP model 4
## In this model, we use MLP having one hidden layer of 100 nodes using activation function 'relu'

In [0]:
model_nn4 = Sequential()
model_nn4.add(Dense(100, input_dim=120, activation='relu'))
model_nn4.add(Dense(1))
model_nn4.compile(optimizer='adam', loss='mse', metrics=['mse'])
model_nn4.fit(X_train, y_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f273aec0240>

In [0]:
y_pred_nn4 = model_nn4.predict(X_test)
mse_nn4 = mean_squared_error(y_test, y_pred_nn4)
print('Mean squared error: ', mse_nn4)

r2_nn4 = r2_score(y_test, y_pred_nn4)
print('r2 score: ', r2_nn4)

Mean squared error:  0.07395400051487365
r2 score:  0.9335252525191111


## MLP model 5
### In this model we add another hidden layer in our model and test the data. 

In [0]:
model_nn5 = Sequential()
model_nn5.add(Dense(100, input_dim=120, activation='relu'))
model_nn5.add(Dense(100, input_dim=120, activation='relu'))
model_nn5.add(Dense(1))
model_nn5.compile(optimizer='adam', loss='mse', metrics=['mse'])
model_nn5.fit(X_train, y_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f272d0d60f0>

In [0]:
y_pred_nn5 = model_nn5.predict(X_test)
mse_nn5 = mean_squared_error(y_test, y_pred_nn5)
print('Mean squared error: ', mse_nn5)

r2_nn5 = r2_score(y_test, y_pred_nn5)
print('r2 score: ', r2_nn5)

Mean squared error:  0.08150738651148945
r2 score:  0.9267357695532811


## MLP model 6
### In this model we change the activation function to 'sigmoid'

In [0]:
model_nn6 = Sequential()
model_nn6.add(Dense(100, input_dim=120, activation='sigmoid'))
model_nn6.add(Dense(1))
model_nn6.compile(optimizer='adam', loss='mse', metrics=['mse'])
model_nn6.fit(X_train, y_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f2736d9ceb8>

In [0]:
y_pred_nn6 = model_nn6.predict(X_test)
mse_nn6 = mean_squared_error(y_test, y_pred_nn6)
print('Mean squared error: ', mse_nn6)

r2_nn6 = r2_score(y_test, y_pred_nn6)
print('r2 score: ', r2_nn6)

Mean squared error:  0.08456300210476338
r2 score:  0.9239891801389505


# Summary of the three models having window size = 120

In [0]:
lr_l5 = ["Linear Regression model 2", mse_reg, r2_reg]
mlp_l6 = ["MLP model 4", mse_nn4, r2_nn4]
mlp_l7 = ["MLP model 5", mse_nn5, r2_nn5]
mlp_l8 = ["MLP model 6", mse_nn6, r2_nn6]
data = [lr_l5, mlp_l6, mlp_l7, mlp_l8]
df = pd.DataFrame(data, columns = ['Model', 'Mean squared error', 'r2 score'])
df

Unnamed: 0,Model,Mean squared error,r2 score
0,Linear Regression model 2,0.065309,0.942926
1,MLP model 4,0.073954,0.933525
2,MLP model 5,0.081507,0.926736
3,MLP model 6,0.084563,0.923989


# Overall Summary
### Here the first four rows displays the results by the models using window size = 60 and the next four rows displays the results by the models using window size = 120. 

In [0]:
data = [lr_l1, mlp_l2, mlp_l3, mlp_l4, lr_l5, mlp_l6, mlp_l7, mlp_l8]
df = pd.DataFrame(data, columns = ['Model', 'Mean squared error', 'r2 score'])
df

Unnamed: 0,Model,Mean squared error,r2 score
0,Linear Regression model 1,0.065309,0.942926
1,MLP model 1,0.067433,0.941069
2,MLP model 2,0.069224,0.939504
3,MLP model 3,0.076219,0.931496
4,Linear Regression model 2,0.065309,0.942926
5,MLP model 4,0.073954,0.933525
6,MLP model 5,0.081507,0.926736
7,MLP model 6,0.084563,0.923989


## From the above summary chart, we can see that we got comparable results using different kinds of models. Among them, we choose ***linear regression model using window size = 60*** to proceed further in our experiment. 

# Predicting the missing values in the given file

In [0]:
window_size = 60
X2 = np.array([0] * (window_size + 1))

count = 1
while(count<=50000):
  row = random.randrange(len(X) - window_size - 1)
  temp = X[row:row+window_size+1]['data'].tolist()
  if('?' not in temp):
    temp = [float(item) for item in temp]
    X2 = np.vstack((X2,temp)) 
    count = count + 1

X2 = X2[1:,:]
print(X2)
print(X2.shape)

[[0.266 0.176 0.176 ... 0.08  0.126 0.158]
 [0.31  0.308 0.312 ... 0.22  0.22  0.22 ]
 [3.82  2.648 1.736 ... 1.614 1.638 1.684]
 ...
 [0.504 0.5   0.498 ... 0.958 0.958 0.954]
 [0.282 0.244 0.228 ... 2.552 2.542 2.534]
 [0.234 0.234 0.234 ... 0.308 0.256 0.256]]
(50000, 61)


## We train our data using window size of 60

In [0]:
X_train = X2[:,0:60]
y_train = X2[:,60]
print(X_train)
print(X_train.shape)
print(y_train)
print(y_train.shape)

[[0.266 0.176 0.176 ... 0.1   0.08  0.126]
 [0.31  0.308 0.312 ... 0.22  0.22  0.22 ]
 [3.82  2.648 1.736 ... 1.62  1.614 1.638]
 ...
 [0.504 0.5   0.498 ... 0.96  0.958 0.958]
 [0.282 0.244 0.228 ... 2.554 2.552 2.542]
 [0.234 0.234 0.234 ... 0.328 0.308 0.256]]
(50000, 60)
[0.158 0.22  1.684 ... 0.954 2.534 0.256]
(50000,)


## Linear Regression model

In [0]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## We predict the labels of the missing values. The list of all the predicted values are printed below.

In [0]:
X1 = X
X1 = np.array(X['data'])
count = 0
predictions = []
for i in range(len(data)):
  temp = X1[i]
  if(temp == '?'):
    if(i < window_size):
      pred1 = np.nanmean()
      X1[i] = pred1
    else:
      temp1 = np.array(X1[i-60:i].tolist())
      temp1 = temp1.astype('float32')
      test = temp1.reshape((1,-1))
      pred1 = model.predict(test)
      predictions.append(pred1[0])
      X1[i] = pred1[0]
    count = count + 1

In [0]:
print(predictions)
print(len(predictions))

[0.2768042389679501, 0.32089765062589926, 6.153626816158928, 6.115620740789014, 3.231294899054876, 2.1335377186330016, 3.013238816134887, 2.9904687458780077, 0.39204437477491433, 0.5155145427752569, 0.5647864405399678, 0.6122453734926284, 0.6427281147004584, 0.6617368431108573, 0.6850599628552546, 0.7180394884217727, 0.752222540878565, 0.7800428121796706, 0.7947276416472373, 0.8089583331485553, 0.8130786193870977, 0.8192605887645726, 0.842733329780705, 0.8610526049204339, 0.8749344938117634, 0.8827708530485339, 0.889459978451874, 0.8821703393476223, 0.8879365875575768, 0.890112706396939, 0.8991467249768932, 0.9194919368992822, 0.9021588673766029, 0.9096668098414017, 0.907504811852284, 0.9104614117739431, 0.9076279643993963, 0.900523214147622, 0.9160958858751496, 0.9246171971578058, 0.9308171578278461, 0.9302217584077267, 0.9205899883530174, 0.9239210227530071, 0.9261668849616035, 0.9288523489457436, 0.9269597257242667, 0.92707542538167, 0.9380742742880156, 0.9487109051370436, 0.9495237