In [None]:
import pandas as pd
import numpy as np

!unzip /content/drive/MyDrive/household_power_consumption.zip

In [None]:
# Load the semicolon-separated readings and merge columns 0 and 1 (presumably the
# date and time fields) into a single parsed 'datetime' index.
# The file uses '?' as its placeholder for a missing reading, so declare it as an
# NA marker at load time: the columns then parse as numbers and any missing-value
# count taken right after loading includes those entries.
df = pd.read_csv('household_power_consumption.txt', sep=';', low_memory=False, infer_datetime_format=True,
                 parse_dates={'datetime':[0,1]}, index_col=['datetime'], na_values=['?'])

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Number of missing (NaN) entries in each column.
df.isnull().sum()

In [None]:
# Turn every '?' placeholder into a real NaN (modifies df in place).
df.replace(to_replace='?', value=np.nan, inplace=True)

In [None]:
# Cast every column to 64-bit floating point.
df = df.astype(np.float64)

In [None]:
# Independent copy of the Global_active_power column.
y = df['Global_active_power'].copy()

#### Question 11

Question 11
Resample the data from a minute sampling rate to a daily sampling rate (i.e., sum over the minutes for each day). Also, fill all the missing values with the mean (average) of their attribute/column. Which of the following is a plot of the daily global_active_power over time?

In [None]:
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

In [None]:
# Impute missing minute-level readings with their column mean BEFORE aggregating.
# sum() skips NaN, so imputing after the resample would find nothing to fill
# (fully-missing days would already be 0), and fillna('mean') would insert the
# literal string 'mean' rather than the column averages.
df_filled = df.fillna(df.mean())

# Aggregate the one-minute readings into daily totals.
df_resampled = df_filled.resample('D').sum()

# Daily global active power over time.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(df_resampled.index, df_resampled['Global_active_power'])
ax.set(title='Daily Global_active_power', xlabel='Date', ylabel='Global_active_power (daily sum)')
plt.show()

#### Question 12

What is the Pearson correlation coefficient between the global_active_power and global_reactive_power? To 2 decimal places.

In [None]:
# Pearson correlation between daily active and reactive power, shown to 2 decimal places.
corr = df_resampled.Global_active_power.corr(
    other=df_resampled.Global_reactive_power,
    method='pearson',
)
round(corr, 2)

#### Question 13

What is the Pearson correlation coefficient between the Voltage and global_intensity? To 2 decimal places

In [None]:
# Pearson correlation (the default method) between daily intensity and voltage, to 2 decimal places.
corr2 = df_resampled.Global_intensity.corr(
    other=df_resampled.Voltage,
    method='pearson',
)
round(corr2, 2)

#### Question 14

Using the daily sampling rate (sum), divide the data into a train set and a test set. The last 365 days are your test set and the first (x-365) days are your training set, where x is the length of the dataset. Use Facebook Prophet to train a univariate time series model using the time column (‘dt’ or ‘ds’) and the global_active_power (or ‘y’). Answer questions 14 - 16.

Evaluating the results of your time series modeling on the test set, what is the MAPE (in %) in 2 decimal places?

In [None]:
# Number of daily rows available.
df_resampled.shape[0]

In [None]:
# Number of training rows: everything except the final 365 days.
# Derived from the data rather than a typed-in row count so it stays correct if the frame changes.
len(df_resampled) - 365

In [None]:
# Chronological split: the final 365 days are held out for testing and everything
# before them is used for training. Slicing relative to the end of the frame keeps
# the two sets disjoint and avoids a hard-coded row count.
# .copy() gives each split its own data, so later in-place edits to train/test
# neither touch df_resampled nor trigger SettingWithCopyWarning.
train = df_resampled.iloc[:-365, :].copy()
test = df_resampled.iloc[-365:, :].copy()

# Every row must land in exactly one of the two splits.
assert len(train) + len(test) == len(df_resampled)
train.shape, test.shape

In [None]:
from fbprophet import Prophet

In [None]:
# Move the 'datetime' index into an ordinary column (in place), then preview the result.
train.reset_index(drop=False, inplace=True)
train.head(n=5)

In [None]:
# Same for the test split: turn the 'datetime' index into a regular column, in place.
test.reset_index(drop=False, inplace=True)

In [None]:
# Independent copies of the timestamp and target columns for each split.
X_train, y_train = train['datetime'].copy(), train['Global_active_power'].copy()
X_test, y_test = test['datetime'].copy(), test['Global_active_power'].copy()

In [None]:
# Assemble the two-column training frame: 'ds' holds the timestamps and 'y' the target values.
df_train = pd.DataFrame({'ds': X_train.values, 'y': y_train.values})

# Fit a default Prophet model on the training period.
model = Prophet()
model.fit(df_train)

In [None]:
# Build the test frame with the same 'ds'/'y' layout as the training frame.
df_test = pd.DataFrame(data=list(zip(X_test, y_test)), columns=['ds', 'y'])

# Forecast over the held-out dates. predict() called without a frame scores the
# training history instead, so the test frame must be passed explicitly.
forecast = model.predict(df_test)

In [None]:
def Metric(y_true, y_pred):
  """Return the mean absolute percentage error (MAPE), expressed in percent.

  Both arguments are converted to NumPy arrays first, so any array-like is
  accepted. Entries of ``y_true`` equal to zero are divided by as-is, which
  produces inf/nan in the result.
  """
  actual = np.array(y_true)
  predicted = np.array(y_pred)
  relative_error = (actual - predicted) / actual
  return np.abs(relative_error).mean() * 100

In [None]:
# MAPE of the forecast on the test set, in percent.
# The actual values live in df_test['y']; an empty column label raises KeyError.
assert len(forecast) == len(df_test), "forecast must contain one prediction per test row"
MAPE = Metric(df_test['y'], forecast['yhat'])
round(MAPE, 2)

#### Question 15

What is the RMSE in 2 decimal places?

In [None]:
from sklearn import metrics

# Root-mean-squared error of the forecast against the test values, to 2 decimal places.
round(
    np.sqrt(
        metrics.mean_squared_error(y_true=df_test['y'], y_pred=forecast['yhat'])
    ),
    2,
)

In [None]:
# Plot the fitted components of the forecast.
model.plot_components(fcst=forecast)

#### Question 17

Multivariate Time Series Forecasting with Facebook Prophet

In the last exercise, we used only the dependent variable (Global_Active_Power) and the time component for our modeling (ds vs y). Next, we will build a time series model using the other variables. These variables will be added to the forecast model as regressors in Facebook Prophet. So the six independent variables ['Global_reactive_power', 'Voltage','Global_intensity', 'Sub_metering_1','Sub_metering_2','Sub_metering_3'] will be named [‘add1’, ‘add2’, ‘add3’, ‘add4’, ‘add5’, ‘add6’] as the regressors. Split the data into train and test as done above and build a multivariate forecast model to forecast the last 365 days of the house consumption (‘global_active_power’). Answer questions 17-19:

Evaluating the results of your time series modeling on the test set, what is the MAPE (in %) in 2 decimal places?

In [None]:
# Build the multivariate modelling frame: 'ds' is the timestamp, 'y' the target,
# and 'add1'..'add6' the six additional regressors.
# reset_index() returns a new frame instead of mutating df_resampled, so the daily
# frame keeps its datetime index and this cell gives the same result when re-run.
new_df = df_resampled.reset_index().rename(columns = {'datetime':'ds','Global_active_power':'y',
                                           'Global_reactive_power':'add1','Voltage':'add2',
                                           'Global_intensity':'add3','Sub_metering_1':'add4',
                                          'Sub_metering_2':'add5','Sub_metering_3':'add6'})

In [None]:
# Chronological split: the final 365 days form the test set and all earlier days
# the training set. Slicing relative to the end avoids a hard-coded row count.
Train = new_df.iloc[:-365, :]
Test = new_df.iloc[-365:, :]

# Every row must land in exactly one of the two splits.
assert len(Train) + len(Test) == len(new_df)
Test.shape, Train.shape

In [None]:
# Fresh Prophet model with the six renamed columns registered as extra regressors.
# add_regressor returns the model itself, so the calls can be chained.
model = Prophet()
(
    model
    .add_regressor('add1')
    .add_regressor('add2')
    .add_regressor('add3')
    .add_regressor('add4')
    .add_regressor('add5')
    .add_regressor('add6')
)

In [None]:
# Fit the regressor-augmented model on the training period.
model = model.fit(df=Train)

In [None]:
# Forecast over the test rows (which carry the regressor columns the model needs).
prediction = model.predict(df=Test)

In [None]:
# MAPE of the multivariate forecast on the test set, in percent, to 2 decimal places.
round(
    Metric(y_true=Test['y'], y_pred=prediction['yhat']),
    2,
)

In [None]:
# RMSE of the multivariate forecast on the test set, to 2 decimal places.
round(
    np.sqrt(
        metrics.mean_squared_error(y_true=Test['y'], y_pred=prediction['yhat'])
    ),
    2,
)

In [None]:
# Plot the fitted components of the multivariate forecast.
model.plot_components(fcst=prediction)