### Dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from pylab import rcParams
from statsmodels.tsa.stattools import adfuller
from fbprophet import Prophet

### Loading the Data

In [None]:
# Load the raw readings. The first two file columns are merged into a single
# 'datetime' column, which becomes the index; the strings 'nan' and '?' are
# read as missing values.
df = pd.read_csv(
    'household_power_consumption.csv',
    parse_dates={'datetime': [0, 1]},
    infer_datetime_format=True,
    index_col=['datetime'],
    na_values=['nan', '?'],
    low_memory=False,
)

### Preprocessing

#### Filling Missing Values

In [None]:
# Per-column count of missing entries before imputation.
missing_before = df.isna().sum()
print(missing_before)

# Replace every gap with the mean of its own column, modifying df in place.
column_means = df.mean()
df.fillna(value=column_means, inplace=True)

# Cell output: per-column count of missing entries after imputation.
df.isna().sum()

In [None]:
# Display the frame as the cell output so it can be inspected by eye.
df

#### Down-Sampling to Daily (Aggregating Minute Fields)

In [None]:
# Collapse the datetime-indexed readings into one row per calendar day by
# summing every column.
# NOTE(review): the sum is applied to Voltage as well; confirm that a daily
# total (rather than a daily mean) is what is wanted for that column.
new_df = df.resample('D').sum()

In [None]:
# One small figure per daily series. Each entry pairs a column with the
# keyword arguments for its line plot; the first series passes none, so it
# keeps matplotlib's default colour.
series_styles = [
    ('Global_active_power', {}),
    ('Global_reactive_power', {'color': 'g'}),
    ('Voltage', {'color': 'b'}),
    ('Global_intensity', {'color': 'y'}),
    ('Sub_metering_1', {'color': 'black'}),
    ('Sub_metering_2', {'color': 'r'}),
    ('Sub_metering_3', {'color': 'b'}),
]
for column, plot_kwargs in series_styles:
    plt.figure(figsize=(5, 3))
    new_df[column].plot(**plot_kwargs)
    plt.show()

In [None]:
from scipy.stats import pearsonr

# Pearson correlation between active and reactive power, computed on the
# un-resampled `df`.
active_power = df['Global_active_power']
reactive_power = df['Global_reactive_power']
pear = pearsonr(active_power, reactive_power)
print(pear)

In [None]:
# Pearson correlation between voltage and current intensity, computed on the
# un-resampled `df`.
voltage = df['Voltage']
intensity = df['Global_intensity']
pear = pearsonr(voltage, intensity)
print(pear)

### Checking if the Time Series Data is Stationary or not

In [None]:
# Augmented Dickey-Fuller report for each listed daily series. A series is
# reported as stationary when the test's p-value is below 0.05.
for position, column in enumerate(
        ('Global_active_power', 'Global_reactive_power', 'Global_intensity')):
    if position:
        # Divider between consecutive reports (none before the first one).
        print('-------------------------------------------------------')
    print(column)
    adf_result = adfuller(new_df[column])
    statistic, p_value, lags_used, n_obs, critical_values = adf_result[:5]
    print('ADF Statistic:', statistic)
    print('p-value:', p_value)
    print('No. of lags used:', lags_used)
    print('No. of observations used :', n_obs)
    print('TSD is Stationary' if p_value < 0.05 else 'TSD is not Stationary')
    print( 'Critical Values:' )
    for level, threshold in critical_values.items():
        print( f' {level} : {threshold} ' )

In [None]:
# Augmented Dickey-Fuller report for each listed daily series. A series is
# reported as stationary when the test's p-value is below 0.05.
for position, column in enumerate(('Voltage', 'Sub_metering_1', 'Sub_metering_2')):
    if position:
        # Divider between consecutive reports (none before the first one).
        print('-------------------------------------------------------')
    print(column)
    adf_result = adfuller(new_df[column])
    statistic, p_value, lags_used, n_obs, critical_values = adf_result[:5]
    print('ADF Statistic:', statistic)
    print('p-value:', p_value)
    print('No. of lags used:', lags_used)
    print('No. of observations used :', n_obs)
    print('TSD is Stationary' if p_value < 0.05 else 'TSD is not Stationary')
    print( 'Critical Values:' )
    for level, threshold in critical_values.items():
        print( f' {level} : {threshold} ' )

In [None]:
# Augmented Dickey-Fuller report for the last sub-metering series. It is
# reported as stationary when the test's p-value is below 0.05.
print('-------------------------------------------------------')
print('Sub_metering_3')
adf_result = adfuller(new_df['Sub_metering_3'])
statistic, p_value, lags_used, n_obs, critical_values = adf_result[:5]
print('ADF Statistic:', statistic)
print('p-value:', p_value)
print('No. of lags used:', lags_used)
print('No. of observations used :', n_obs)
print('TSD is Stationary' if p_value < 0.05 else 'TSD is not Stationary')
print( 'Critical Values:' )
for level, threshold in critical_values.items():
    print( f' {level} : {threshold} ' )

All of the time series tested above are stationary: each ADF p-value is below the 0.05 threshold used in the checks.

### Creating the Model

In [None]:
# Move the datetime index back into an ordinary 'datetime' column (in place),
# then keep only the timestamp and the target series for the univariate model.
new_df.reset_index(inplace=True)
univariate_columns = ['datetime', 'Global_active_power']
prohp_df = new_df.loc[:, univariate_columns]

# Cell output: the two-column frame.
prohp_df

#### Train Test Split

In [None]:
# Chronological split: the first 1077 rows train the model, the rest are
# held out for evaluation.
split_row = 1077
train_df = prohp_df.iloc[:split_row]
test_df = prohp_df.iloc[split_row:]

# Cell output: the held-out rows.
test_df

In [None]:
# Prophet only accepts a timestamp column named 'ds' and a target column
# named 'y'; fitting on 'datetime' / 'Global_active_power' raises an error.
# The rename happens at the call site so train_df / test_df keep their
# original column names for the other cells.
prophet_columns = {'datetime': 'ds', 'Global_active_power': 'y'}

model = Prophet()
model.fit(train_df.rename(columns=prophet_columns))

# Predict for the held-out dates; the point predictions are in the 'yhat'
# column of `forecast`.
forecast = model.predict(test_df.rename(columns=prophet_columns))

#### Calculating Metrics

In [None]:
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error

# Score only the observed target against Prophet's point prediction ('yhat').
# Passing the whole frames would mix in the timestamp column and Prophet's
# auxiliary output columns, whose shapes do not even match.
# Plain arrays are used so the differing row labels of the two frames play no part.
y_true = test_df['Global_active_power'].to_numpy()
y_pred = forecast['yhat'].to_numpy()

# Mean Absolute Percentage Error (scikit-learn returns a fraction, not a 0-100 percentage)
mape = mean_absolute_percentage_error(y_true, y_pred)
print('%.2f'%mape, 'MAPE')

# Root Mean Squared Error
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
print('%.2f'%rmse, 'RMSE')

In [None]:
# Plot the components of the univariate forecast for visual inspection.
model.plot_components(forecast)

In the yearly seasonality component, August has the lowest value.

#### Multivariate Train Test Split

In [None]:
# Rename to the column names used by the multivariate model: 'ds' for the
# timestamp, 'y' for the target, and 'add1'..'add6' for the remaining
# series, which are registered as extra regressors.
multivariate_names = {
    'datetime': 'ds',
    'Global_active_power': 'y',
    'Global_reactive_power': 'add1',
    'Voltage': 'add2',
    'Global_intensity': 'add3',
    'Sub_metering_1': 'add4',
    'Sub_metering_2': 'add5',
    'Sub_metering_3': 'add6',
}
multi = new_df.rename(columns=multivariate_names)

In [None]:
# Chronological split at row 1077: earlier rows train the model, the rest
# are held out for evaluation.
multi_split_row = 1077
multi_train_df = multi.iloc[:multi_split_row]
multi_test_df = multi.iloc[multi_split_row:]

# Cell output: the held-out rows.
multi_test_df

In [None]:
# Prophet model that takes the six renamed series as additional regressors
# alongside the timestamp and target.
model_2 = Prophet()
for regressor_name in ('add1', 'add2', 'add3', 'add4', 'add5', 'add6'):
    model_2.add_regressor(regressor_name)

# Fit on the training rows; as the last expression, the result is shown as
# the cell output.
model_2.fit(multi_train_df)

In [None]:
# Predict for the held-out rows with the multivariate model. The test frame
# is passed whole, so it supplies 'ds' and the 'add1'..'add6' regressor columns.
forecasts = model_2.predict(multi_test_df)

#### Metrics

In [None]:
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error

# Score only the observed target ('y') against Prophet's point prediction
# ('yhat'). Passing the whole frames would mix in the timestamp, the
# regressor columns and Prophet's auxiliary output columns, whose shapes do
# not even match.
# Plain arrays are used so the differing row labels of the two frames play no part.
y_true = multi_test_df['y'].to_numpy()
y_pred = forecasts['yhat'].to_numpy()

# Mean Absolute Percentage Error (scikit-learn returns a fraction, not a 0-100 percentage)
mape = mean_absolute_percentage_error(y_true, y_pred)
print('%.2f'%mape, 'MAPE')

# Root Mean Squared Error
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
print('%.2f'%rmse, 'RMSE')

In [None]:
# Plot the components of the multivariate forecast for visual inspection.
model_2.plot_components(forecasts)

In the weekly seasonality component, Tuesday has the lowest value.