### Dependencies

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from pylab import rcParams
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

### Loading the Data

In [3]:
# Load the household power-consumption readings into a datetime-indexed frame.
#   * 'nan' and '?' cells are read as missing values.
#   * the first two columns (positions 0 and 1) are merged and parsed into a
#     single 'datetime' column, which then becomes the index.
# NOTE(review): `infer_datetime_format` and the dict form of `parse_dates`
# appear to be deprecated in recent pandas releases -- confirm against the
# pinned pandas version before upgrading.
df = pd.read_csv(
    'household_power_consumption.csv',
    parse_dates={'datetime': [0, 1]},
    index_col=['datetime'],
    infer_datetime_format=True,
    na_values=['nan', '?'],
    low_memory=False,
)

### Preprocessing

#### Filling Missing Values

In [4]:
# Per-column count of missing values before imputation.
print(df.isna().sum())

# Impute every missing cell with the mean of its own column.
df = df.fillna(df.mean())

# Per-column count after imputation (shown as the cell output).
df.isna().sum()

Global_active_power      25979
Global_reactive_power    25979
Voltage                  25979
Global_intensity         25979
Sub_metering_1           25979
Sub_metering_2           25979
Sub_metering_3           25979
dtype: int64


Global_active_power      0
Global_reactive_power    0
Voltage                  0
Global_intensity         0
Sub_metering_1           0
Sub_metering_2           0
Sub_metering_3           0
dtype: int64

In [5]:
# Show the imputed minute-level frame; the rendered output is truncated to the
# first and last rows.
df

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16 17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
2006-12-16 17:25:00,5.360,0.436,233.63,23.0,0.0,1.0,16.0
2006-12-16 17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
2006-12-16 17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
2006-12-16 17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0
...,...,...,...,...,...,...,...
2010-11-26 20:58:00,0.946,0.000,240.43,4.0,0.0,0.0,0.0
2010-11-26 20:59:00,0.944,0.000,240.00,4.0,0.0,0.0,0.0
2010-11-26 21:00:00,0.938,0.000,239.82,3.8,0.0,0.0,0.0
2010-11-26 21:01:00,0.934,0.000,239.70,3.8,0.0,0.0,0.0


#### Down-Sampling to Daily (Aggregating Minute Fields)

In [6]:
# Aggregate the datetime-indexed readings into one row per calendar day by
# summing every column.
# NOTE(review): a daily *sum* of Voltage grows with the number of samples in
# the day; a mean may be the intended aggregate for that column -- confirm.
new_df = df.resample('D').sum()

In [None]:
# Draw every daily-aggregated series in its own small figure.
# Each entry is (column name, extra keyword arguments for Series.plot); an
# empty dict keeps matplotlib's default line colour.
plot_specs = [
    ('Global_active_power', {}),
    ('Global_reactive_power', {'color': 'g'}),
    ('Voltage', {'color': 'b'}),
    ('Global_intensity', {'color': 'y'}),
    ('Sub_metering_1', {'color': 'black'}),
    ('Sub_metering_2', {'color': 'r'}),
    ('Sub_metering_3', {'color': 'b'}),
]

for column, plot_kwargs in plot_specs:
    fig, ax = plt.subplots(figsize=(5, 3))
    # Title each figure with its column so it is identifiable on its own.
    new_df[column].plot(ax=ax, title=column, **plot_kwargs)
    plt.show()

### Checking Whether the Time Series Data (TSD) Is Stationary

In [31]:
# Augmented Dickey-Fuller (ADF) test for the first three daily series.
# The identical report is needed for every column, so iterate over the column
# names instead of repeating the block. The printed output is unchanged.
for position, column in enumerate(
        ['Global_active_power', 'Global_reactive_power', 'Global_intensity']):
    if position:
        # Separator between consecutive reports (none before the first one).
        print('-------------------------------------------------------')
    print(column)
    adf_result = adfuller(new_df[column])
    print('ADF Statistic:', adf_result[0])
    print('p-value:', adf_result[1])
    print('No. of lags used:', adf_result[2])
    print('No. of observations used :', adf_result[3])
    # Decision rule: the series is reported as stationary when the p-value is
    # below the 5% significance level.
    if adf_result[1] < 0.05:
        print('TSD is Stationary')
    else:
        print('TSD is not Stationary')
    print( 'Critical Values:' )
    for k, v in adf_result[4].items():
        print( f' {k} : {v} ' )

Global_active_power
ADF Statistic: -3.7317038452674294
p-value: 0.0036886778465987035
No. of lags used: 22
No. of observations used : 1419
TSD is Stationary
Critical Values:
 1% : -3.434966750462565 
 5% : -2.8635789736973725 
 10% : -2.5678555388041384 
-------------------------------------------------------
Global_reactive_power
ADF Statistic: -3.825850878287873
p-value: 0.0026536286982087025
No. of lags used: 22
No. of observations used : 1419
TSD is Stationary
Critical Values:
 1% : -3.434966750462565 
 5% : -2.8635789736973725 
 10% : -2.5678555388041384 
-------------------------------------------------------
Global_intensity
ADF Statistic: -3.8315901949388453
p-value: 0.002600148450485822
No. of lags used: 22
No. of observations used : 1419
TSD is Stationary
Critical Values:
 1% : -3.434966750462565 
 5% : -2.8635789736973725 
 10% : -2.5678555388041384 


In [32]:
# Augmented Dickey-Fuller (ADF) test for Voltage and the first two
# sub-metering series. One loop replaces three copies of the same report.
# The printed output is unchanged.
for position, column in enumerate(['Voltage', 'Sub_metering_1', 'Sub_metering_2']):
    if position:
        # Separator between consecutive reports (none before the first one).
        print('-------------------------------------------------------')
    print(column)
    adf_result = adfuller(new_df[column])
    print('ADF Statistic:', adf_result[0])
    print('p-value:', adf_result[1])
    print('No. of lags used:', adf_result[2])
    print('No. of observations used :', adf_result[3])
    # Decision rule: the series is reported as stationary when the p-value is
    # below the 5% significance level.
    if adf_result[1] < 0.05:
        print('TSD is Stationary')
    else:
        print('TSD is not Stationary')
    print( 'Critical Values:' )
    for k, v in adf_result[4].items():
        print( f' {k} : {v} ' )

Voltage
ADF Statistic: -2.959236896287508
p-value: 0.03888052099731732
No. of lags used: 23
No. of observations used : 1418
TSD is Stationary
Critical Values:
 1% : -3.4349700122033804 
 5% : -2.8635804131233096 
 10% : -2.567856305330816 
-------------------------------------------------------
Sub_metering_1
ADF Statistic: -5.467586362135142
p-value: 2.439170122515477e-06
No. of lags used: 22
No. of observations used : 1419
TSD is Stationary
Critical Values:
 1% : -3.434966750462565 
 5% : -2.8635789736973725 
 10% : -2.5678555388041384 
-------------------------------------------------------
Sub_metering_2
ADF Statistic: -5.08256688459692
p-value: 1.5195803709975897e-05
No. of lags used: 20
No. of observations used : 1421
TSD is Stationary
Critical Values:
 1% : -3.4349602407782758 
 5% : -2.8635761009296763 
 10% : -2.5678540089914974 


In [33]:
# Augmented Dickey-Fuller (ADF) report for the remaining series, Sub_metering_3.
print('-------------------------------------------------------')
print('Sub_metering_3')
adf_result = adfuller(new_df['Sub_metering_3'])

# Name the leading fields of the result that the report prints.
adf_stat, p_value, lags_used, n_obs, critical_values = adf_result[:5]

print('ADF Statistic:', adf_stat)
print('p-value:', p_value)
print('No. of lags used:', lags_used)
print('No. of observations used :', n_obs)
# Reported as stationary when the p-value is below the 5% significance level.
print('TSD is Stationary' if p_value < 0.05 else 'TSD is not Stationary')
print( 'Critical Values:' )
for level, threshold in critical_values.items():
    print( f' {level} : {threshold} ' )

-------------------------------------------------------
Sub_metering_3
ADF Statistic: -4.062034056784644
p-value: 0.001118342219387697
No. of lags used: 20
No. of observations used : 1421
TSD is Stationary
Critical Values:
 1% : -3.4349602407782758 
 5% : -2.8635761009296763 
 10% : -2.5678540089914974 


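All seven daily time series (TSDs) are stationary at the 5% significance level (ADF p-value < 0.05). Voltage is the weakest case: with p ≈ 0.039 and an ADF statistic of −2.96, it rejects the unit-root hypothesis at the 5% level but not at the 1% level (critical value −3.43).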

### Creating the Model