In [None]:
#load libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import acf, pacf

import os
# Walk the Kaggle input directory and print the full path of every file found.
kaggle_inputs = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in kaggle_inputs:
    print(path)

print("Setup Complete")

In [None]:
# Load the air-passenger dataset from the Kaggle input directory into a DataFrame.
df = pd.read_csv('../input/air-passengers/AirPassengers.csv')

In [None]:
# Display the frame; a long frame is rendered truncated to its first 5 and last 5 rows.
df

In [None]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

We can see that our data has only 2 columns, i.e. the Month column and the #Passengers column. We have a total of 144 entries and our data does not contain missing values. The Month column has the object datatype, so it would be impossible to fit a time series model on the data as it is. We will convert the column from object to datetime.

In [None]:
from datetime import datetime

In [None]:
# Parse the 'Month' strings into datetime64 values.
# The `infer_datetime_format` flag is dropped: it is deprecated since pandas 2.0,
# where a consistent format is inferred by default, and older versions parse
# the same strings without it.
df['Month'] = pd.to_datetime(df['Month'])

In [None]:
# Build a new frame indexed by Month; `df` itself is left unchanged.
dfin = df.set_index('Month')

In [None]:
# Inspect the frame that is now indexed by Month.
dfin

In [None]:
# Confirm the index type and the remaining column after re-indexing.
dfin.info()

Month column has been successfully changed to datetime and set as the index.

# Data Visualization
We will plot a graph to see how the data is distributed through time.

In [None]:
# Line plot of the passenger counts against the datetime index.
plt.ylabel("Number of Air Passengers")
plt.xlabel("Date")
plt.plot(dfin)

Just by looking at the plot, we can see that there is an increase in the number of air passengers from the year 1950 to 1960. Therefore, the data is not stationary.


# Stationarity
We will use Dickey-Fuller test to check for stationarity

In [None]:
# Rolling mean and standard deviation of the raw series over a 12-row window.
yearly_window = dfin.rolling(window=12)
rolmean = yearly_window.mean()
rolstd = yearly_window.std()
print(rolmean, rolstd)

In [None]:
def test_stationarity(timeseries, window=12, column='#Passengers'):
    """Plot rolling statistics and print Augmented Dickey-Fuller test results.

    The null hypothesis of the Dickey-Fuller test is that the series has a
    unit root (i.e. it is non-stationary); a small p-value is evidence
    against that null, in favour of stationarity.

    Parameters
    ----------
    timeseries : pandas.DataFrame or pandas.Series
        Series to examine. When a DataFrame is given, the test is run on
        ``timeseries[column]``; a Series is tested directly.
    window : int, default 12
        Number of rows in the rolling window used for the mean and the
        standard deviation.
    column : str, default '#Passengers'
        Column tested when ``timeseries`` is a DataFrame.

    Returns
    -------
    None
        The plot is shown and the test summary is printed.
    """
    # adfuller needs one-dimensional input: select the column of a DataFrame,
    # or use a Series as it is.
    if isinstance(timeseries, pd.DataFrame):
        values = timeseries[column]
    else:
        values = timeseries

    #Determine rolling statistics
    movingaverage = timeseries.rolling(window=window).mean()
    movingstd = timeseries.rolling(window=window).std()

    #Plot rolling statistics
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(movingaverage, color='red', label='Rolling Mean')
    plt.plot(movingstd, color='green', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    #Perform Dickey–Fuller test:
    print('Dickey Fuller Test Results:')
    adftest = adfuller(values, autolag='AIC')
    # The first four entries are the statistic, p-value, lags used and number
    # of observations; entry 4 maps significance level -> critical value.
    adfresult = pd.Series(adftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key, value in adftest[4].items():
        adfresult['Critical Value (%s)'%key] = value
    print(adfresult)

In [None]:
# Run the rolling-statistics plot and Dickey-Fuller test on the raw series.
test_stationarity(dfin)

The plot shows our original data in blue, the Rolling Mean in red and the Rolling Std in green. The Rolling Mean has a clearly upward trend whereas the Rolling Standard Deviation has a slightly upward trend. We could also say the rolling Standard Deviation is slightly constant. 

The p-value is high (well above 0.05), so we cannot reject the null hypothesis of the Dickey-Fuller test, which is that the series has a unit root (i.e. is non-stationary). We therefore conclude that our data is not stationary.

# Transformations
We will adjust our data to achieve simpler forecasting. We do this to simplify the patterns in the historical data by removing known sources of variation and to try to make the pattern constant across the time series. This will lead to a more accurate forecast.

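As a first step towards stationarity, we will transform our data with a log transformation. This compresses the larger values, which dampens the growing trend and stabilises the variance; the remaining trend is removed in the steps that follow.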

In [None]:
# Natural log of every value in the frame; the datetime index is unchanged.
dfin_log = np.log(dfin)
plt.plot(dfin_log)

The value of y has been changed to log values but the value of x remains the same.

Now we will calculate the rolling statistics  with the same window of 12. 

In [None]:
# Rolling mean and standard deviation of the log-scale series (12-row window).
log_window = dfin_log.rolling(window=12)
rolmean_log = log_window.mean()
rolstd_log = log_window.std()

In [None]:
# Overlay the log series with its rolling mean and rolling standard deviation.
for curve, colour, legend_label in (
    (dfin_log, 'blue', 'Original'),
    (rolmean_log, 'red', 'Rolling Mean'),
    (rolstd_log, 'green', 'Rolling Std'),
):
    plt.plot(curve, color=colour, label=legend_label)
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation (Log Scale)')

The time series as well as the rolling mean have a trend component. We will subtract the rolling mean scale from the time series in order to remove the trend component.

In [None]:
# Detrend: subtract the rolling mean from the log series, aligned on the index.
newdfin = dfin_log.sub(rolmean_log)
newdfin



We will remove the NaN values. 

In [None]:
# Keep only the rows without NaN (rebinding the name instead of mutating in place).
newdfin = newdfin.dropna()

In [None]:
# Inspect the detrended series after the NaN rows were dropped.
newdfin

In [None]:
# Re-run the stationarity check on the detrended log series.
test_stationarity(newdfin)

Subtracting the rolling mean from the log-scale series removed the trend component, and the data now looks stationary. The p-value dropped from 0.99 to 0.022, and the test statistic is now close to the critical values.

In [None]:
# Time-shift transformation: first difference of the log series
# (each value minus the value one row earlier).
dfin_log_diff = dfin_log.diff()
plt.plot(dfin_log_diff)

In [None]:
# Inspect the differenced series before the NaN rows are removed.
dfin_log_diff

In [None]:
# Drop the NaN rows left by the differencing, then re-plot the cleaned series.
dfin_log_diff = dfin_log_diff.dropna()
plt.plot(dfin_log_diff)

In [None]:
# Inspect the differenced series after the NaN rows were removed.
dfin_log_diff

In [None]:
# Stationarity check on the differenced log series.
test_stationarity(dfin_log_diff)

The above plot appears to be the best series. The rolling mean and the rolling standard deviation are fairly constant.

The Dickey-Fuller test however has a p-value of 0.07 which is higher than the previous 0.02. Test Statistic value is not as close to the critical values as well.

We will now break down the 3 components of the log scale series namely: 
* trend
* seasonality
* residuals

We will use a library function, `seasonal_decompose` from statsmodels, to do this. Once the components are separated, we can set the trend and seasonality aside and examine the nature of the residuals.

In [None]:
# Split the log series into trend, seasonal and residual components.
decomposition = seasonal_decompose(dfin_log)

trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

# One stacked panel per series: the original followed by its three components.
panels = (
    (411, dfin_log, 'Original'),
    (412, trend, 'Trend'),
    (413, seasonal, 'Seasonality'),
    (414, residual, 'Residuals'),
)
for position, component, legend_label in panels:
    plt.subplot(position)
    plt.plot(component, label=legend_label)
    plt.legend(loc='best')
plt.tight_layout()

In [None]:
# Residual component without its NaN rows.
# `dropna()` returns a new object, so `residual` (and the decomposition result
# it came from) is no longer mutated through the alias as it was with
# `dropna(inplace=True)`.
dfin_decompose = residual.dropna()

In [None]:
# Rolling mean and standard deviation of the residuals over a 12-row window.
rollingmean_decompose = dfin_decompose.rolling(window=12).mean()
rollingstd_decompose = dfin_decompose.rolling(window=12).std()

# Overlay the residuals with their rolling statistics.
for curve, colour, legend_label in (
    (dfin_decompose, 'blue', 'Original'),
    (rollingmean_decompose, 'red', 'Rolling Mean'),
    (rollingstd_decompose, 'green', 'Rolling Std'),
):
    plt.plot(curve, color=colour, label=legend_label)
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')

# ACF and PACF

In [None]:
# Autocorrelation of the differenced log series for lags up to 20.
lag_acf = acf(dfin_log_diff, nlags=20)
# Partial autocorrelation for the same lags, estimated with the 'ols' method.
lag_pacf = pacf(dfin_log_diff, nlags=20, method='ols')

In [None]:
# Half-width of the dashed band drawn around zero: 1.96 / sqrt(number of rows).
conf_bound = 1.96/np.sqrt(len(dfin_log_diff))

# Left panel: ACF, right panel: PACF. Each gets dashed red reference lines
# at zero and at the lower and upper bounds.
for position, lag_values, heading in (
    (121, lag_acf, 'Autocorrelation Function'),
    (122, lag_pacf, 'Partial Autocorrelation Function'),
):
    plt.subplot(position)
    plt.plot(lag_values)
    for level in (0, -conf_bound, conf_bound):
        plt.axhline(y=level, linestyle='--', color='red')
    plt.title(heading)

plt.tight_layout()

The ACF graph shows the curve first touching the y = 0 line at about lag x = 2, which suggests q = 2 for the MA order. The PACF graph likewise shows the curve first touching the y = 0 line at about lag x = 2, which suggests p = 2 for the AR order.

ARIMA is AR + I + MA. Before implementing the ARIMA model, let us check the results of the individual AR and MA models. Each model gives a residual sum of squares (RSS); a lower RSS indicates a better fit.


# AR Model
Making order = (2,1,0)

In [None]:
# AR model: order (2,1,0) fitted on the log series.
# NOTE(review): `fit(disp=-1)` belongs to the legacy `statsmodels.tsa.arima_model`
# API imported at the top of the notebook — confirm the installed statsmodels
# version still ships it.
model1 = ARIMA(dfin_log, order=(2,1,0))
results_AR = model1.fit(disp=-1)

# Fitted values are compared against the differenced log series.
residuals_AR = results_AR.fittedvalues - dfin_log_diff['#Passengers']

plt.plot(dfin_log_diff)
plt.plot(results_AR.fittedvalues, color='red')
plt.title('RSS: %.4f'%sum(residuals_AR**2))
print('AR model')

# MA Model
Making order = (0,1,2)

In [None]:
# MA model: order (0,1,2) fitted on the log series.
# NOTE(review): `fit(disp=-1)` belongs to the legacy `statsmodels.tsa.arima_model`
# API imported at the top of the notebook — confirm the installed statsmodels
# version still ships it.
model2 = ARIMA(dfin_log, order=(0,1,2))
results_MA = model2.fit(disp=-1)

# Fitted values are compared against the differenced log series.
residuals_MA = results_MA.fittedvalues - dfin_log_diff['#Passengers']

plt.plot(dfin_log_diff)
plt.plot(results_MA.fittedvalues, color='red')
plt.title('RSS: %.4f'%sum(residuals_MA**2))
print('MA model')

# ARIMA Model
Making order = (2,1,2)


In [None]:
# Combined model: order (2,1,2) fitted on the log series.
# NOTE(review): `fit(disp=-1)` belongs to the legacy `statsmodels.tsa.arima_model`
# API imported at the top of the notebook — confirm the installed statsmodels
# version still ships it.
model = ARIMA(dfin_log, order=(2,1,2))
results_ARIMA = model.fit(disp=-1)

# Fitted values are compared against the differenced log series.
residuals_ARIMA = results_ARIMA.fittedvalues - dfin_log_diff['#Passengers']

plt.plot(dfin_log_diff)
plt.plot(results_ARIMA.fittedvalues, color='red')
plt.title('RSS: %.4f'%sum(residuals_ARIMA**2))
print('ARIMA model')

**RSS values**
* AR Model = 1.5023
* MA Model = 1.4721
* ARIMA Model = 1.0292

ARIMA Model is better than the individual AR and MA Models.

We will move on to generate the predictions but we need to reconvert the predictions back to original form first before we do prediction plots. This is because we built our model on log transformed data.

In [None]:
# Prediction and reverse transformation, step 1: take an independent copy of
# the model's fitted values (the same values that were plotted against
# dfin_log_diff when the model was fitted).
predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
predictions_ARIMA_diff.head()

In [None]:
# Step 2: running total of the fitted differences, i.e. the accumulated change
# relative to the start of the series.
predictions_ARIMA_diff_csum = predictions_ARIMA_diff.cumsum()
predictions_ARIMA_diff_csum.head()

In [None]:
# Step 3: rebuild log-scale predictions. Start from a series that holds the
# first observed log value at every date, then add the accumulated differences;
# dates missing from the cumulative sum contribute 0 and keep the base value.
first_log_value = dfin_log['#Passengers'].iloc[0]
base_level = pd.Series(first_log_value, index=dfin_log.index)
predictions_ARIMA_log = base_level.add(predictions_ARIMA_diff_csum, fill_value=0)
predictions_ARIMA_log.head()

In [None]:
# The inverse of log is exp: bring the log-scale predictions back to the
# original scale and overlay them on the observed series.
predictions_ARIMA = np.exp(predictions_ARIMA_log)
plt.plot(dfin)
plt.plot(predictions_ARIMA)

From above plot, we can see that our predicted forecasts are very close to the real time series values. It also indicates a fairly accurate model.

In [None]:
# Peek at the first rows of the log-scale series that the model was fitted on.
dfin_log.head()



We have 144 (existing data of 12 yrs in months) data points. Now, we want to forecast for additional 10 yrs (10x12 months=120 data points).

144+120 = 264 records/data points


In [None]:
# Forecast horizon: 10 further years of monthly data beyond the observed series.
forecast_months = 10 * 12
# The end position is derived from the length of the fitted series instead of
# the hard-coded 144 + 120 = 264, so it stays correct if the data changes.
results_ARIMA.plot_predict(1, len(dfin_log) + forecast_months)