In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [48]:
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import acf,pacf
# Import ARIMA from the maintained module: the legacy
# statsmodels.tsa.arima_model implementation was deprecated in statsmodels 0.12
# and removed in 0.13, and the modelling cell below uses this class.
from statsmodels.tsa.arima.model import ARIMA

In [None]:
# Load the dataset from a CSV file located next to the notebook.
data = pd.read_csv(filepath_or_buffer="AirPassengers.csv")

In [None]:
# EDA on the time series data: preview the first five rows.
data.head(n=5)

In [51]:
# Preview the last five rows.
data.tail(n=5)

Unnamed: 0,Month,#Passengers
139,1960-08,606
140,1960-09,508
141,1960-10,461
142,1960-11,390
143,1960-12,432


In [52]:
# Dimensions of the frame as (rows, columns).
data.shape

(144, 2)

In [53]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric data.
data.describe()

Unnamed: 0,#Passengers
count,144.0
mean,280.298611
std,119.966317
min,104.0
25%,180.0
50%,265.5
75%,360.5
max,622.0


In [54]:
# Column dtypes and non-null counts; at this point Month is still an object (string) column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Month        144 non-null    object
 1   #Passengers  144 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ KB


In [55]:
# Convert the Month strings to datetime values, then preview the result.
data['Month'] = data['Month'].pipe(pd.to_datetime)
data.head(n=5)

Unnamed: 0,Month,#Passengers
0,1949-01-01,112
1,1949-02-01,118
2,1949-03-01,132
3,1949-04-01,129
4,1949-05-01,121


In [None]:
# Re-check the column dtypes after the datetime conversion of Month.
data.info()

In [None]:
# Use Month as the index while keeping the Month column in place for now.
data = data.set_index('Month', drop=False)

data.head(n=5)

In [None]:
# Month now lives in the index, so the duplicate column is removed.
data = data.drop(columns='Month')

In [None]:
# Preview the frame after the index change.
data.head(n=5)

1. The Month column should have an appropriate (datetime) data type.
2. The Month column should be the index of the DataFrame.

In [None]:
# Line plot of the time series against its index.
data.plot(kind='line')

In [None]:
# Seasonal decomposition of the passenger counts (additive model, the library default).

from statsmodels.tsa.seasonal import seasonal_decompose

decompose_result = seasonal_decompose(
    data['#Passengers'].dropna(),
    model='additive',
)

decompose_result.plot()

In [None]:
# Working towards a stationary series: start with the rolling mean over a
# 12-row window (despite the variable name, no log transform is applied).

mean_log = data.rolling(12).mean()

In [None]:
# Overlay the rolling mean (red) on the original series (blue).
ax = plt.gca()
ax.plot(data, color='blue', label='Original')
ax.plot(mean_log, color='red', label='Rolling Mean')
ax.legend(loc='best')
ax.set_title('rolling mean vs original')

Before building a time series model, we need to make sure that the time series is stationary. If the time series is non-stationary, we need to make it stationary by differencing the data.

The number of times we difference the data is a parameter (`d`) used in ARIMA models.

Non-stationarity in a time series may appear for the following reasons:

- presence of a trend in the data
- presence of heteroscedasticity
- presence of autocorrelation

We can identify non-stationarity in a time series by performing a statistical test, the Augmented Dickey-Fuller (ADF) test:

- Null hypothesis: the time series is non-stationary.
- Alternative hypothesis: the time series is stationary.

In [None]:
# Augmented Dickey-Fuller test on the raw passenger series; element 1 of the
# result is printed as the p-value.

from statsmodels.tsa.stattools import adfuller
result = adfuller(data['#Passengers'])
p_value = result[1]
print(p_value)

In [None]:
# If the p-value is less than 0.05, we reject the null hypothesis and treat the time series as stationary.

In [None]:
# First difference: each value minus the previous one (the first row becomes NaN).
data['#Passengers'].diff(periods=1)

In [None]:
# First five passenger counts, for comparison with the differenced values.
data['#Passengers'].head(n=5)

In [None]:
# ADF test on the first-differenced series; print its p-value.
first_diff = data['#Passengers'].diff().dropna()
result = adfuller(first_diff)
print(result[1])

In [None]:
# ADF test on the twice-differenced series; print its p-value.
second_diff = data['#Passengers'].diff().diff().dropna()
result = adfuller(second_diff)
print(result[1])

In [None]:
# Line plot of the first-differenced series.
(
    data['#Passengers']
    .diff()
    .dropna()
    .plot(kind='line')
)

In [None]:
# Line plot of the twice-differenced series.
(
    data['#Passengers']
    .diff()
    .diff()
    .dropna()
    .plot(kind='line')
)

In [None]:
# Keep the first-differenced series as a one-column frame for the modelling steps.
new_data = data['#Passengers'].diff().dropna().to_frame()

In [None]:
# Rolling mean of the differenced series over a 12-row window.
mean_log = new_data.rolling(12).mean()

In [None]:
# Overlay the rolling mean (red) on the differenced series (blue).
ax = plt.gca()
ax.plot(new_data, color='blue', label='Original')
ax.plot(mean_log, color='red', label='Rolling Mean')
ax.legend(loc='best')
ax.set_title('rolling mean vs original')

In [None]:
# Seasonal decomposition of the differenced series (additive model, the library default).

from statsmodels.tsa.seasonal import seasonal_decompose

decompose_result = seasonal_decompose(
    new_data['#Passengers'].dropna(),
    model='additive',
)

decompose_result.plot()

In [None]:
# ACF and PACF plots are used to identify the order of an ARIMA model: they help to choose
# the parameters p and q (the PACF suggests p, the ACF suggests q).
# Always plot the ACF and PACF after making the time series stationary.

In [None]:
from statsmodels.tsa.stattools import acf
from statsmodels.tsa.stattools import pacf
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf

# Numeric autocorrelation / partial-autocorrelation values of the differenced
# series, kept available for inspection.
acf_plot = acf(new_data)
pacf_plot = pacf(new_data)

# plot_acf computes the autocorrelation itself, so it must receive the series,
# not the already-computed ACF values (that would plot the ACF of the ACF).
# The trailing semicolon stops the returned figure from being rendered twice.
plot_acf(new_data['#Passengers']);

In [None]:
# plot_pacf computes the partial autocorrelation itself, so pass the
# differenced series rather than precomputed PACF values.
# The trailing semicolon stops the returned figure from being rendered twice.
plot_pacf(new_data['#Passengers'], lags=10);

In [None]:
from statsmodels.tsa.arima.model import ARIMA

# Hold out everything after the first TRAIN_SIZE rows. The test slice starts
# exactly where the training slice ends so that no observation is dropped
# (starting the test slice at 121 would leave position 120 in neither set).
TRAIN_SIZE = 120
train = new_data.iloc[:TRAIN_SIZE]['#Passengers']
test = new_data.iloc[TRAIN_SIZE:]['#Passengers']

# new_data is already first-differenced, hence d = 0 here.
model = ARIMA(train, order=(1, 0, 2))  # p=1, d=0, q=2
model_fit = model.fit()

In [None]:
# Display the summary of the fitted ARIMA model.
model_fit.summary()

In [None]:
# Zero-based position obtained by adding the train and test lengths and subtracting one.
len(train)+len(test)-1

In [None]:
# Forecast the whole hold-out period: from the first position after the
# training window (len(train)) through the last row of new_data. Deriving both
# bounds from the data avoids skipping the first hold-out step or the final
# observation.
new_data['predict'] = model_fit.predict(
    start=len(train),
    end=len(new_data) - 1,
    dynamic=True,
)

# Show only the rows that received a forecast instead of dumping the full frame.
new_data.loc[new_data['predict'].notna(), ['#Passengers', 'predict']]

In [None]:
# Plot the differenced series together with the forecast column.
new_data.loc[:, ['#Passengers', 'predict']].plot(kind='line')

In [None]:
# The model does not take seasonality into account,
# so it is not a good fit for this data.
# Next, we try a seasonal model (SARIMAX).



In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX,SARIMAXResults

# Fit a SARIMAX model with a period-12 seasonal component on the training
# series; `model` ends up holding the fitted results object.
model = SARIMAX(train, seasonal_order=(1, 0, 2, 12)).fit()

In [None]:
# Forecast the whole hold-out period with the fitted SARIMAX model: from the
# first position after the training window through the last row of new_data.
# Deriving both bounds from the data avoids skipping the first hold-out step
# or the final observation.
new_data['predict'] = model.predict(
    start=len(train),
    end=len(new_data) - 1,
    dynamic=True,
)

new_data[['#Passengers', 'predict']].plot()

In [None]:
# Inspect the last 20 rows, including the forecast column.
new_data.iloc[-20:]

In [None]:
# Project the fitted SARIMAX model forward. The model was fitted on `train`,
# which is taken from the differenced frame, so the values are differences
# rather than passenger counts.
# NOTE(review): this was described as a 5-year projection, but 120 monthly
# steps is 10 years; the step count is kept unchanged — confirm the intended
# horizon (5 years would be 60 steps).
FORECAST_STEPS = 120
forecast = model.forecast(steps=FORECAST_STEPS)

# Draw both on the same, explicitly shared axes and label the forecast line.
ax = new_data.plot()
forecast.plot(ax=ax, label='forecast')
ax.legend();

In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,mean_absolute_percentage_error

# Score every row that has a forecast instead of a hard-coded positional
# window, so the metric covers the whole predicted hold-out range.
predict_df = new_data.dropna(subset=['#Passengers', 'predict'])
mean_squared_error(predict_df['#Passengers'], predict_df['predict'])

In [None]:
# Mean absolute percentage error between the actual and forecast values in predict_df.
# NOTE(review): the values compared here come from the differenced series, where
# actuals close to zero can inflate MAPE — consider scoring on the original scale.
mean_absolute_percentage_error(predict_df['#Passengers'],predict_df['predict'])