In [25]:
# read csv file
import pandas as pd
# load the raw table and convert the 'Month' strings to datetimes in one chained expression
data = (
    pd.read_csv('AirPassengers.csv')
    .assign(Month=lambda frame: pd.to_datetime(frame['Month']))
)
# preview the first five rows
data.head()

Unnamed: 0,Month,#Passengers
0,1949-01-01,112
1,1949-02-01,118
2,1949-03-01,132
3,1949-04-01,129
4,1949-05-01,121


In [26]:
import datetime as dt

In [27]:
# 'Mois' (French for "month"): calendar month number of each observation
month_numbers = data['Month'].dt.month
data['Mois'] = month_numbers

In [28]:
# show the frame with the new 'Mois' column (the rendered output elides the middle rows)
data

Unnamed: 0,Month,#Passengers,Mois
0,1949-01-01,112,1
1,1949-02-01,118,2
2,1949-03-01,132,3
3,1949-04-01,129,4
4,1949-05-01,121,5
...,...,...,...
139,1960-08-01,606,8
140,1960-09-01,508,9
141,1960-10-01,461,10
142,1960-11-01,390,11


In [29]:
# create 12-month and 10-month moving averages of the passenger counts
passengers = data['#Passengers']
data['MA12'] = passengers.rolling(window=12).mean()
data['MA10'] = passengers.rolling(window=10).mean()

# plot the raw series together with both moving averages
import plotly.express as px
fig = px.line(
    data,
    x="Month",
    y=["#Passengers", "MA12", "MA10"],
    template='plotly_white',
)
fig.show()

In [30]:
import numpy as np

In [31]:
# extract month and year from dates
# Use the vectorized .dt accessor (as the 'Mois' column already does) instead of
# looping over every timestamp in Python; the resulting values are the same.
data['Month_'] = data['Month'].dt.month
data['Year'] = data['Month'].dt.year

In [32]:
# running 1-based counter, one value per row (1, 2, ..., number of rows)
n_rows = len(data)
data['Series'] = np.arange(1, n_rows + 1)

In [33]:
# keep only the modelling columns, in a fixed order
# Selecting the wanted columns directly already discards everything else
# ('Month', 'Mois', 'MA12', 'MA10'), so no separate in-place drop is needed.
# Without the in-place drop the cell can also be re-run safely: the old version
# raised KeyError on a second run because 'Month' and 'MA12' were already gone.
data = data[['Series', 'Year', 'Month_', '#Passengers']]

In [34]:
# preview the first five rows of the reduced frame
data.head(5)

Unnamed: 0,Series,Year,Month_,#Passengers
0,1,1949,1,112
1,2,1949,2,118
2,3,1949,3,132
3,4,1949,4,129
4,5,1949,5,121


In [35]:
# summarize the reduced frame: row count, column dtypes and non-null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Series       144 non-null    int32
 1   Year         144 non-null    int64
 2   Month_       144 non-null    int64
 3   #Passengers  144 non-null    int64
dtypes: int32(1), int64(3)
memory usage: 4.1 KB


In [36]:
# hold out the year 1960 as the test set; all earlier years form the training set
year = data['Year']
train = data.loc[year < 1960]
test = data.loc[year >= 1960]
# (rows, columns) of each split
train.shape, test.shape

((132, 4), (12, 4))

In [37]:
# import the regression module
from pycaret.regression import *
# initialize setup
# All experiment settings are collected in one mapping and handed to setup() as
# keyword arguments.
setup_options = dict(
    data=train,
    test_data=test,
    target='#Passengers',
    fold_strategy='timeseries',
    fold=3,
    # only 'Year' and 'Series' are declared numeric; the setup summary reports the
    # remaining feature ('Month_') as the single categorical feature
    numeric_features=['Year', 'Series'],
    transform_target=True,
    session_id=123,
)
s = setup(**setup_options)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,#Passengers
2,Original Data,"(132, 4)"
3,Missing Values,False
4,Numeric Features,2
5,Categorical Features,1
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(132, 13)"


In [18]:
# compare the candidate regressors ranked by MAE; `best` holds the top-ranked model
# (Least Angle Regression in the recorded output)
best = compare_models(sort = 'MAE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lar,Least Angle Regression,22.398,923.8651,28.2855,0.5621,0.0878,0.0746,0.02
lr,Linear Regression,22.3981,923.8749,28.2856,0.5621,0.0878,0.0746,1.2533
huber,Huber Regressor,22.4274,892.3078,27.9491,0.5981,0.088,0.0749,0.0433
br,Bayesian Ridge,22.4783,932.2165,28.5483,0.5611,0.0884,0.0746,0.0233
ridge,Ridge Regression,23.1976,1003.9426,30.041,0.5258,0.0933,0.0764,0.88
lasso,Lasso Regression,38.4188,2413.5108,46.8468,0.0882,0.1473,0.1241,1.0833
en,Elastic Net,40.6486,2618.8761,49.4048,-0.0824,0.1563,0.1349,0.0133
omp,Orthogonal Matching Pursuit,44.3054,3048.2658,53.8613,-0.4499,0.1713,0.152,0.0167
gbr,Gradient Boosting Regressor,50.1217,4032.0567,61.2306,-0.6189,0.2034,0.1538,0.03
rf,Random Forest Regressor,52.3637,4647.0635,65.2883,-0.7726,0.2131,0.1578,0.11


In [19]:
# score `best` without passing new data
# NOTE(review): per the pycaret docs, predict_model with no `data` argument
# evaluates on the hold-out set given to setup() — confirm for the installed version
prediction_holdout = predict_model(best)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Least Angle Regression,25.0714,972.2733,31.1813,0.8245,0.0692,0.0571


In [20]:
# generate predictions for every row of `data` (training years and the 1960 test year alike);
# later cells read the model output from the 'Label' column of the result
predictions = predict_model(best, data=data)

In [21]:
# add a date column in the dataset
# Rebuild each row's month-start date from its own 'Year' and 'Month_' values instead
# of a hard-coded date range, so the dates cannot drift out of step with the rows
# (a fixed range must match the row count and order exactly).
predictions['Date'] = pd.to_datetime(
    pd.DataFrame({
        'year': predictions['Year'].astype(int),
        'month': predictions['Month_'].astype(int),
        'day': 1,  # every observation is stamped on the first day of its month
    })
)

In [22]:
# line plot of the actual passenger counts against the model output ('Label') over time
plotted_columns = ["#Passengers", "Label"]
fig = px.line(
    predictions,
    x='Date',
    y=plotted_columns,
    template='plotly_white',
)

In [23]:
# add a vertical rectangle that shades the test-set period (1960)
fig.add_vrect(
    x0="1960-01-01",
    x1="1960-12-01",
    fillcolor="grey",
    opacity=0.25,
    line_width=0,
)
fig.show()

In [31]:
# produce the model used for forecasting beyond the available data
# NOTE(review): per the pycaret docs, finalize_model refits on the full dataset
# including the hold-out — confirm for the installed version
final_best = finalize_model(best)

In [34]:
# months to forecast: one row per month start from 1961-01 through 1965-01 (inclusive)
future_dates = pd.date_range(start = '1961-01-01', end = '1965-01-01', freq = 'MS')

# continue the running 'Series' counter right after the last historical observation
# instead of hard-coding the next value
first_future_step = int(data['Series'].max()) + 1

# same feature columns the model was trained on, built in one step from the
# DatetimeIndex attributes rather than per-date Python loops
future_df = pd.DataFrame({
    'Month_': future_dates.month,
    'Year': future_dates.year,
    'Series': np.arange(first_future_step, first_future_step + len(future_dates)),
})
future_df.head()

Unnamed: 0,Month_,Year,Series
0,1,1961,145
1,2,1961,146
2,3,1961,147
3,4,1961,148
4,5,1961,149


In [35]:
# score the future feature rows with the finalized model; the output keeps the
# input columns and adds the model output as 'Label'
predictions_future = predict_model(final_best, data=future_df)
# preview the first five forecast rows
predictions_future.head()

Unnamed: 0,Month_,Year,Series,Label
0,1,1961,145,486.278268
1,2,1961,146,482.208187
2,3,1961,147,550.485967
3,4,1961,148,535.187177
4,5,1961,149,538.923789


In [39]:
# stack the historical rows on top of the forecast rows and index the result by month start
concat_df_i = pd.date_range(start='1949-01-01', end='1965-01-01', freq='MS')
concat_df = pd.concat([data, predictions_future], axis=0).set_index(concat_df_i)

# '#Passengers' exists only for the historical rows and 'Label' only for the forecast
# rows, so the two lines cover the past and the future respectively
fig = px.line(
    concat_df,
    x=concat_df.index,
    y=["#Passengers", "Label"],
    template='plotly_white',
)
fig.show()