In [None]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.holtwinters import SimpleExpSmoothing, ExponentialSmoothing
from sklearn.metrics import mean_absolute_error, mean_squared_error
from google.colab import drive
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive so files stored there are reachable from the notebook
drive.mount('/content/drive')
# Read the Canada airport CSV from Drive and preview the first rows
df = pd.read_csv(
    '/content/drive/MyDrive/Forecasting/Canada Airport Data.csv'
)
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,Merged,Value
0,2022-01-01,79100
1,2022-01-02,72224
2,2022-01-03,70459
3,2022-01-04,64980
4,2022-01-05,62229


The data covers 01 Jan 2022 to 07 Jul 2024 (919 daily rows).

In [None]:
#define functions
def conversion(data):
    """Normalise the raw frame in place and return it.

    Parses the ``Merged`` column as datetimes, converts ``Value`` to integers
    (stripping thousands separators when the values are text) and renames
    ``Merged`` to ``Dates``.

    The frame is modified in place and the very same object is returned, so
    the caller's original variable and the returned one refer to a single
    DataFrame.

    Args:
        data: DataFrame with a ``Merged`` date column and a ``Value`` column.

    Returns:
        The same DataFrame, now with ``Dates`` and ``Value`` columns.
    """
    data['Merged'] = pd.to_datetime(data['Merged'])
    values = data['Value']
    # Only text needs comma stripping; the .str accessor raises on a column
    # that pandas has already parsed as numeric.
    if not pd.api.types.is_numeric_dtype(values):
        values = values.astype(str).str.replace(',', '', regex=False)
    data['Value'] = values.astype(int)
    # Rename by name rather than by position so column order cannot mislabel data.
    data.rename(columns={'Merged': 'Dates'}, inplace=True)
    return data


In [None]:
  def mean_absolute_percentage_error(y_true, y_pred):
      """Return the mean absolute percentage error (MAPE), in percent.

      Both inputs are flattened to 1-D before the element-wise comparison.
      Without that, a single-column DataFrame of shape (n, 1) paired with a
      Series of shape (n,) broadcasts to an (n, n) matrix and the mean is
      taken over every actual/forecast pairing instead of matching rows.

      Args:
          y_true: Observed values (list, array, Series or one-column DataFrame).
          y_pred: Predicted values, matched to ``y_true`` by position.

      Returns:
          mean(|y_true - y_pred| / |y_true|) * 100 as a float. Zeros in
          ``y_true`` produce inf/nan because of the division.
      """
      y_true = np.asarray(y_true, dtype=float).ravel()
      y_pred = np.asarray(y_pred, dtype=float).ravel()
      return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [None]:
#convert data
# conversion() modifies `df` in place and returns that same object, so from
# here on `df_converted` and `df` are two names for one DataFrame.
df_converted=conversion(df)

In [None]:
# In-place on purpose: `df` is the same object as `df_converted`, and later
# cells use `df.index` as the date axis and align fitted values on it.
df_converted.set_index('Dates',inplace=True)

In [None]:
# Quick look at the converted frame, now indexed by 'Dates'
df_converted.head()

Unnamed: 0_level_0,Value
Dates,Unnamed: 1_level_1
2022-01-01,79100
2022-01-02,72224
2022-01-03,70459
2022-01-04,64980
2022-01-05,62229


In [None]:
# Train/test split: rows dated on or before the cut-off form the training set,
# everything after it is held out for testing. Copies keep both frames
# independent of df_converted.
split_date = '2024-01-01'
df_train = df_converted[df_converted.index <= split_date].copy()
df_test = df_converted[df_converted.index > split_date].copy()

In [None]:
# Row/column counts of the training and test frames
df_train.shape, df_test.shape

((731, 1), (188, 1))

In [None]:

# Inspect both sides of the split boundary and the end of the test period
df_train.tail(5), df_test.head(5), df_test.tail(5)

(             Value
 Dates             
 2023-12-28  163118
 2023-12-29  147758
 2023-12-30  154908
 2023-12-31  159527
 2024-01-01  154908,
              Value
 Dates             
 2024-01-02  159527
 2024-01-03  158227
 2024-01-04  154058
 2024-01-05  155036
 2024-01-06  149747,
              Value
 Dates             
 2024-07-03  181662
 2024-07-04  177131
 2024-07-05  181447
 2024-07-06  178767
 2024-07-07  195630)

In [None]:
# Simple Exponential Smoothing on the training data, with the smoothing level
# supplied directly (optimized=False) rather than estimated
model_linear = SimpleExpSmoothing(df_train)
fit_linear = model_linear.fit(optimized=False, smoothing_level=0.6)
# Keep the fitted values next to the observed series for plotting
df['SES'] = fit_linear.fittedvalues
fig = px.line(
    df,
    x=df.index,
    y=['Value', 'SES'],
    title="Simple Exponential Smoothening",
)
fig.update_xaxes(rangeslider_visible=True)
fig.show()

In [None]:
# Project the SES fit forward, one step per row of the test set
forecast_linear = fit_linear.forecast(len(df_test))
forecast_linear.sample(n=10)

2024-03-07    156005.453549
2024-06-23    156005.453549
2024-01-30    156005.453549
2024-05-28    156005.453549
2024-01-17    156005.453549
2024-04-24    156005.453549
2024-01-07    156005.453549
2024-06-10    156005.453549
2024-06-28    156005.453549
2024-02-19    156005.453549
dtype: float64

SES is not suitable for forecasting here: it produces a flat forecast, which is expected because the data has a trend. Try Holt's linear trend model next.

In [None]:
# Holt's linear trend model: exponential smoothing with an additive trend,
# fitted on the training data with smoothing_level passed as 0.6
model_holts_linear = ExponentialSmoothing(df_train, trend='add')
fit_holts_linear = model_holts_linear.fit(smoothing_level=0.6)
# Keep the fitted values next to the observed series for plotting
df['Holt'] = fit_holts_linear.fittedvalues
fig = px.line(
    df,
    x=df.index,
    y=['Value', 'Holt'],
    title="Holt's Linear Trend Model",
)
fig.update_xaxes(rangeslider_visible=True)
fig.show()

In [None]:
# Project Holt's linear fit forward, one step per row of the test set
forecast_holts_linear = fit_holts_linear.forecast(len(df_test))
forecast_holts_linear.sample(n=10)

2024-05-25    175510.225495
2024-04-21    170962.001556
2024-06-16    178453.193926
2024-02-17    162400.638848
2024-03-01    164139.665648
2024-04-30    172165.943187
2024-01-26    159457.670417
2024-02-20    162801.952725
2024-06-03    176714.167126
2024-02-27    163738.351771
dtype: float64

In [None]:
# Forecast over the test horizon
forecast_holts_linear = fit_holts_linear.forecast(steps=len(df_test))
# Score against the observed 'Value' column only. Selecting the column keeps
# this cell re-runnable after forecast/error columns have been added to
# df_test, and hands the metrics a 1-D series so the MAPE is computed row by
# row instead of an (n, 1) frame being broadcast against an (n,) forecast.
actual_holts_linear = df_test['Value']
mae_holts_linear = mean_absolute_error(actual_holts_linear, forecast_holts_linear)
mse_holts_linear = mean_squared_error(actual_holts_linear, forecast_holts_linear)
rmse_holts_linear = np.sqrt(mse_holts_linear)
mape_holts_linear = mean_absolute_percentage_error(actual_holts_linear, forecast_holts_linear)
print(f'Mean Absolute Error: {mae_holts_linear:.2f}')
print(f'Mean Squared Error: {mse_holts_linear:.2f}')
print(f'Root Mean Squared Error: {rmse_holts_linear:.2f}')
print(f'Mean Absolute Percentage Error: {mape_holts_linear:.2f}%')


Mean Absolute Error: 19084.82
Mean Squared Error: 473203331.19
Root Mean Squared Error: 21753.24
Mean Absolute Percentage Error: 14.87%


In [None]:
# Attach the forecast to the test frame so further accuracy metrics can be
# computed column-wise. This adds a column to df_test in place.
df_test['Forecast Holt Linear'] = forecast_holts_linear
df_test.head()

Unnamed: 0_level_0,Value,Forecast Holt Linear
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-01-02,159527,156247.159401
2024-01-03,158227,156380.930693
2024-01-04,154058,156514.701986
2024-01-05,155036,156648.473278
2024-01-06,149747,156782.24457


In [None]:
# Overlay the Holt forecast on the observed test values
fig = px.line(
    df_test,
    x=df_test.index,
    y=['Value', 'Forecast Holt Linear'],
    title="Actual vs Forecasted(Holt's Linear)",
)
fig.update_xaxes(rangeslider_visible=True)
fig.show()

In [None]:
# Signed error per day: actual minus forecast (positive = forecast too low)
df_test['Forecast Error Holt Linear'] = df_test['Value'].sub(df_test['Forecast Holt Linear'])
# Mean Forecast Error (MFE) and the mean of the actuals it is scaled by
mfe = df_test['Forecast Error Holt Linear'].mean()
mean_actual = df_test['Value'].mean()
# Bias expressed as a percentage of the average actual value
forecast_bias_percentage = (mfe / mean_actual) * 100
print(
    f'Mean Forecast Error (MFE): {mfe:.2f}',
    f'Mean of Actual Values: {mean_actual:.2f}',
    f'Forecast Bias Percentage:: {forecast_bias_percentage:.2f}%',
    sep='\n',
)
# Interpretation: the sign of the bias tells the direction of the miss
if forecast_bias_percentage < 0:
    print("The forecasts are biased towards overestimating the actual values.")
elif forecast_bias_percentage > 0:
    print("The forecasts are biased towards underestimating the actual values.")
else:
    print("The forecasts are unbiased on average.")

Mean Forecast Error (MFE): -17958.07
Mean of Actual Values: 150796.70
Forecast Bias Percentage:: -11.91%
The forecasts are biased towards overestimating the actual values.


Clearly, this is not a good fit for our data. Let's add seasonality to the model.

In [None]:
# Holt-Winters: additive trend plus additive seasonality with a
# 365-observation season, fitted on the training data
model_seasonal = ExponentialSmoothing(
    df_train,
    trend='add',
    seasonal='add',
    seasonal_periods=365,
)
fit_seasonal = model_seasonal.fit(smoothing_level=0.7)
# Keep the fitted values next to the observed series and plot both
df['Holt-Winters'] = fit_seasonal.fittedvalues
fig = px.line(
    df,
    x=df.index,
    y=['Value', 'Holt-Winters'],
    title="Holt-Winters Seasonal Model",
)
fig.update_xaxes(rangeslider_visible=True)
fig.show()

In [None]:

# Forecast one step per test row, then wrap the values in a one-column frame
# that carries the test set's index
forecast_seasonal = fit_seasonal.forecast(steps=len(df_test))
df_seasonal = pd.DataFrame(
    {'Forecast_seasonal': np.asarray(forecast_seasonal)},
    index=df_test.index,
)
df_seasonal.head()

Unnamed: 0_level_0,Forecast_seasonal
Dates,Unnamed: 1_level_1
2024-01-02,146227.246098
2024-01-03,143698.011951
2024-01-04,141058.993741
2024-01-05,140904.93671
2024-01-06,134902.897174


In [None]:
# Place the seasonal forecast beside the test-period columns. df_seasonal was
# given df_test's index, so the column-wise concat lines rows up by date.
combined_df = pd.concat([df_test, df_seasonal], axis=1)
combined_df.head()

Unnamed: 0_level_0,Value,Forecast Holt Linear,Forecast Error Holt Linear,Forecast_seasonal
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-01-02,159527,156247.159401,3279.840599,146227.246098
2024-01-03,158227,156380.930693,1846.069307,143698.011951
2024-01-04,154058,156514.701986,-2456.701986,141058.993741
2024-01-05,155036,156648.473278,-1612.473278,140904.93671
2024-01-06,149747,156782.24457,-7035.24457,134902.897174


In [None]:
# Overlay the seasonal forecast on the observed test values
fig = px.line(
    combined_df,
    x=combined_df.index,
    y=['Value', 'Forecast_seasonal'],
    title="Actual vs Forecasted(Holt's Seasonal)",
)
fig.update_xaxes(rangeslider_visible=True)
fig.show()

In [None]:
# Accuracy of the seasonal forecast against the observed test values
mae_seasonal = mean_absolute_error(
    combined_df['Value'], combined_df['Forecast_seasonal']
)
mse_seasonal = mean_squared_error(
    combined_df['Value'], combined_df['Forecast_seasonal']
)
rmse_seasonal = np.sqrt(mse_seasonal)
mape_seasonal = mean_absolute_percentage_error(
    combined_df['Value'], combined_df['Forecast_seasonal']
)
print(
    f'Mean Absolute Error: {mae_seasonal:.2f}',
    f'Mean Squared Error: {mse_seasonal:.2f}',
    f'Root Mean Squared Error: {rmse_seasonal:.2f}',
    f'Mean Absolute Percentage Error: {mape_seasonal:.2f}%',
    sep='\n',
)


Mean Absolute Error: 14295.19
Mean Squared Error: 297301966.08
Root Mean Squared Error: 17242.45
Mean Absolute Percentage Error: 9.37%


In [None]:
# Signed error per day: actual minus forecast (positive = forecast too low)
combined_df['Forecast Error Holt Seasonal'] = combined_df['Value'].sub(combined_df['Forecast_seasonal'])
# Mean Forecast Error (MFE) and the mean of the actuals it is scaled by
mfe = combined_df['Forecast Error Holt Seasonal'].mean()
mean_actual = combined_df['Value'].mean()
# Bias expressed as a percentage of the average actual value
forecast_bias_percentage = (mfe / mean_actual) * 100
# Report the results
print(
    f'Mean Forecast Error (MFE): {mfe:.2f}',
    f'Mean of Actual Values: {mean_actual:.2f}',
    f'Forecast Bias Percentage:: {forecast_bias_percentage:.2f}%',
    sep='\n',
)
# Interpretation: the sign of the bias tells the direction of the miss
if forecast_bias_percentage < 0:
    print("The forecasts are biased towards overestimating the actual values.")
elif forecast_bias_percentage > 0:
    print("The forecasts are biased towards underestimating the actual values.")
else:
    print("The forecasts are unbiased on average.")

Mean Forecast Error (MFE): 13266.98
Mean of Actual Values: 150796.70
Forecast Bias Percentage:: 8.80%
The forecasts are biased towards underestimating the actual values.


In [None]:
# Compare the last training day with the first test day. Two separate
# statements are used because a tuple of print() calls makes the cell display
# a stray "(None, None)" as its result.
print(df[df.index=='2024-01-01'])
print(df[df.index=='2024-01-02'])

             Value            SES           Holt   Holt-Winters
Dates                                                          
2024-01-01  154908  157651.633872  157921.470272  166765.193097
             Value  SES  Holt  Holt-Winters
Dates                                      
2024-01-02  159527  NaN   NaN           NaN


(None, None)

In [None]:
# forecast for the rest of the year
FORECAST_END = pd.Timestamp('2024-12-31')
# Derive the horizon from the dates instead of a hand-counted constant: one
# step per day from the day after the last training observation through
# FORECAST_END. Assumes the training data is daily with no gaps, so that one
# forecast step equals one calendar day.
horizon_days = (FORECAST_END - df_train.index.max()).days
# Round before casting: astype(int) alone truncates (141058.99 -> 141058).
forecast_final = fit_seasonal.forecast(steps=horizon_days).round().astype(int)
df_final = (
    forecast_final
    .rename('Forecast')
    .rename_axis('Dates')
    .reset_index()
)
df_final.head(), df_final.tail()



(       Dates  Forecast
 0 2024-01-02    146227
 1 2024-01-03    143698
 2 2024-01-04    141058
 3 2024-01-05    140904
 4 2024-01-06    134902,
          Dates  Forecast
 360 2024-12-27    136703
 361 2024-12-28    145685
 362 2024-12-29    155508
 363 2024-12-30    142852
 364 2024-12-31    154245)

In [None]:

# df_converted is the same object as `df`, so it also carries the SES, Holt
# and Holt-Winters columns that earlier cells assigned through `df`.
df_converted.head()



Unnamed: 0_level_0,Value,SES,Holt,Holt-Winters
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-01-01,79100,79100.0,78754.4,131425.512678
2022-01-02,72224,79100.0,75210.508639,81534.247128
2022-01-03,70459,74974.4,69633.805962,72283.114475
2022-01-04,64980,72265.16,66353.393932,68388.717676
2022-01-05,62229,67894.064,61738.402392,65829.088672


In [None]:
# Move the date index back into a regular 'Dates' column so it can serve as
# the merge key. The guard keeps the cell safe to re-run: a second in-place
# reset would push the integer index out as an extra 'index' column.
if 'Dates' not in df_converted.columns:
    # Name the index first so reset_index() always yields a 'Dates' column.
    df_converted.index.name = 'Dates'
    df_converted.reset_index(inplace=True)
df_converted.head()

Unnamed: 0,Dates,Value,SES,Holt,Holt-Winters
0,2022-01-01,79100,79100.0,78754.4,131425.512678
1,2022-01-02,72224,79100.0,75210.508639,81534.247128
2,2022-01-03,70459,74974.4,69633.805962,72283.114475
3,2022-01-04,64980,72265.16,66353.393932,68388.717676
4,2022-01-05,62229,67894.064,61738.402392,65829.088672


In [None]:
# Outer-join history and forecast on the date so that rows present in only one
# of the two frames are kept, with NaN in the columns they lack
combined_df = pd.merge(
    df_converted,
    df_final[['Dates', 'Forecast']],
    how='outer',
    on='Dates',
)
combined_df.head()

Unnamed: 0,Dates,Value,SES,Holt,Holt-Winters,Forecast
0,2022-01-01,79100.0,79100.0,78754.4,131425.512678,
1,2022-01-02,72224.0,79100.0,75210.508639,81534.247128,
2,2022-01-03,70459.0,74974.4,69633.805962,72283.114475,
3,2022-01-04,64980.0,72265.16,66353.393932,68388.717676,
4,2022-01-05,62229.0,67894.064,61738.402392,65829.088672,


In [None]:
# Keep only the columns needed for the final comparison, then print the rows
# strictly between 2024-01-01 and 2024-08-07
combined_df = combined_df[['Dates', 'Value', 'Forecast']]
print(
    combined_df[
        (combined_df['Dates'] > '2024-01-01')
        & (combined_df['Dates'] < '2024-08-07')
    ]
)

         Dates     Value  Forecast
731 2024-01-02  159527.0  146227.0
732 2024-01-03  158227.0  143698.0
733 2024-01-04  154058.0  141058.0
734 2024-01-05  155036.0  140904.0
735 2024-01-06  149747.0  134902.0
..         ...       ...       ...
944 2024-08-02       NaN  164165.0
945 2024-08-03       NaN  174964.0
946 2024-08-04       NaN  172659.0
947 2024-08-05       NaN  156250.0
948 2024-08-06       NaN  159408.0

[218 rows x 3 columns]


In [None]:
# Plot the observed values and the forecast on one date axis
fig = px.line(
    combined_df,
    x='Dates',
    y=['Value', 'Forecast'],
)
fig.update_xaxes(rangeslider_visible=True)
fig.show()

In [None]:
# Keep rows dated before 2025-01-01. The explicit copy makes this an
# independent frame, so adding columns to it later does not write into a
# slice of combined_df.
Combined_final_forecasted = combined_df[combined_df['Dates'] < '2025-01-01'].copy()
Combined_final_forecasted.head(), Combined_final_forecasted.tail()

(       Dates    Value  Forecast
 0 2022-01-01  79100.0       NaN
 1 2022-01-02  72224.0       NaN
 2 2022-01-03  70459.0       NaN
 3 2022-01-04  64980.0       NaN
 4 2022-01-05  62229.0       NaN,
           Dates  Value  Forecast
 1091 2024-12-27    NaN  136703.0
 1092 2024-12-28    NaN  145685.0
 1093 2024-12-29    NaN  155508.0
 1094 2024-12-30    NaN  142852.0
 1095 2024-12-31    NaN  154245.0)

In [None]:
# Store the counts as whole numbers using the nullable Int64 dtype. A plain
# int cast cannot hold the missing values: assigning dropna().astype(int) back
# re-aligns on the index and restores the NaNs (and float dtype), and a cast
# whose result is not assigned changes nothing.
# assign() builds a new frame instead of writing into a filtered slice.
Combined_final_forecasted = Combined_final_forecasted.assign(
    Actual=Combined_final_forecasted['Value'].round().astype('Int64'),
    Forecast=Combined_final_forecasted['Forecast'].round().astype('Int64'),
)
Combined_final_forecasted.to_csv('/content/drive/MyDrive/Forecasting/Exponential_smoothening_forecast.csv', index=False)




*  **Accuracy**:

MAE and RMSE: Both metrics are relatively large, indicating significant errors in absolute terms. The RMSE being larger than MAE is expected because RMSE penalizes larger errors more.

MAPE: A MAPE of 9.37% suggests moderate accuracy in percentage terms. This means that, on average, the forecasted values are off by about 9.37% from the actual values.

*  **Systematic Bias**:

MFE and Forecast Bias Percentage: The positive MFE and a forecast bias percentage of 8.80% indicate that the model tends to underestimate the actual values by nearly 8.8% on average. This is a systematic bias that might need correction.

*  **Model Performance**:

The model has a moderate level of accuracy with a noticeable systematic underestimation bias. The relatively high MAE and RMSE suggest that the errors are significant, which could be problematic depending on the application.


