In [201]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pmdarima import auto_arima
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
from prophet import Prophet
import warnings
warnings.filterwarnings('ignore')

In [202]:
# Read the raw air-traffic table from the CSV file.
data = pd.read_csv("/content/AirtrafficA4.csv")

# Build the working frame with surrounding whitespace stripped from every
# column name. rename() returns a new frame, so `data` keeps the headers
# exactly as they appear in the file and re-running this cell is harmless.
df = pd.DataFrame(data).rename(columns=lambda x: x.strip())

# Only the last expression of a cell is rendered, so preview once, after the
# headers have been normalised.
df.head()

Unnamed: 0,AIRLINE,YEAR,MONTH,TOTAL DEPARTURES,HOURS FLOWN,KILOMETRE FLOWN,PASSENGERS CARRIED,AVAILABLE SEAT KILOMETRE (IN THOUSAND),FREIGHT CARRIED (IN TONNE),MAIL CARRIED (IN TONNE)
0,A007,2023,JAN,47977,83764,41827,6847384,7832254,16881.7,2043.5
1,A007,2023,FEB,44905,77936,39121,6741948,7336614,17439.3,2086.7
2,A007,2023,MAR,50389,87296,43793,7317288,8215681,20208.4,2310.1
3,A007,2023,APR,48752,84232,42615,7406440,8005648,19432.8,2102.9
4,A007,2023,MAY,50956,87917,44505,8109626,8375201,24165.1,2102.4


In [203]:
def clean_and_preprocess(df):
    """Drop sparse rows, forward-fill remaining gaps, and cast metrics to float.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``columns_to_clean`` below.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows that have at most two missing cells, with the
        remaining gaps forward-filled and the metric columns stored as float.

    Raises
    ------
    KeyError
        If one of the metric columns is missing from ``df``.
    ValueError
        If a metric cell cannot be parsed as a number once commas are removed.
    """
    # Count missing cells per row; rows with more than two are discarded.
    missing = df.isnull().sum(axis=1)
    df_clean = df[missing <= 2].copy()

    # DataFrame.ffill() is the supported spelling of fillna(method='ffill'),
    # which is deprecated in recent pandas releases.
    df_clean = df_clean.ffill()

    columns_to_clean = ['TOTAL DEPARTURES','HOURS FLOWN','KILOMETRE FLOWN','PASSENGERS CARRIED','AVAILABLE SEAT KILOMETRE (IN THOUSAND)', 'FREIGHT CARRIED (IN TONNE)','MAIL CARRIED (IN TONNE)']

    for col in columns_to_clean:
        # Remove thousands separators from text cells only. Cells that are
        # already numeric (or NaN) pass through untouched, so the function
        # also works when read_csv has parsed a column as numbers, where the
        # .str accessor would raise AttributeError.
        without_commas = df_clean[col].map(
            lambda value: value.replace(',', '') if isinstance(value, str) else value
        )
        df_clean[col] = without_commas.astype(float)

    return df_clean

In [204]:
# Clean the loaded frame: drop sparse rows, forward-fill, cast metrics to float.
df_cleaned = clean_and_preprocess(df)

# Only the final expression of a cell is rendered, so the cell ends on the
# per-column count of missing values still present after cleaning.
df_cleaned.isnull().sum()


Unnamed: 0,0
AIRLINE,0
YEAR,0
MONTH,0
TOTAL DEPARTURES,0
HOURS FLOWN,0
KILOMETRE FLOWN,0
PASSENGERS CARRIED,0
AVAILABLE SEAT KILOMETRE (IN THOUSAND),0
FREIGHT CARRIED (IN TONNE),0
MAIL CARRIED (IN TONNE),0


In [205]:
# Record each row's current index label so it can act as the tie-breaker
# inside a year when sorting.
df_cleaned['original_index'] = df_cleaned.index

# Order by year, keeping the existing row order within each year, then
# renumber the rows 0..n-1.
# NOTE(review): months are not sorted explicitly; chronological order within
# a year relies on the rows already being in month order - confirm.
sort_keys = ['YEAR', 'original_index']
df_sorted = df_cleaned.sort_values(by=sort_keys)
df_sorted = df_sorted.reset_index(drop=True)

# Overwrite 'original_index' with the row's position in the sorted frame.
df_sorted['original_index'] = df_sorted.index

In [206]:
# Question 1: choose SARIMA orders with a stepwise search, then refit the
# chosen orders with statsmodels' SARIMAX.
ntraining = df_sorted['PASSENGERS CARRIED']

# Search settings: seasonal model with period 12 and upper bounds on each order.
search_options = dict(
    seasonal=True,
    m=12,
    stepwise=True,
    trace=True,
    suppress_warnings=True,
    max_p=4,
    max_d=4,
    max_q=3,
    max_P=4,
    max_D=2,
    max_Q=4,
)
model = auto_arima(ntraining, **search_options)

# Only the (p, d, q) and seasonal orders are carried over to SARIMAX.
# NOTE(review): the search trace lists candidates "with intercept"; no trend
# term is passed here - confirm that this is intended.
sarima_model = SARIMAX(
    ntraining,
    order=model.order,
    seasonal_order=model.seasonal_order,
)
sarima_result = sarima_model.fit()


Performing stepwise search to minimize aic
 ARIMA(2,1,2)(1,0,1)[12] intercept   : AIC=3701.690, Time=2.09 sec
 ARIMA(0,1,0)(0,0,0)[12] intercept   : AIC=3713.865, Time=0.05 sec
 ARIMA(1,1,0)(1,0,0)[12] intercept   : AIC=3703.155, Time=0.27 sec
 ARIMA(0,1,1)(0,0,1)[12] intercept   : AIC=3699.204, Time=0.25 sec
 ARIMA(0,1,0)(0,0,0)[12]             : AIC=3712.789, Time=0.03 sec
 ARIMA(0,1,1)(0,0,0)[12] intercept   : AIC=3709.440, Time=0.21 sec
 ARIMA(0,1,1)(1,0,1)[12] intercept   : AIC=3701.174, Time=1.41 sec
 ARIMA(0,1,1)(0,0,2)[12] intercept   : AIC=3701.159, Time=0.28 sec
 ARIMA(0,1,1)(1,0,0)[12] intercept   : AIC=3700.246, Time=0.08 sec
 ARIMA(0,1,1)(1,0,2)[12] intercept   : AIC=3702.960, Time=0.98 sec
 ARIMA(0,1,0)(0,0,1)[12] intercept   : AIC=3704.573, Time=0.07 sec
 ARIMA(1,1,1)(0,0,1)[12] intercept   : AIC=3698.831, Time=0.18 sec
 ARIMA(1,1,1)(0,0,0)[12] intercept   : AIC=3709.870, Time=0.06 sec
 ARIMA(1,1,1)(1,0,1)[12] intercept   : AIC=3700.797, Time=0.32 sec
 ARIMA(1,1,1)(0,0,2

In [207]:
# Forecast the next 12 months from the fitted model (no exogenous inputs are
# passed to get_forecast).
FORECAST_STEPS = 12
forecast = sarima_result.get_forecast(steps=FORECAST_STEPS)

# Month-start dates for the forecast horizon.
# NOTE(review): the start date is hard-coded; presumably it is the month
# right after the last observation - confirm against the data.
future_dates = pd.date_range(start='2023-09-01', periods=FORECAST_STEPS, freq='MS')

# Create submission: one row per forecast month, labelled like "2023 SEP".
year_month_labels = future_dates.strftime('%Y %b').str.upper()
submission = pd.DataFrame({
    'YEAR_MONTH': year_month_labels,
    'PASSENGERS CARRIED': forecast.predicted_mean,
})
submission.to_csv('submission.csv', index=False)

# Print diagnostics
print("\nModel Summary:", sarima_result.summary(), sep="\n")
print("\nForecast Preview:", submission.head(), sep="\n")


Model Summary:
                                      SARIMAX Results                                       
Dep. Variable:                   PASSENGERS CARRIED   No. Observations:                  127
Model:             SARIMAX(1, 1, 2)x(0, 0, [1], 12)   Log Likelihood               -1842.775
Date:                              Mon, 28 Oct 2024   AIC                           3695.550
Time:                                      17:43:10   BIC                           3709.732
Sample:                                           0   HQIC                          3701.312
                                              - 127                                         
Covariance Type:                                opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.5896      0.209      2.828      0.005       0.181       0.998
ma.