In [4]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error

In [2]:
# Read the preprocessed HCHO readings from disk into a DataFrame.
# The displayed preview includes an 'Unnamed: 0' column, which looks like a
# row index that was saved into the CSV — NOTE(review): confirm before dropping.
csv_path = 'preprocessed_dataset/hcho_data.csv'
hcho_data = pd.read_csv(csv_path)

# Preview the first rows as the cell output.
hcho_data.head()

Unnamed: 0,HCHO Reading,Location,Current Date,Next Date
0,0.000197,Colombo Proper,2019-01-01,2019-01-02
1,0.000197,"Deniyaya, Matara",2019-01-01,2019-01-02
2,0.000263,Nuwara Eliya Proper,2019-01-01,2019-01-02
3,0.000263,Colombo Proper,2019-01-02,2019-01-03
4,6e-06,"Deniyaya, Matara",2019-01-02,2019-01-03


In [3]:
# Parse the 'Current Date' column into pandas datetime values and store the
# parsed column back under the same name.
current_dates = pd.to_datetime(hcho_data['Current Date'])
hcho_data['Current Date'] = current_dates

### ARIMA

In [12]:
import itertools
import warnings
from statsmodels.tsa.arima.model import ARIMA

# Suppress warnings for the rest of the session; the grid below fits many
# models and would otherwise flood the cell output.
warnings.filterwarnings('ignore')

# ARIMA model parameters: every (p, d, q) combination from these ranges is tried.
# NOTE(review): AIC is compared across different differencing orders `d` below;
# likelihoods of differently-differenced series are generally not comparable,
# so consider fixing `d` separately (e.g. with a stationarity test) — confirm.
p = range(0, 6)
d = range(0, 5)
q = range(0, 6)
pdq = itertools.product(p, d, q)

best_aic = float('inf')
best_order = None

# Convert the 'Current Date' column to datetime format and set it as the index.
# Guarded so the cell can be re-run: once the column has been moved into the
# index it no longer exists as a column, and converting it again would raise.
if 'Current Date' in hcho_data.columns:
    hcho_data['Current Date'] = pd.to_datetime(hcho_data['Current Date'])
    hcho_data = hcho_data.set_index('Current Date')
elif hcho_data.index.name != 'Current Date':
    raise KeyError("'Current Date' is neither a column nor the index of hcho_data")

# Assuming we're only interested in one location, filter the data if necessary
# For example, if focusing on "Colombo Proper":
# hcho_data = hcho_data[hcho_data['Location'] == 'Colombo Proper']
# NOTE(review): the preview of the loaded file shows several locations sharing
# the same dates. With the filter above disabled, the model is fitted on the
# interleaved rows of all locations — confirm which location is intended.

# Define the training and testing sets (first 60% of rows / remaining 40%,
# split by row position).
train_size = int(len(hcho_data) * 0.6)
train_data = hcho_data[:train_size]
test_data = hcho_data[train_size:]

# Iterate over various combinations of p, d, and q to find the best ARIMA model
for order in pdq:
    try:
        model = ARIMA(train_data['HCHO Reading'], order=order)
        results = model.fit()
        aic = results.aic
    except Exception as err:
        # Best-effort search: a failing order is reported and skipped instead of
        # being silently dropped. KeyboardInterrupt is deliberately not caught,
        # so the long-running loop can still be stopped from the notebook.
        print(f'ARIMA{order} - skipped ({type(err).__name__}: {err})')
        continue

    print(f'ARIMA{order} - AIC: {aic:.2f}')

    # A NaN AIC marks a fit that did not produce a usable likelihood; it must
    # never be selected as the best model.
    if pd.notna(aic) and aic < best_aic:
        best_aic = aic
        best_order = order

print(f'Best ARIMA model: ARIMA{best_order} - AIC: {best_aic:.2f}')


ARIMA(0, 0, 0) - AIC: -16880.45
ARIMA(0, 0, 1) - AIC: -17030.81
ARIMA(0, 0, 2) - AIC: -17054.93
ARIMA(0, 0, 3) - AIC: -17094.15
ARIMA(0, 0, 4) - AIC: -17120.30
ARIMA(0, 0, 5) - AIC: -17134.33
ARIMA(0, 1, 0) - AIC: -16691.36
ARIMA(0, 1, 1) - AIC: nan
ARIMA(0, 1, 2) - AIC: -17112.23
ARIMA(0, 1, 3) - AIC: nan
ARIMA(0, 1, 4) - AIC: -17117.36
ARIMA(0, 1, 5) - AIC: -17105.77
ARIMA(0, 2, 0) - AIC: -15544.84
ARIMA(0, 2, 1) - AIC: -15538.84
ARIMA(0, 2, 2) - AIC: -15540.84
ARIMA(0, 2, 3) - AIC: -16860.40
ARIMA(0, 2, 4) - AIC: -15536.26
ARIMA(0, 2, 5) - AIC: -15532.90
ARIMA(0, 3, 0) - AIC: -14251.29
ARIMA(0, 3, 1) - AIC: -14249.50
ARIMA(0, 3, 2) - AIC: -16259.89
ARIMA(0, 3, 3) - AIC: -14246.13
ARIMA(0, 3, 4) - AIC: -14242.16
ARIMA(0, 3, 5) - AIC: -14240.97
ARIMA(0, 4, 0) - AIC: -12900.08
ARIMA(0, 4, 1) - AIC: -12897.95
ARIMA(0, 4, 2) - AIC: -12897.10
ARIMA(0, 4, 3) - AIC: -12894.61
ARIMA(0, 4, 4) - AIC: -12892.04
ARIMA(0, 4, 5) - AIC: -12889.48
ARIMA(1, 0, 0) - AIC: -17084.41
ARIMA(1, 0, 1) - AIC

In [13]:
# Re-split the frame by row position: the first 80% of rows are used for
# fitting, the remaining rows are held out.
train_size = int(0.8 * len(hcho_data))
train_data, test_data = hcho_data.iloc[:train_size], hcho_data.iloc[train_size:]

# The order is pinned by hand here, replacing whatever value `best_order`
# held before this cell ran.
best_order = (1, 0, 5)

# Fit the chosen ARIMA specification on the training readings.
model = ARIMA(endog=train_data['HCHO Reading'], order=best_order)
results = model.fit()

# Show the fitted-model report as the cell output.
results.summary()

0,1,2,3
Dep. Variable:,HCHO Reading,No. Observations:,1460.0
Model:,"ARIMA(1, 0, 5)",Log Likelihood,11526.345
Date:,"Wed, 27 Mar 2024",AIC,-23036.689
Time:,13:35:11,BIC,-22994.4
Sample:,01-01-2019,HQIC,-23020.913
,- 12-30-2022,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0001,1.56e-05,9.013,0.000,0.000,0.000
ar.L1,0.9839,0.003,284.414,0.000,0.977,0.991
ma.L1,-0.7227,0.001,-754.836,0.000,-0.725,-0.721
ma.L2,-0.1092,0.001,-112.165,0.000,-0.111,-0.107
ma.L3,0.0297,0.001,33.300,0.000,0.028,0.031
ma.L4,-0.0443,0.001,-50.013,0.000,-0.046,-0.043
ma.L5,-0.0436,0.001,-51.634,0.000,-0.045,-0.042
sigma2,8.051e-09,2.34e-10,34.349,0.000,7.59e-09,8.51e-09

0,1,2,3
Ljung-Box (L1) (Q):,0.86,Jarque-Bera (JB):,143.63
Prob(Q):,0.35,Prob(JB):,0.0
Heteroskedasticity (H):,0.85,Skew:,0.19
Prob(H) (two-sided):,0.08,Kurtosis:,4.49


### FORECASTING