In [1]:
#Import the libraries
import numpy as np
import pandas as pd

import pickle

import warnings
warnings.filterwarnings("ignore")

In [2]:
train_data = pickle.load(open('clean_dataset_2022/train_set.bin', 'rb'))

test_data = pickle.load(open('clean_dataset_2022/test_set.bin', 'rb'))

## SARIMAX

In [3]:
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.statespace.sarimax import SARIMAX

from matplotlib.pyplot import figure

### Resample

In [4]:
province = ['BKK','CNX','KKC','RAY','SARA','SURAT']

In [5]:
for p in province:
    train_data[p] = train_data[p].resample('6H').mean()

    test_data[p] = test_data[p].resample('6H').mean()

In [32]:
test_data

{'BKK':                           Temp  WindSpeed     WindDir       PM25
 Time                                                            
 2020-07-01 00:00:00  28.420000  52.000000  264.000000   9.600000
 2020-07-01 06:00:00  29.550000  48.666667  260.000000  12.333333
 2020-07-01 12:00:00  29.883333  36.166667  270.833333  17.166667
 2020-07-01 18:00:00  28.833333  51.833333  273.333333  11.833333
 2020-07-02 00:00:00  28.416667  38.666667  274.166667  12.666667
 ...                        ...        ...         ...        ...
 2021-06-30 18:00:00  27.833333  21.833333  261.666667  15.000000
 2021-07-01 00:00:00  27.350000  22.833333  274.166667  15.000000
 2021-07-01 06:00:00  29.883333  19.333333  255.000000  14.000000
 2021-07-01 12:00:00  31.933333  18.166667  240.000000  14.000000
 2021-07-01 18:00:00  29.100000  16.800000  247.000000  13.800000
 
 [1464 rows x 4 columns],
 'CNX':                           Temp  WindSpeed     WindDir       PM25
 Time                             

### Split 70% 30%

In [6]:
train_set = {} ; valid_set = {}

ratio = 0.7

for p in province:
    train_size, valid_size = int(ratio*train_data[p].shape[0]), int((1-ratio)*train_data[p].shape[0])
    train_set[p], valid_set[p] = train_data[p].iloc[:train_size], train_data[p].iloc[train_size: ]

### Tuning Parameters

In [7]:
order = (4, 0, 2)
seasonal_order = (1, 0, 1, 1461) # 365.25 * 4

exog_order = (5, 0, 4)
exog_seasonal_order = (1, 0, 1, 1461) # 365.25 * 4

exog_columns = ['Temp', 'WindSpeed', 'WindDir']

### Training 6 provinces with *minimal_SARIMAX*

In [8]:
from importlib import reload

In [9]:
from custom_function import minimalSARIMAX

reload(minimalSARIMAX)

from custom_function.minimalSARIMAX import MinimalSARIMAX

In [10]:
model = {}
model_exog = {}

for p in province:
    model[p] = MinimalSARIMAX(train_data[p][['PM25']],
                order,
                seasonal_order,
                exog=train_data[p][exog_columns])
    
    model_exog[p] = {}    
    for exog in exog_columns:
        model_exog[p][exog] = MinimalSARIMAX(train_data[p][[exog]],
                        exog_order,
                        exog_seasonal_order)

In [11]:
for p in province:
    model[p].fit(lr=1e-5, lr_decay=0.999 ,verbose=0)

100%|██████████| 4383/4383 [00:00<00:00, 4811.23it/s]
100%|██████████| 4383/4383 [00:00<00:00, 4551.42it/s]
100%|██████████| 4383/4383 [00:00<00:00, 5312.82it/s]
100%|██████████| 4383/4383 [00:01<00:00, 4369.91it/s]
100%|██████████| 4383/4383 [00:00<00:00, 5000.21it/s]
100%|██████████| 2555/2555 [00:00<00:00, 5424.62it/s]


In [12]:
for p in province:
    for exog in exog_columns:
        model_exog[p][exog].fit(lr=1e-5, lr_decay=0.999 ,verbose=0)

100%|██████████| 4383/4383 [00:01<00:00, 2864.75it/s]
100%|██████████| 4383/4383 [00:01<00:00, 3162.36it/s]
100%|██████████| 4383/4383 [00:01<00:00, 3258.74it/s]
100%|██████████| 4383/4383 [00:01<00:00, 3854.94it/s]
100%|██████████| 4383/4383 [00:01<00:00, 3405.59it/s]
100%|██████████| 4383/4383 [00:00<00:00, 4908.21it/s]
100%|██████████| 4383/4383 [00:00<00:00, 4707.84it/s]
100%|██████████| 4383/4383 [00:00<00:00, 4551.44it/s]
100%|██████████| 4383/4383 [00:01<00:00, 4369.94it/s]
100%|██████████| 4383/4383 [00:01<00:00, 4374.25it/s]
100%|██████████| 4383/4383 [00:00<00:00, 4483.74it/s]
100%|██████████| 4383/4383 [00:00<00:00, 4463.31it/s]
100%|██████████| 4383/4383 [00:00<00:00, 4723.15it/s]
100%|██████████| 4383/4383 [00:01<00:00, 3343.25it/s]
100%|██████████| 4383/4383 [00:01<00:00, 3952.19it/s]
100%|██████████| 2555/2555 [00:00<00:00, 3906.80it/s]
100%|██████████| 2555/2555 [00:00<00:00, 3422.37it/s]
100%|██████████| 2555/2555 [00:01<00:00, 2392.86it/s]


In [13]:
y_pred = {} ; Err = {}

In [14]:
y_pred['BKK'], Err['BKK'] = model['BKK'].predict(train_data['BKK'][['PM25']], y_X=train_data['BKK'][exog_columns], verbose=0)

In [15]:
model['BKK'].RMSE(y_pred['BKK'], train_data['BKK'][['PM25']])

Test on SARIMAX with RMSE: 8.00366456684599


In [16]:
df_test = valid_set['BKK'].copy()

In [17]:
y_t = df_test.iloc[:,[0]].to_numpy().ravel()

y_Xt = df_test[exog_columns].to_numpy()

y_pred = [10]

Error = [y_t[0]-10]


verbose = 0

for t in range(1,len(y_t)):
    pred = {} ; x = {}
    pred['p'], x['p'] = model['BKK'].p_prediction(y_t, t)
    pred['q'], x['q'] = model['BKK'].q_prediction(y_t, Error, t)
    pred['pX'], x['pX'] = model['BKK'].pX_prediction(y_Xt, t)
    pred['P'], x['P'] = model['BKK'].P_prediction(y_t, t)
    pred['Q'], x['Q'] = model['BKK'].Q_prediction(y_t, Error, t)

    pred['y'] = (pred['p'] + pred['q'] + pred['P'] + pred['Q'] + model['BKK'].params['c']).sum()
    
    y_pred.append(pred['y'])

    error_t = y_t[t] - pred['y']

    if verbose:
        print(t, pred['y'], y_t[t], error_t)

    Error.append(error_t)

y_pred_tmp = df_test.iloc[:,[0]].copy()
y_pred_tmp['PM25'] = np.array(y_pred)

In [18]:
# cnx_data = pd.concat((cnx_train, cnx_valid, cnx_test), axis=0)
# bkk_data = pd.concat((bkk_train, bkk_valid, bkk_test), axis=0)

In [19]:
# model_cnx.plot(cnx_data['PM25'], cnx_y_pred['PM25'], "Chiangmai PM2.5 Prediction")

In [20]:
# model_bkk.plot(bkk_data['PM25'], bkk_y_pred['PM25'], "Bangkok PM2.5 Prediction")

In [21]:
# # Open file - Write binary mode
# model_file = open('mod_cnx[0-1-1_1-1-0-365].model', 'wb')

# # Save Decision tree model
# pickle.dump(mod_cnx, model_file)

# # Close file
# model_file.close()

In [22]:
# test_exog = pd.concat((cnx_valid[exog_columns], cnx_test[exog_columns]), axis=0)

In [23]:
# bkk_data = pd.concat((bkk_train, cnx_valid, bkk_test), axis=0)

# Grid Search

In [24]:
# from sklearn.model_selection import GridSearchCV

In [25]:
import itertools
import statsmodels.api as sm

In [33]:
p = d = q = range(0, 3)
pdq = list(itertools.product(p, d, q))
pdq

[(0, 0, 0),
 (0, 0, 1),
 (0, 0, 2),
 (0, 1, 0),
 (0, 1, 1),
 (0, 1, 2),
 (0, 2, 0),
 (0, 2, 1),
 (0, 2, 2),
 (1, 0, 0),
 (1, 0, 1),
 (1, 0, 2),
 (1, 1, 0),
 (1, 1, 1),
 (1, 1, 2),
 (1, 2, 0),
 (1, 2, 1),
 (1, 2, 2),
 (2, 0, 0),
 (2, 0, 1),
 (2, 0, 2),
 (2, 1, 0),
 (2, 1, 1),
 (2, 1, 2),
 (2, 2, 0),
 (2, 2, 1),
 (2, 2, 2)]

In [34]:
pdqs = [(x[0], x[1], x[2], 1461) for x in list(itertools.product(p, d, q))]
pdqs

[(0, 0, 0, 1461),
 (0, 0, 1, 1461),
 (0, 0, 2, 1461),
 (0, 1, 0, 1461),
 (0, 1, 1, 1461),
 (0, 1, 2, 1461),
 (0, 2, 0, 1461),
 (0, 2, 1, 1461),
 (0, 2, 2, 1461),
 (1, 0, 0, 1461),
 (1, 0, 1, 1461),
 (1, 0, 2, 1461),
 (1, 1, 0, 1461),
 (1, 1, 1, 1461),
 (1, 1, 2, 1461),
 (1, 2, 0, 1461),
 (1, 2, 1, 1461),
 (1, 2, 2, 1461),
 (2, 0, 0, 1461),
 (2, 0, 1, 1461),
 (2, 0, 2, 1461),
 (2, 1, 0, 1461),
 (2, 1, 1, 1461),
 (2, 1, 2, 1461),
 (2, 2, 0, 1461),
 (2, 2, 1, 1461),
 (2, 2, 2, 1461)]

In [44]:
# Define function
def sarimax_gridsearch(pm_train, exo_train, pm_test, exo_test, pdq, PDQs):
    '''
    Input: 
        pm_train: PM2.5 training data
        exo_train: exogenous training data
        pdq : ARIMA combinations 
        pdqs : seasonal ARIMA combinations 

    Return:
        Prints out top 5 parameter combinations
        Returns dataframe of parameter combinations ranked by RMSE
    '''
    ans = []
    for comb in pdq:
        for combs in PDQs:
            p, d, q = comb[0], comb[1], comb[2]
            P, D, Q = combs[0], combs[1], combs[2]
            if (p+q <= 2) and (d <= 2) and (d+D <= 2) and (P+Q <= 1):  
                try:
                    model = MinimalSARIMAX(pm_train, comb, combs,exog=exo_train)
                    model.fit(lr=1e-5, lr_decay=0.999 ,verbose=0) 
                    y_pred, err = model.predict(pm_test, y_X=exo_test, verbose=0)
                    rmse = model.scoring(y_pred, pm_test)
                    ans.append([comb, combs, rmse])
                    print(f'SARIMAX {comb} x {combs}12 : RMSE Calculated ={rmse}')
                except Exception as e: 
                    print(e)
                    continue

    # Convert into a dataframe
    ans_df = pd.DataFrame(ans, columns=['pdq', 'pdqs', 'rmse'])

    # Sort and return top 5 combinations
    ans_df = ans_df.sort_values(by=['rmse'],ascending=True)[0:5]
    
    return ans_df

In [54]:
pm_train = train_data['BKK'][['PM25']]
pm_test = test_data['BKK'][['PM25']]
exo_train = train_data['BKK'][exog_columns]
exo_test = test_data['BKK'][exog_columns]
print(pm_test.isna().sum())

PM25    66
dtype: int64


In [45]:
sarimax_gridsearch(pm_train, exo_train, pm_test, exo_test, pdq, pdqs)

100%|██████████| 4383/4383 [00:00<00:00, 9131.29it/s]


Input contains NaN, infinity or a value too large for dtype('float64').


100%|██████████| 4383/4383 [00:00<00:00, 8577.34it/s]


Input contains NaN, infinity or a value too large for dtype('float64').


100%|██████████| 4383/4383 [00:00<00:00, 9285.98it/s]


Input contains NaN, infinity or a value too large for dtype('float64').


100%|██████████| 4383/4383 [00:00<00:00, 9246.89it/s]


Input contains NaN, infinity or a value too large for dtype('float64').


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 2 is different from 1)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 2 is different from 1)


100%|██████████| 4383/4383 [00:00<00:00, 8731.07it/s]


Input contains NaN, infinity or a value too large for dtype('float64').


100%|██████████| 4383/4383 [00:00<00:00, 9037.16it/s]


Input contains NaN, infinity or a value too large for dtype('float64').


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 2 is different from 1)


100%|██████████| 4383/4383 [00:00<00:00, 9169.59it/s]


Input contains NaN, infinity or a value too large for dtype('float64').


100%|██████████| 4383/4383 [00:00<00:00, 7954.70it/s]


Input contains NaN, infinity or a value too large for dtype('float64').


100%|██████████| 4383/4383 [00:00<00:00, 8908.61it/s]


Input contains NaN, infinity or a value too large for dtype('float64').


100%|██████████| 4383/4383 [00:00<00:00, 8543.95it/s]


Input contains NaN, infinity or a value too large for dtype('float64').


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 2 is different from 1)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 2 is different from 1)


100%|██████████| 4383/4383 [00:00<00:00, 8560.63it/s]


Input contains NaN, infinity or a value too large for dtype('float64').


100%|██████████| 4383/4383 [00:00<00:00, 9074.55it/s]


Input contains NaN, infinity or a value too large for dtype('float64').


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 2 is different from 1)


100%|██████████| 4383/4383 [00:00<00:00, 9188.91it/s]


Input contains NaN, infinity or a value too large for dtype('float64').


100%|██████████| 4383/4383 [00:00<00:00, 8981.63it/s]


Input contains NaN, infinity or a value too large for dtype('float64').


100%|██████████| 4383/4383 [00:00<00:00, 8944.94it/s]


Input contains NaN, infinity or a value too large for dtype('float64').


100%|██████████| 4383/4383 [00:00<00:00, 8809.46it/s]


Input contains NaN, infinity or a value too large for dtype('float64').


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 2 is different from 1)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 2 is different from 1)


100%|██████████| 4383/4383 [00:00<00:00, 8645.08it/s]


Input contains NaN, infinity or a value too large for dtype('float64').


100%|██████████| 4383/4383 [00:00<00:00, 8981.72it/s]


Input contains NaN, infinity or a value too large for dtype('float64').


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 2 is different from 1)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 1)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 1)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 1)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 1)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 1)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 1)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 1)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 1)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 1)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 1)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 1)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 1)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 1)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 1)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 1)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 1)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 1)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 1)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 2)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 2)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 2)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 2)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 2)


  0%|          | 0/4383 [00:00<?, ?it/s]

matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 2)



  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 2)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 2)


  0%|          | 0/4383 [00:00<?, ?it/s]


matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 2)


100%|██████████| 4383/4383 [00:00<00:00, 7954.63it/s]


Input contains NaN, infinity or a value too large for dtype('float64').


100%|██████████| 4383/4383 [00:00<00:00, 8854.53it/s]


Input contains NaN, infinity or a value too large for dtype('float64').


100%|██████████| 4383/4383 [00:00<00:00, 8926.79it/s]


Input contains NaN, infinity or a value too large for dtype('float64').


 32%|███▏      | 1422/4383 [00:00<00:00, 9355.27it/s]


KeyboardInterrupt: 