In [13]:
#Import the libraries
import numpy as np
import pandas as pd

import pickle

import random
import warnings
warnings.filterwarnings("ignore")

import itertools
from tqdm import tqdm

In [14]:
train_data = pickle.load(open('clean_dataset_2022/train_set.bin', 'rb'))

test_data = pickle.load(open('clean_dataset_2022/test_set.bin', 'rb'))

In [15]:
province = ['BKK','CNX','KKC','RAY','SARA','SURAT']
exog_columns = ['Temp', 'WindSpeed', 'WindDir']

In [16]:
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.statespace.sarimax import SARIMAX

from matplotlib.pyplot import figure
import matplotlib.pyplot as plt

from importlib import reload

from custom_function import minimalSARIMAX

reload(minimalSARIMAX)

from custom_function.minimalSARIMAX import MinimalSARIMAX

In [17]:
p = d = q = range(0, 3)
pdq = list(itertools.product(p, d, q))

P = Q = D = range(0, 2) ; S = 1461
pdqs = [(x[0], x[1], x[2], S) for x in list(itertools.product(P, D, Q))]

In [18]:
# Define function
def sarimax_randomsearch(y_train, y_test, pdq, PDQs, y_val = None, x_train = None, x_test = None, x_val = None, model_exog=None, verbose=0, n_rand=10):
    '''
    Input: 
        y_train: training data
        y_test: test data
        pdq : ARIMA combinations 
        pdqs : seasonal ARIMA combinations 
        x_train: exogenous training data
        x_test: exogenous test data

    Return:
        Returns dataframe of parameter combinations with the least RMSE
    '''

    ans_df = pd.DataFrame(columns=['pdq', 'pdqs', 'rmse'])

    save_comb = set()
    
    i = 0
    while i!=n_rand:
        comb = random.sample(pdq, 1)[0]
        combs = random.sample(PDQs, 1)[0]

        if (comb+combs) in save_comb:
            continue
        
        save_comb.add(comb+combs)
        
        p, d, q = comb[0], comb[1], comb[2]
        P, D, Q = combs[0], combs[1], combs[2]
        if (d <= 1) and (D <= 1) and (P <= 1) and (Q <= 1):  
            model = MinimalSARIMAX(y_train, comb, combs, exog=x_train)
            model.fit(lr=1e-6, lr_decay=0.999, verbose=0) 

            if (y_val is None):
                y_pred, err = model.predict(y_test, y_exog=x_test, verbose=verbose)
                rmse = model.scoring(y_pred, y_test)

            else:
                Result = model.predict_step(y_val, y_test, val_X_exog=x_val, y_exog=x_test,
                                            model_exog=model_exog, lr=np.array([1e-6, 1e-6, 1e-6, 5e-7]), lr_decay=0.999875,
                                            learn=True, verbose=verbose, verbose_rmse=0)

                _, y_pred_sav, _ = Result
                
                rmse = model.scoring(y_pred_sav.iloc[:,[1]], y_pred_sav.iloc[:,[2]])
            
            print(f"ITER#{i} {comb} {combs} {rmse}") ; i=i+1
            
            ans_df = ans_df.append({'pdq':comb, 'pdqs':combs, 'rmse':rmse}, ignore_index=True)
        else: continue

    # Sort and return a combination with the lowest RMSE
    ans_df = ans_df.sort_values(by=['rmse'],ascending=True)
    
    return ans_df

In [19]:
ratio = 0.7

pm_train_bkk = train_data[province[0]][['PM25']][:int(ratio*train_data[province[0]].shape[0])]
pm_valid_bkk = train_data[province[0]][['PM25']][int(ratio*train_data[province[0]].shape[0]):]
pm_test_bkk = test_data[province[0]][['PM25']]
exo_train_bkk = train_data[province[0]][exog_columns][:int(ratio*train_data[province[0]].shape[0])]
exo_train_bkk['WindDirSin'] = pd.DataFrame(np.sin(exo_train_bkk['WindDir'])*10 + 10)
exo_train_bkk['WindDirCos'] = pd.DataFrame(np.cos(exo_train_bkk['WindDir'])*10 + 10)
exo_valid_bkk = train_data[province[0]][exog_columns][int(ratio*train_data[province[0]].shape[0]):]
exo_valid_bkk['WindDirSin'] = pd.DataFrame(np.sin(exo_valid_bkk['WindDir'])*10 + 10)
exo_valid_bkk['WindDirCos'] = pd.DataFrame(np.cos(exo_valid_bkk['WindDir'])*10 + 10)
exo_test_bkk = test_data[province[0]][exog_columns]
exo_test_bkk['WindDir'] = pd.to_numeric(exo_test_bkk['WindDir'], errors='coerce')
exo_test_bkk['WindDirSin'] = pd.DataFrame(np.sin(exo_test_bkk['WindDir'])*10 + 10)
exo_test_bkk['WindDirCos'] = pd.DataFrame(np.cos(exo_test_bkk['WindDir'])*10 + 10)


pm_train_cnx = train_data[province[0]][['PM25']][:int(ratio*train_data[province[0]].shape[0])]
pm_valid_cnx = train_data[province[0]][['PM25']][int(ratio*train_data[province[0]].shape[0]):]
pm_test_cnx = test_data[province[0]][['PM25']]
exo_train_cnx = train_data[province[0]][exog_columns][:int(ratio*train_data[province[0]].shape[0])]
exo_train_cnx['WindDirSin'] = pd.DataFrame(np.sin(exo_train_cnx['WindDir'])*10 + 10)
exo_train_cnx['WindDirCos'] = pd.DataFrame(np.cos(exo_train_cnx['WindDir'])*10 + 10)
exo_valid_cnx = train_data[province[0]][exog_columns][int(ratio*train_data[province[0]].shape[0]):]
exo_valid_cnx['WindDirSin'] = pd.DataFrame(np.sin(exo_valid_cnx['WindDir'])*10 + 10)
exo_valid_cnx['WindDirCos'] = pd.DataFrame(np.cos(exo_valid_cnx['WindDir'])*10 + 10)
exo_test_cnx = test_data[province[0]][exog_columns]
exo_test_cnx['WindDir'] = pd.to_numeric(exo_test_cnx['WindDir'], errors='coerce')
exo_test_cnx['WindDirSin'] = pd.DataFrame(np.sin(exo_test_cnx['WindDir'])*10 + 10)
exo_test_cnx['WindDirCos'] = pd.DataFrame(np.cos(exo_test_cnx['WindDir'])*10 + 10)


pm_train_kkc = train_data[province[0]][['PM25']][:int(ratio*train_data[province[0]].shape[0])]
pm_valid_kkc = train_data[province[0]][['PM25']][int(ratio*train_data[province[0]].shape[0]):]
pm_test_kkc = test_data[province[0]][['PM25']]
exo_train_kkc = train_data[province[0]][exog_columns][:int(ratio*train_data[province[0]].shape[0])]
exo_train_kkc['WindDirSin'] = pd.DataFrame(np.sin(exo_train_kkc['WindDir'])*10 + 10)
exo_train_kkc['WindDirCos'] = pd.DataFrame(np.cos(exo_train_kkc['WindDir'])*10 + 10)
exo_valid_kkc = train_data[province[0]][exog_columns][int(ratio*train_data[province[0]].shape[0]):]
exo_valid_kkc['WindDirSin'] = pd.DataFrame(np.sin(exo_valid_kkc['WindDir'])*10 + 10)
exo_valid_kkc['WindDirCos'] = pd.DataFrame(np.cos(exo_valid_kkc['WindDir'])*10 + 10)
exo_test_kkc = test_data[province[0]][exog_columns]
exo_test_kkc['WindDir'] = pd.to_numeric(exo_test_kkc['WindDir'], errors='coerce')
exo_test_kkc['WindDirSin'] = pd.DataFrame(np.sin(exo_test_kkc['WindDir'])*10 + 10)
exo_test_kkc['WindDirCos'] = pd.DataFrame(np.cos(exo_test_kkc['WindDir'])*10 + 10)


pm_train_ray = train_data[province[0]][['PM25']][:int(ratio*train_data[province[0]].shape[0])]
pm_valid_ray = train_data[province[0]][['PM25']][int(ratio*train_data[province[0]].shape[0]):]
pm_test_ray = test_data[province[0]][['PM25']]
exo_train_ray = train_data[province[0]][exog_columns][:int(ratio*train_data[province[0]].shape[0])]
exo_train_ray['WindDirSin'] = pd.DataFrame(np.sin(exo_train_ray['WindDir'])*10 + 10)
exo_train_ray['WindDirCos'] = pd.DataFrame(np.cos(exo_train_ray['WindDir'])*10 + 10)
exo_valid_ray = train_data[province[0]][exog_columns][int(ratio*train_data[province[0]].shape[0]):]
exo_valid_ray['WindDirSin'] = pd.DataFrame(np.sin(exo_valid_ray['WindDir'])*10 + 10)
exo_valid_ray['WindDirCos'] = pd.DataFrame(np.cos(exo_valid_ray['WindDir'])*10 + 10)
exo_test_ray = test_data[province[0]][exog_columns]
exo_test_ray['WindDir'] = pd.to_numeric(exo_test_ray['WindDir'], errors='coerce')
exo_test_ray['WindDirSin'] = pd.DataFrame(np.sin(exo_test_ray['WindDir'])*10 + 10)
exo_test_ray['WindDirCos'] = pd.DataFrame(np.cos(exo_test_ray['WindDir'])*10 + 10)


pm_train_sara = train_data[province[0]][['PM25']][:int(ratio*train_data[province[0]].shape[0])]
pm_valid_sara = train_data[province[0]][['PM25']][int(ratio*train_data[province[0]].shape[0]):]
pm_test_sara = test_data[province[0]][['PM25']]
exo_train_sara = train_data[province[0]][exog_columns][:int(ratio*train_data[province[0]].shape[0])]
exo_train_sara['WindDirSin'] = pd.DataFrame(np.sin(exo_train_sara['WindDir'])*10 + 10)
exo_train_sara['WindDirCos'] = pd.DataFrame(np.cos(exo_train_sara['WindDir'])*10 + 10)
exo_valid_sara = train_data[province[0]][exog_columns][int(ratio*train_data[province[0]].shape[0]):]
exo_valid_sara['WindDirSin'] = pd.DataFrame(np.sin(exo_valid_sara['WindDir'])*10 + 10)
exo_valid_sara['WindDirCos'] = pd.DataFrame(np.cos(exo_valid_sara['WindDir'])*10 + 10)
exo_test_sara = test_data[province[0]][exog_columns]
exo_test_sara['WindDir'] = pd.to_numeric(exo_test_sara['WindDir'], errors='coerce')
exo_test_sara['WindDirSin'] = pd.DataFrame(np.sin(exo_test_sara['WindDir'])*10 + 10)
exo_test_sara['WindDirCos'] = pd.DataFrame(np.cos(exo_test_sara['WindDir'])*10 + 10)


pm_train_surat = train_data[province[0]][['PM25']][:int(ratio*train_data[province[0]].shape[0])]
pm_valid_surat = train_data[province[0]][['PM25']][int(ratio*train_data[province[0]].shape[0]):]
pm_test_surat = test_data[province[0]][['PM25']]
exo_train_surat = train_data[province[0]][exog_columns][:int(ratio*train_data[province[0]].shape[0])]
exo_train_surat['WindDirSin'] = pd.DataFrame(np.sin(exo_train_surat['WindDir'])*10 + 10)
exo_train_surat['WindDirCos'] = pd.DataFrame(np.cos(exo_train_surat['WindDir'])*10 + 10)
exo_valid_surat = train_data[province[0]][exog_columns][int(ratio*train_data[province[0]].shape[0]):]
exo_valid_surat['WindDirSin'] = pd.DataFrame(np.sin(exo_valid_surat['WindDir'])*10 + 10)
exo_valid_surat['WindDirCos'] = pd.DataFrame(np.cos(exo_valid_surat['WindDir'])*10 + 10)
exo_test_surat = test_data[province[0]][exog_columns]
exo_test_surat['WindDir'] = pd.to_numeric(exo_test_surat['WindDir'], errors='coerce')
exo_test_surat['WindDirSin'] = pd.DataFrame(np.sin(exo_test_surat['WindDir'])*10 + 10)
exo_test_surat['WindDirCos'] = pd.DataFrame(np.cos(exo_test_surat['WindDir'])*10 + 10)

In [20]:
temp = exog_columns[0]

temp_train_bkk = pd.DataFrame(exo_train_bkk[temp])
temp_valid_bkk = pd.DataFrame(exo_valid_bkk[temp])
temp_test_bkk = pd.DataFrame(exo_test_bkk[temp])

temp_train_cnx = pd.DataFrame(exo_train_cnx[temp])
temp_valid_cnx = pd.DataFrame(exo_valid_cnx[temp])
temp_test_cnx = pd.DataFrame(exo_test_cnx[temp])

temp_train_kkc = pd.DataFrame(exo_train_kkc[temp])
temp_valid_kkc = pd.DataFrame(exo_valid_kkc[temp])
temp_test_kkc = pd.DataFrame(exo_test_kkc[temp])

temp_train_ray = pd.DataFrame(exo_train_ray[temp])
temp_valid_ray = pd.DataFrame(exo_valid_ray[temp])
temp_test_ray = pd.DataFrame(exo_test_ray[temp])

temp_train_sara = pd.DataFrame(exo_train_sara[temp])
temp_valid_sara = pd.DataFrame(exo_valid_sara[temp])
temp_test_sara = pd.DataFrame(exo_test_sara[temp])

temp_train_surat = pd.DataFrame(exo_train_surat[temp])
temp_valid_surat = pd.DataFrame(exo_valid_surat[temp])
temp_test_surat = pd.DataFrame(exo_test_surat[temp])

In [21]:
windSpeed = exog_columns[1]
windSpeed_train_bkk = pd.DataFrame(exo_train_bkk[windSpeed])
windSpeed_valid_bkk = pd.DataFrame(exo_valid_bkk[windSpeed])
windSpeed_test_bkk = pd.DataFrame(exo_test_bkk[windSpeed])

windSpeed_train_cnx = pd.DataFrame(exo_train_cnx[windSpeed])
windSpeed_valid_cnx = pd.DataFrame(exo_valid_cnx[windSpeed])
windSpeed_test_cnx = pd.DataFrame(exo_test_cnx[windSpeed])

windSpeed_train_kkc = pd.DataFrame(exo_train_kkc[windSpeed])
windSpeed_valid_kkc = pd.DataFrame(exo_valid_kkc[windSpeed])
windSpeed_test_kkc = pd.DataFrame(exo_test_kkc[windSpeed])

windSpeed_train_ray = pd.DataFrame(exo_train_ray[windSpeed])
windSpeed_valid_ray = pd.DataFrame(exo_valid_ray[windSpeed])
windSpeed_test_ray = pd.DataFrame(exo_test_ray[windSpeed])

windSpeed_train_sara = pd.DataFrame(exo_train_sara[windSpeed])
windSpeed_valid_sara = pd.DataFrame(exo_valid_sara[windSpeed])
windSpeed_test_sara = pd.DataFrame(exo_test_sara[windSpeed])

windSpeed_train_surat = pd.DataFrame(exo_train_surat[windSpeed])
windSpeed_valid_surat = pd.DataFrame(exo_valid_surat[windSpeed])
windSpeed_test_surat = pd.DataFrame(exo_test_surat[windSpeed])

In [22]:
windDir = exog_columns[2]

windDirSin_train_bkk = pd.DataFrame(np.sin(exo_train_bkk[windDir])*10 + 10)
windDirSin_valid_bkk = pd.DataFrame(np.sin(exo_valid_bkk[windDir])*10 + 10)
windDirSin_test_bkk = pd.DataFrame(np.sin(exo_test_bkk[windDir])*10 + 10)
windDirCos_train_bkk = pd.DataFrame(np.cos(exo_train_bkk[windDir])*10 + 10)
windDirCos_valid_bkk = pd.DataFrame(np.sin(exo_valid_bkk[windDir])*10 + 10)
windDirCos_test_bkk = pd.DataFrame(np.cos(exo_test_bkk[windDir])*10 + 10)

windDirSin_train_cnx = pd.DataFrame(np.sin(exo_train_cnx[windDir])*10 + 10)
windDirSin_valid_cnx = pd.DataFrame(np.sin(exo_valid_cnx[windDir])*10 + 10)
windDirSin_test_cnx = pd.DataFrame(np.sin(exo_test_cnx[windDir])*10 + 10)
windDirCos_train_cnx = pd.DataFrame(np.cos(exo_train_cnx[windDir])*10 + 10)
windDirCos_valid_cnx = pd.DataFrame(np.sin(exo_valid_cnx[windDir])*10 + 10)
windDirCos_test_cnx = pd.DataFrame(np.cos(exo_test_cnx[windDir])*10 + 10)

windDirSin_train_kkc = pd.DataFrame(np.sin(exo_train_kkc[windDir])*10 + 10)
windDirSin_valid_kkc = pd.DataFrame(np.sin(exo_valid_kkc[windDir])*10 + 10)
windDirSin_test_kkc = pd.DataFrame(np.sin(exo_test_kkc[windDir])*10 + 10)
windDirCos_train_kkc = pd.DataFrame(np.cos(exo_train_kkc[windDir])*10 + 10)
windDirCos_valid_kkc = pd.DataFrame(np.sin(exo_valid_kkc[windDir])*10 + 10)
windDirCos_test_kkc = pd.DataFrame(np.cos(exo_test_kkc[windDir])*10 + 10)

windDirSin_train_ray = pd.DataFrame(np.sin(exo_train_ray[windDir])*10 + 10)
windDirSin_valid_ray = pd.DataFrame(np.sin(exo_valid_ray[windDir])*10 + 10)
windDirSin_test_ray = pd.DataFrame(np.sin(exo_test_ray[windDir])*10 + 10)
windDirCos_train_ray = pd.DataFrame(np.cos(exo_train_ray[windDir])*10 + 10)
windDirCos_valid_ray = pd.DataFrame(np.sin(exo_valid_ray[windDir])*10 + 10)
windDirCos_test_ray = pd.DataFrame(np.cos(exo_test_ray[windDir])*10 + 10)

windDirSin_train_sara = pd.DataFrame(np.sin(exo_train_sara[windDir])*10 + 10)
windDirSin_valid_sara = pd.DataFrame(np.sin(exo_valid_sara[windDir])*10 + 10)
windDirSin_test_sara = pd.DataFrame(np.sin(exo_test_sara[windDir])*10 + 10)
windDirCos_train_sara = pd.DataFrame(np.cos(exo_train_sara[windDir])*10 + 10)
windDirCos_valid_sara = pd.DataFrame(np.sin(exo_valid_sara[windDir])*10 + 10)
windDirCos_test_sara = pd.DataFrame(np.cos(exo_test_sara[windDir])*10 + 10)

windDirSin_train_surat = pd.DataFrame(np.sin(exo_train_surat[windDir])*10 + 10)
windDirSin_valid_surat = pd.DataFrame(np.sin(exo_valid_surat[windDir])*10 + 10)
windDirSin_test_surat = pd.DataFrame(np.sin(exo_test_surat[windDir])*10 + 10)
windDirCos_train_surat = pd.DataFrame(np.cos(exo_train_surat[windDir])*10 + 10)
windDirCos_valid_surat = pd.DataFrame(np.sin(exo_valid_surat[windDir])*10 + 10)
windDirCos_test_surat = pd.DataFrame(np.cos(exo_test_surat[windDir])*10 + 10)

In [23]:
exog_columns = ['Temp', 'WindSpeed', 'WindDirSin', 'WindDirCos']

In [24]:
model_exog_surat = {}
exog_order_surat = {}
exog_seasonal_order_surat = {}

model_exog_surat['Temp'] = MinimalSARIMAX(temp_train_surat, (2, 0, 2), (0, 0, 0, 1461))
model_exog_surat['WindSpeed'] = MinimalSARIMAX(windSpeed_train_surat, (2, 1, 0), (1, 0, 1, 1461))
model_exog_surat['WindDirSin'] = MinimalSARIMAX(windDirSin_train_surat, (1, 0, 2), (0, 1, 0, 1461))
model_exog_surat['WindDirCos'] = MinimalSARIMAX(windDirCos_train_surat, (2, 0, 2), (0, 1, 0, 1461)	)

for exog in exog_columns:
    model_exog_surat[exog].fit(lr=1e-6, lr_decay=0.999, verbose=0)

result_surat = sarimax_randomsearch(pm_train_surat, pm_test_surat, pdq, pdqs, y_val=pm_valid_surat, x_train=exo_train_surat[exog_columns], x_test=exo_test_surat[exog_columns], x_val=exo_valid_surat[exog_columns],
                                  model_exog=model_exog_surat, n_rand=3, verbose=1)

display(result_surat)

100%|██████████| 16675/16675 [2:37:56<00:00,  1.76it/s]  


ITER#0 (1, 0, 1) (0, 1, 1, 1461) 11.944792679813055


100%|██████████| 16675/16675 [2:41:48<00:00,  1.72it/s]  


ITER#1 (0, 0, 2) (1, 0, 0, 1461) 11.70976219682504


100%|██████████| 16675/16675 [2:38:21<00:00,  1.76it/s]  

ITER#2 (2, 0, 2) (0, 1, 0, 1461) 11.451204403766797





Unnamed: 0,pdq,pdqs,rmse
2,"(2, 0, 2)","(0, 1, 0, 1461)",11.451204
1,"(0, 0, 2)","(1, 0, 0, 1461)",11.709762
0,"(1, 0, 1)","(0, 1, 1, 1461)",11.944793
