# Random Search

In [106]:
#Import the libraries
import numpy as np
import pandas as pd

import pickle

import random
import warnings
warnings.filterwarnings("ignore")

import itertools
from tqdm import tqdm

In [107]:
train_data = pickle.load(open('clean_dataset_2022/train_set.bin', 'rb'))

test_data = pickle.load(open('clean_dataset_2022/test_set.bin', 'rb'))

In [108]:
province = ['BKK','CNX','KKC','RAY','SARA','SURAT']
exog_columns = ['Temp', 'WindSpeed', 'WindDir']

In [109]:
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.statespace.sarimax import SARIMAX

from matplotlib.pyplot import figure
import matplotlib.pyplot as plt

In [110]:
from importlib import reload

from custom_function import minimalSARIMAX

reload(minimalSARIMAX)

from custom_function.minimalSARIMAX import MinimalSARIMAX

In [111]:
p = q = range(0, 7) ; d = range(0, 2)
pdq = list(itertools.product(p, d, q))

In [112]:
P = Q = range(0, 3) ; D = range(0, 2) ; S = 1461
pdqs = [(x[0], x[1], x[2], S) for x in list(itertools.product(P, D, Q))]

In [113]:
# Define function
def sarimax_randomsearch(y_train, y_test, pdq, PDQs, y_val = None, x_train = None, x_test = None, x_val = None, model_exog=None, verbose=0, n_rand=10):
    '''
    Input: 
        y_train: training data
        y_test: test data
        pdq : ARIMA combinations 
        pdqs : seasonal ARIMA combinations 
        x_train: exogenous training data
        x_test: exogenous test data

    Return:
        Returns dataframe of parameter combinations with the least RMSE
    '''

    ans_df = pd.DataFrame(columns=['pdq', 'pdqs', 'rmse'])
    
    
    for i in range(n_rand):
        comb = random.sample(pdq, 1)[0]
        combs = random.sample(PDQs, 1)[0]
        
        p, d, q = comb[0], comb[1], comb[2]
        P, D, Q = combs[0], combs[1], combs[2]
        if (d <= 1) and (D <= 1) and (P <= 1) and (Q <= 1):  
            model = MinimalSARIMAX(y_train, comb, combs, exog=x_train)
            model.fit(lr=1e-6, lr_decay=0.999, verbose=0) 

            if (y_val is None):
                y_pred, err = model.predict(y_test, y_exog=x_test, verbose=verbose)
                rmse = model.scoring(y_pred, y_test)

            else:
                Result = model.predict_step(y_val, y_test, val_X_exog=x_val, y_exog=x_test,
                                            model_exog=model_exog, lr=np.array([1e-6, 1e-6, 1e-6, 5e-7]), lr_decay=0.999875,
                                            learn=True, verbose=verbose, verbose_rmse=0)

                _, y_pred_sav, _ = Result
                
                rmse = model.scoring(y_pred_sav.iloc[:,[1]], y_pred_sav.iloc[:,[2]])
            
            print(f"ITER#{i} {comb} {combs}")
            
            ans_df = ans_df.append({'pdq':comb, 'pdqs':combs, 'rmse':rmse}, ignore_index=True)
        else:
            i-=1

    # Sort and return a combination with the lowest RMSE
    ans_df = ans_df.sort_values(by=['rmse'],ascending=True)
    
    return ans_df

In [114]:
ratio = 0.7

pm_train_bkk = train_data[province[0]][['PM25']][:int(ratio*train_data[province[0]].shape[0])]
pm_valid_bkk = train_data[province[0]][['PM25']][int(ratio*train_data[province[0]].shape[0]):]
pm_test_bkk = test_data[province[0]][['PM25']]
exo_train_bkk = train_data[province[0]][exog_columns][:int(ratio*train_data[province[0]].shape[0])]
exo_valid_bkk = train_data[province[0]][exog_columns][int(ratio*train_data[province[0]].shape[0]):]
exo_test_bkk = test_data[province[0]][exog_columns]


pm_train_cnx = train_data[province[1]][['PM25']][:int(ratio*train_data[province[0]].shape[0])]
pm_valid_cnx = train_data[province[1]][['PM25']][int(ratio*train_data[province[0]].shape[0]):]
pm_test_cnx = test_data[province[1]][['PM25']]
exo_train_cnx = train_data[province[1]][exog_columns][:int(ratio*train_data[province[0]].shape[0])]
exo_valid_cnx = train_data[province[1]][exog_columns][int(ratio*train_data[province[0]].shape[0]):]
exo_test_cnx = test_data[province[1]][exog_columns]


pm_train_kkc = train_data[province[2]][['PM25']][:int(ratio*train_data[province[0]].shape[0])]
pm_valid_kkc = train_data[province[2]][['PM25']][int(ratio*train_data[province[0]].shape[0]):]
pm_test_kkc = test_data[province[2]][['PM25']]
exo_train_kkc = train_data[province[2]][exog_columns][:int(ratio*train_data[province[0]].shape[0])]
exo_valid_kkc = train_data[province[2]][exog_columns][int(ratio*train_data[province[0]].shape[0]):]
exo_test_kkc = test_data[province[2]][exog_columns]


pm_train_ray = train_data[province[3]][['PM25']][:int(ratio*train_data[province[0]].shape[0])]
pm_valid_ray = train_data[province[3]][['PM25']][int(ratio*train_data[province[0]].shape[0]):]
pm_test_ray = test_data[province[3]][['PM25']]
exo_train_ray = train_data[province[3]][exog_columns][:int(ratio*train_data[province[0]].shape[0])]
exo_valid_ray = train_data[province[3]][exog_columns][int(ratio*train_data[province[0]].shape[0]):]
exo_test_ray = test_data[province[3]][exog_columns]


pm_train_sara = train_data[province[4]][['PM25']][:int(ratio*train_data[province[0]].shape[0])]
pm_valid_sara = train_data[province[4]][['PM25']][int(ratio*train_data[province[0]].shape[0]):]
pm_test_sara = test_data[province[4]][['PM25']]
exo_train_sara = train_data[province[4]][exog_columns][:int(ratio*train_data[province[0]].shape[0])]
exo_valid_sara = train_data[province[4]][exog_columns][int(ratio*train_data[province[0]].shape[0]):]
exo_test_sara = test_data[province[4]][exog_columns]


pm_train_surat = train_data[province[5]][['PM25']][:int(ratio*train_data[province[0]].shape[0])]
pm_valid_surat = train_data[province[5]][['PM25']][int(ratio*train_data[province[0]].shape[0]):]
pm_test_surat = test_data[province[5]][['PM25']]
exo_train_surat = train_data[province[5]][exog_columns][:int(ratio*train_data[province[0]].shape[0])]
exo_valid_surat = train_data[province[5]][exog_columns][int(ratio*train_data[province[0]].shape[0]):]
exo_test_surat = test_data[province[5]][exog_columns]

## Tuning parameters for temperature

In [115]:
temp = exog_columns[0]

temp_train_bkk = pd.DataFrame(exo_train_bkk[temp])
temp_valid_bkk = pd.DataFrame(exo_valid_bkk[temp])
temp_test_bkk = pd.DataFrame(exo_test_bkk[temp])

temp_train_cnx = pd.DataFrame(exo_train_cnx[temp])
temp_valid_cnx = pd.DataFrame(exo_valid_cnx[temp])
temp_test_cnx = pd.DataFrame(exo_test_cnx[temp])

temp_train_kkc = pd.DataFrame(exo_train_kkc[temp])
temp_valid_kkc = pd.DataFrame(exo_valid_kkc[temp])
temp_test_kkc = pd.DataFrame(exo_test_kkc[temp])

temp_train_ray = pd.DataFrame(exo_train_ray[temp])
temp_valid_ray = pd.DataFrame(exo_valid_ray[temp])
temp_test_ray = pd.DataFrame(exo_test_ray[temp])

temp_train_sara = pd.DataFrame(exo_train_sara[temp])
temp_valid_sara = pd.DataFrame(exo_valid_sara[temp])
temp_test_sara = pd.DataFrame(exo_test_sara[temp])

temp_train_surat = pd.DataFrame(exo_train_surat[temp])
temp_valid_surat = pd.DataFrame(exo_valid_surat[temp])
temp_test_surat = pd.DataFrame(exo_test_surat[temp])

In [102]:
gSearch_temp_bkk = sarimax_randomsearch(temp_train_bkk, temp_test_bkk, pdq, pdqs, y_val=temp_valid_bkk, n_rand=5, verbose=1)
display(gSearch_temp_bkk)

100%|██████████| 16675/16675 [11:38<00:00, 23.89it/s]


ITER#0  RMSE:2.599983281567849


100%|██████████| 16675/16675 [13:43<00:00, 20.25it/s]


ITER#0  RMSE:2.321497724273218


100%|██████████| 16675/16675 [13:51<00:00, 20.05it/s]


ITER#0  RMSE:2.1328514989179173


100%|██████████| 16675/16675 [14:46<00:00, 18.80it/s]

ITER#0  RMSE:2.240611058713268





In [103]:
display(gSearch_temp_bkk)

Unnamed: 0,pdq,pdqs,rmse
0,"(3, 1, 2)","(0, 1, 0, 1461)",1.801819
2,"(4, 0, 1)","(1, 0, 0, 1461)",2.770823
1,"(6, 1, 2)","(1, 0, 1, 1461)",2.839233
3,"(4, 0, 3)","(1, 0, 1, 1461)",3.039645


In [104]:
gSearch_temp_cnx = sarimax_randomsearch(temp_train_cnx, temp_test_cnx, pdq, pdqs, y_val=temp_valid_bkk, n_rand=5, verbose=1)
display(gSearch_temp_cnx)

100%|██████████| 16675/16675 [14:59<00:00, 18.54it/s]


ITER#0  RMSE:5.476901712043151


100%|██████████| 16675/16675 [14:55<00:00, 18.62it/s]


ITER#0  RMSE:5.4198102510466635


100%|██████████| 16675/16675 [14:54<00:00, 18.64it/s]

ITER#0  RMSE:3.5400890873706032





Unnamed: 0,pdq,pdqs,rmse
1,"(1, 0, 6)","(0, 1, 1, 1461)",1.798262
0,"(1, 0, 4)","(1, 1, 1, 1461)",1.806863
2,"(3, 0, 2)","(1, 1, 1, 1461)",2.001401


In [116]:
gSearch_temp_kkc = sarimax_randomsearch(temp_train_kkc, temp_test_kkc, pdq, pdqs, y_val=temp_valid_bkk, n_rand=5, verbose=1)
display(gSearch_temp_kkc)

In [None]:
gSearch_temp_sara = sarimax_randomsearch(temp_train_sara, temp_test_sara, pdq, pdqs, y_val=temp_valid_bkk, n_rand=5, verbose=1)
display(gSearch_temp_sara)

In [None]:
gSearch_temp_surat = sarimax_randomsearch(temp_train_surat, temp_test_surat, pdq, pdqs, y_val=temp_valid_bkk, n_rand=5, verbose=1)
display(gSearch_temp_surat)

In [None]:
with open('answer_dataset/gSearch_temp_bkk_03.bin',"wb") as f:
    pickle.dump(gSearch_temp_bkk, f)

with open('answer_dataset/gSearch_temp_cnx_03.bin',"wb") as f:
    pickle.dump(gSearch_temp_cnx, f)
    
with open('answer_dataset/gSearch_temp_ray_03.bin',"wb") as f:
    pickle.dump(gSearch_temp_bkk, f)

with open('answer_dataset/gSearch_temp_sara_03.bin',"wb") as f:
    pickle.dump(gSearch_temp_sara, f)

with open('answer_dataset/gSearch_temp_surat_03.bin',"wb") as f:
    pickle.dump(gSearch_temp_surat, f)

In [None]:
temp_order = {province[0]: gSearch_temp_bkk.pdq, province[1]: gSearch_temp_cnx.pdq, province[2]: gSearch_temp_kkc.pdq, province[3]: gSearch_temp_ray.pdq, province[4]: gSearch_temp_sara.pdq, province[5]: gSearch_temp_surat.pdq}
temp_seasonal_order = {province[0]: gSearch_temp_bkk.pdqs, province[1]: gSearch_temp_cnx.pdqs, province[2]: gSearch_temp_kkc.pdqs, province[3]: gSearch_temp_ray.pdqs, province[4]: gSearch_temp_sara.pdqs, province[5]: gSearch_temp_surat.pdqs}
print(temp_order)
print(temp_seasonal_order)

## Tuning parameters for windspeed

In [None]:
windSpeed = exog_columns[1]
windSpeed_train_bkk = pd.DataFrame(exo_train_bkk[windSpeed])
windSpeed_valid_bkk = pd.DataFrame(exo_valid_bkk[windSpeed])
windSpeed_test_bkk = pd.DataFrame(exo_test_bkk[windSpeed])

windSpeed_train_cnx = pd.DataFrame(exo_train_cnx[windSpeed])
windSpeed_valid_cnx = pd.DataFrame(exo_valid_cnx[windSpeed])
windSpeed_test_cnx = pd.DataFrame(exo_test_cnx[windSpeed])

windSpeed_train_kkc = pd.DataFrame(exo_train_kkc[windSpeed])
windSpeed_valid_kkc = pd.DataFrame(exo_valid_kkc[windSpeed])
windSpeed_test_kkc = pd.DataFrame(exo_test_kkc[windSpeed])

windSpeed_train_ray = pd.DataFrame(exo_train_ray[windSpeed])
windSpeed_valid_ray = pd.DataFrame(exo_valid_ray[windSpeed])
windSpeed_test_ray = pd.DataFrame(exo_test_ray[windSpeed])

windSpeed_train_sara = pd.DataFrame(exo_train_sara[windSpeed])
windSpeed_valid_sara = pd.DataFrame(exo_valid_sara[windSpeed])
windSpeed_test_sara = pd.DataFrame(exo_test_sara[windSpeed])

windSpeed_train_surat = pd.DataFrame(exo_train_surat[windSpeed])
windSpeed_valid_surat = pd.DataFrame(exo_valid_surat[windSpeed])
windSpeed_test_surat = pd.DataFrame(exo_test_surat[windSpeed])

In [None]:
gSearch_windSpeed_bkk = sarimax_randomsearch(windSpeed_train_bkk, windSpeed_test_bkk, pdq, pdqs, y_val=windSpeed_valid_bkk)
gSearch_windSpeed_cnx = sarimax_randomsearch(windSpeed_train_cnx, windSpeed_test_cnx, pdq, pdqs, y_val=windSpeed_valid_cnx)
gSearch_windSpeed_kkc = sarimax_randomsearch(windSpeed_train_kkc, windSpeed_test_kkc, pdq, pdqs, y_val=windSpeed_valid_kkc)
gSearch_windSpeed_ray = sarimax_randomsearch(windSpeed_train_ray, windSpeed_test_ray, pdq, pdqs, y_val=windSpeed_valid_ray)
gSearch_windSpeed_sara = sarimax_randomsearch(windSpeed_train_sara, windSpeed_test_sara, pdq, pdqs, y_val=windSpeed_valid_sara)
gSearch_windSpeed_surat = sarimax_randomsearch(windSpeed_train_surat, windSpeed_test_surat, pdq, pdqs, y_val=windSpeed_valid_surat)

In [None]:
windSpeed_order = {province[0]: gSearch_windSpeed_bkk.pdq, province[1]: gSearch_windSpeed_cnx.pdq, province[2]: gSearch_windSpeed_kkc.pdq, province[3]: gSearch_windSpeed_ray.pdq, province[4]: gSearch_windSpeed_sara.pdq, province[5]: gSearch_windSpeed_surat.pdq}
windSpeed_seasonal_order = {province[0]: gSearch_windSpeed_bkk.pdqs, province[1]: gSearch_windSpeed_cnx.pdqs, province[2]: gSearch_windSpeed_kkc.pdqs, province[3]: gSearch_windSpeed_ray.pdqs, province[4]: gSearch_windSpeed_sara.pdqs, province[5]: gSearch_windSpeed_surat.pdqs}
print(windSpeed_order)
print(windSpeed_seasonal_order)

## Tuning parameters for wind direction

In [None]:
windDir = exog_columns[2]
windDir_train_bkk = pd.DataFrame(exo_train_bkk[windDir])
windDir_test_bkk = pd.DataFrame(exo_test_bkk[windDir])

windDir_train_cnx = pd.DataFrame(exo_train_cnx[windDir])
windDir_test_cnx = pd.DataFrame(exo_test_cnx[windDir])

windDir_train_kkc = pd.DataFrame(exo_train_kkc[windDir])
windDir_test_kkc = pd.DataFrame(exo_test_kkc[windDir])

windDir_train_ray = pd.DataFrame(exo_train_ray[windDir])
windDir_test_ray = pd.DataFrame(exo_test_ray[windDir])

windDir_train_sara = pd.DataFrame(exo_train_sara[windDir])
windDir_test_sara = pd.DataFrame(exo_test_sara[windDir])

windDir_train_surat = pd.DataFrame(exo_train_surat[windDir])
windDir_test_surat = pd.DataFrame(exo_test_surat[windDir])

In [None]:
gSearch_windDir_bkk = sarimax_randomsearch(windDir_train_bkk, windDir_test_bkk, pdq, pdqs)
gSearch_windDir_cnx = sarimax_randomsearch(windDir_train_cnx, windDir_test_cnx, pdq, pdqs)
gSearch_windDir_kkc = sarimax_randomsearch(windDir_train_kkc, windDir_test_kkc, pdq, pdqs)
gSearch_windDir_ray = sarimax_randomsearch(windDir_train_ray, windDir_test_ray, pdq, pdqs)
gSearch_windDir_sara = sarimax_randomsearch(windDir_train_sara, windDir_test_sara, pdq, pdqs)
gSearch_windDir_surat = sarimax_randomsearch(windDir_train_surat, windDir_test_surat, pdq, pdqs)

In [None]:
windDir_order = {province[0]: gSearch_windDir_bkk.pdq, province[1]: gSearch_windDir_cnx.pdq, province[2]: gSearch_windDir_kkc.pdq, province[3]: gSearch_windDir_ray.pdq, province[4]: gSearch_windDir_sara.pdq, province[5]: gSearch_windDir_surat.pdq}
windDir_seasonal_order = {province[0]: gSearch_windDir_bkk.pdqs, province[1]: gSearch_windDir_cnx.pdqs, province[2]: gSearch_windDir_kkc.pdqs, province[3]: gSearch_windDir_ray.pdqs, province[4]: gSearch_windDir_sara.pdqs, province[5]: gSearch_windDir_surat.pdqs}
print(windDir_order)
print(windDir_seasonal_order)

## Tuning parameters for PM2.5

In [None]:
result_bkk = sarimax_randomsearch(pm_train_bkk, pm_test_bkk, pdq, pdqs, y_val=pm_valid_bkk, x_train=exo_train_bkk, x_test=exo_test_bkk, x_val=exo_valid_bkk)
result_cnx = sarimax_randomsearch(pm_train_cnx, pm_test_cnx, pdq, pdqs, y_val=pm_valid_cnx, x_train=exo_train_cnx, x_test=exo_test_cnx, x_val=exo_valid_cnx)
result_kkc = sarimax_randomsearch(pm_train_kkc, pm_test_kkc, pdq, pdqs, y_val=pm_valid_kkc, x_train=exo_train_kkc, x_test=exo_test_kkc, x_val=exo_valid_kkc)
result_ray = sarimax_randomsearch(pm_train_ray, pm_test_ray, pdq, pdqs, y_val=pm_valid_ray, x_train=exo_train_ray, x_test=exo_test_ray, x_val=exo_valid_ray)
result_sara = sarimax_randomsearch(pm_train_sara, pm_test_sara, pdq, pdqs, y_val=pm_valid_sara, x_train=exo_train_sara, x_test=exo_test_sara, x_val=exo_valid_sara)
result_surat = sarimax_randomsearch(pm_train_surat, pm_test_surat, pdq, pdqs, y_val=pm_valid_surat, x_train=exo_train_surat, x_test=exo_test_surat, x_val=exo_valid_surat)

In [None]:
with open('answer_dataset/gridsearch/result_bkk_03.bin',"wb") as f:
    pickle.dump(result_bkk, f)

with open('answer_dataset/gridsearch/result_cnx_03.bin',"wb") as f:
    pickle.dump(result_cnx, f)
    
with open('answer_dataset/gridsearch/result_ray_03.bin',"wb") as f:
    pickle.dump(result_bkk, f)

with open('answer_dataset/gridsearch/result_sara_03.bin',"wb") as f:
    pickle.dump(result_sara, f)

with open('answer_dataset/gridsearch/result_surat_03.bin',"wb") as f:
    pickle.dump(result_surat, f)

In [None]:
order = {province[0]: result_bkk.pdq, province[1]: result_cnx.pdq, province[2]: result_kkc.pdq, province[3]: result_ray.pdq, province[4]: result_sara.pdq, province[5]: result_surat.pdq}
seasonal_order = {province[0]: result_bkk.pdqs, province[1]: result_cnx.pdqs, province[2]: result_kkc.pdqs, province[3]: result_ray.pdqs, province[4]: result_sara.pdqs, province[5]: result_surat.pdqs}
print(order)
print(seasonal_order)