In [None]:
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
import numpy as np
from sklearn.model_selection import train_test_split
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error
import pickle

# Names of date/time-derived feature columns (presumably calendar features of 'timestamp').
# NOTE(review): not referenced by any of the visible cells — confirm it is still needed.
datetime_features = ['dayofweek', 'month', 'hour', 'minute', 'is_weekend']

def get_masked_data(df: pd.DataFrame,
                    start='2023-01-01 08:00:00',
                    end='2023-12-31 08:00:00') -> pd.DataFrame:
    """Return the rows of ``df`` inside a time window with valid precipitation.

    A row is kept when its 'timestamp' lies in the half-open interval
    ``[start, end)`` and its 'polygon_1663' value is >= 0. Remaining NaN
    values in any column are replaced by 0.

    Note that a NaN in 'polygon_1663' compares False against ``>= 0``, so
    such rows are dropped rather than filled.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a datetime 'timestamp' column and a numeric
        'polygon_1663' column.
    start, end : str or datetime-like
        Window bounds, parsed with ``pd.to_datetime``. ``start`` is
        inclusive, ``end`` is exclusive.

    Returns
    -------
    pd.DataFrame
        A new frame; ``df`` itself is never modified.
    """
    start_timestamp = pd.to_datetime(start)
    end_timestamp = pd.to_datetime(end)

    in_window = (df['timestamp'] >= start_timestamp) & (df['timestamp'] < end_timestamp)
    # Negative readings are treated as invalid and removed.
    non_negative = df['polygon_1663'] >= 0

    # Build the result in one step and return a fresh frame instead of calling
    # fillna(inplace=True) on a filtered slice of the caller's data.
    return df.loc[in_window & non_negative].fillna(0)


def to_input(df: pd.DataFrame) -> pd.DataFrame:
    """Replace missing 'polygon_1663' values with 0 and return the frame.

    The column is updated on ``df`` itself and the same object is returned.

    The filled column is assigned back explicitly: calling
    ``df['polygon_1663'].fillna(0, inplace=True)`` operates on a temporary
    Series and leaves ``df`` unchanged when pandas Copy-on-Write is active.
    """
    df['polygon_1663'] = df['polygon_1663'].fillna(0)

    return df

# Load the cleaned precipitation table, then restrict and fill it in one pass.
data = (
    pd.read_parquet("../data/rucphen_precipitation_clean.parquet")
    .pipe(get_masked_data)
    .pipe(to_input)
)




: 

In [4]:
# Use 'polygon_1663' as the feature of interest.
# Guard the re-indexing so the cell can be re-run: once 'timestamp' has become
# the index it is no longer a column, and an unconditional set_index would
# raise KeyError on the second execution.
if 'timestamp' in data.columns:
    data = data.set_index('timestamp')

y = data['polygon_1663']

data.head()


Unnamed: 0_level_0,polygon_1663,percentage
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-01-01 08:00:00,0.02,7.018
2023-01-01 08:05:00,0.01,7.018
2023-01-01 08:10:00,0.0,7.018
2023-01-01 08:15:00,0.0,7.018
2023-01-01 08:20:00,0.0,7.018


In [5]:
# Augmented Dickey-Fuller test on the target series; the first two entries of
# the returned tuple are the test statistic and its p-value.
result = adfuller(y)
adf_statistic, adf_p_value = result[0], result[1]
print('ADF Statistic: %f' % adf_statistic)
print('p-value: %f' % adf_p_value)


ADF Statistic: -25.229313
p-value: 0.000000


In [6]:
# Chronological hold-out: the first 80% of observations train the model,
# the remaining tail is kept for evaluation (no shuffling).
split_ratio = 0.8
split_index = int(len(y) * split_ratio)

train, test = y.iloc[:split_index], y.iloc[split_index:]


In [7]:
# Candidate values (0 or 1) for each non-seasonal ARIMA order term.
p, d, q = (range(2),) * 3
# Candidate values (0 or 1) for each seasonal order term.
P, D, Q = (range(2),) * 3
# Seasonal period for daily seasonality: 288 five-minute steps per day.
s = 288


In [8]:
from itertools import product

# Cartesian product of the candidate orders: one tuple per
# (p, d, q, P, D, Q) combination, materialised so it can be iterated repeatedly.
parameters = product(p, d, q, P, D, Q)
parameters_list = [combo for combo in parameters]


In [9]:
import warnings
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error

# Silences every warning for the rest of the session, not only convergence
# warnings from the optimiser.
warnings.filterwarnings("ignore")  # Ignore convergence warnings

# Exhaustive search over the candidate orders, keeping the fit with the
# lowest AIC.
best_aic = float("inf")
best_params = None
best_model = None

for params in parameters_list:
    order = tuple(params[:3])
    seasonal_order = (*params[3:], s)
    try:
        model = SARIMAX(train, order=order, seasonal_order=seasonal_order).fit(disp=False)
    except Exception as exc:
        # Skip combinations that cannot be fitted, but report why. A bare
        # `except:` would also swallow KeyboardInterrupt and hide the cause.
        print(f"ARIMA{params} - failed: {exc!r}")
        continue

    aic = model.aic
    print(f"ARIMA{params} - AIC:{aic}")
    if aic < best_aic:
        best_aic = aic
        best_params = params
        best_model = model

if best_params is None:
    raise RuntimeError("SARIMAX grid search failed: no parameter combination could be fitted")

# Report the seasonal period actually used (s) rather than a hard-coded 12.
print('Best SARIMAX{}x{}{} model - AIC:{}'.format(best_params[:3], best_params[3:], s, best_aic))


ARIMA(0, 0, 0, 0, 0, 0) - AIC:-190676.49989552243


In [None]:
# Persist the selected model. Refuse to write a pickle of None, which would
# only surface later as a confusing error when the file is loaded.
# NOTE(review): the destination is an absolute, machine-specific path.
if best_model is None:
    raise RuntimeError("No fitted SARIMAX model available to save")

with open('/home/minorai2/capstone/models/best_sarimax_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

In [None]:
# Evaluate on the hold-out tail. The model was fitted on `train` only, so the
# test period is entirely out of sample: forecast the next len(test) steps by
# position. Looking the test timestamps up in the model's index (as
# predict(start=..., end=...) does) requires a date index with a usable
# frequency, which an index with gaps does not provide.
if best_model is None:
    raise RuntimeError("No fitted SARIMAX model available for evaluation")

forecast_values = np.asarray(best_model.forecast(steps=len(test)))
# Re-label the forecasts with the test timestamps so they line up with `test`.
predictions = pd.Series(forecast_values, index=test.index, name='predictions')

mse = mean_squared_error(test, predictions)
print(f'Mean Squared Error: {mse}')
