In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
# Import the train_test_split module from sklearn
from sklearn.model_selection import train_test_split
# For z-score calculations
from scipy import stats
# Import the GridSearchCV and RandomizeSearchCV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import mplcursors
import pickle
from joblib import dump, load
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import IsolationForest

In [12]:
# Load the raw CSV into a DataFrame. low_memory=False makes pandas read the
# file in a single pass rather than in chunks, which avoids mixed-dtype
# inference within a column.
dataset = pd.read_csv(
    filepath_or_buffer="data/NMBUBigDatasetFinal.csv",
    low_memory=False,
)

In [13]:
# Bare expression: render the loaded frame as this cell's output.
dataset

Unnamed: 0,ID,date,mean_air_temperature_2m,min_air_temperature_2m,max_air_temperature_2m,relative_humidity,air_pressure_2m_mbar,precipitation_mm,evaporation_mm,earth_heat_flux_MJ_m2,radiation_balance_w_m2,phosynthetic_active_radiation_mE_m2,albedo_RR_GR,snow_depth_cm,ST2,ST5,ST10,ST20,ST50,ST100
0,1,1/1/2000,-5.0,-7.6,-3.0,100.0,1001.5,,,-0.3,-0.12,1.17,0.74,,-0.2,-0.1,0.1,0.4,1.9,3.4
1,2,1/2/2000,0.6,-5.0,4.0,88.6,997.7,,,-0.2,-2.21,1.62,0.77,,-0.1,-0.1,0.0,0.4,1.9,3.4
2,3,1/3/2000,2.1,-1.1,4.6,88.6,991.5,8.8,,-0.1,-1.80,0.71,0.39,,-0.1,-0.1,0.0,0.5,1.9,3.4
3,4,1/4/2000,1.8,-3.3,4.4,67.3,987.0,2.1,,-0.1,-4.83,2.49,0.48,,0.0,0.0,0.1,0.5,1.9,3.4
4,5,1/5/2000,0.6,-3.6,4.4,98.2,993.3,1.2,,-0.2,-0.44,1.68,0.17,,0.0,0.0,0.1,0.5,1.9,3.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8851,8852,3/28/2024,5.2,2.4,7.9,90.0,976.7,7.6,,0.6,0.45,13.34,0.21,,1.6,1.3,1.0,0.9,1.2,2.0
8852,8853,3/29/2024,5.9,4.7,7.4,93.6,978.3,2.6,,0.8,0.29,9.17,0.18,,2.9,2.6,2.1,1.7,1.5,2.1
8853,8854,3/30/2024,5.9,4.0,7.4,94.4,990.1,2.0,,0.7,0.61,9.32,0.18,,3.5,3.3,2.8,2.5,1.9,2.2
8854,8855,3/31/2024,6.2,2.4,8.0,93.6,988.1,2.9,,0.6,0.02,8.09,0.18,,4.2,4.1,3.6,3.1,2.4,2.3


In [14]:
# import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_squared_error

# Parse the date strings and make them the index. The guard keeps this cell
# re-runnable: after the first run 'date' is the index rather than a column,
# so an unguarded dataset['date'] lookup would raise KeyError.
if 'date' in dataset.columns:
    dataset = dataset.assign(date=pd.to_datetime(dataset['date'])).set_index('date')

# Sort the data by the datetime index
dataset = dataset.sort_index()

# Build the modelling frame without mutating `dataset`.
# 'ID' and the other ST* columns are removed so that only ST2 remains as the
# target. 'Unnamed: 0' (a 0..N row counter carried over from the CSV) is
# removed as well so it is not fed to the models as a feature; errors='ignore'
# tolerates a CSV that was loaded without that column.
dataset_time_series = (
    dataset
    .drop(columns=['ID', 'ST5', 'ST10', 'ST20', 'ST50', 'ST100'])
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

# Insert a row of NaN for every calendar day missing from the data so that
# consecutive rows are exactly one day apart (row-based lags then equal day lags).
all_dates = pd.date_range(
    start=dataset_time_series.index.min(),
    end=dataset_time_series.index.max(),
    freq='D',
)
dataset_time_series = dataset_time_series.reindex(all_dates)


# Define the target column and the number of lags
target_column = 'ST2'
lags = 10

# Function to create lagged features
def create_features(dataset_time_series, target_column, lags=1):
    """Return a new frame with lagged copies of the target column appended.

    For every ``k`` in ``1..lags`` a column named ``f'{target_column}_lag_{k}'``
    is added, holding the target shifted down by ``k`` rows. Rows that contain
    a NaN in *any* column are then dropped; this always removes the first
    ``lags`` rows, whose lag history is incomplete.

    The input frame is left untouched: the lag columns are added to a copy.

    Parameters
    ----------
    dataset_time_series : pandas.DataFrame
        Frame containing ``target_column``. Lags are row-based, so rows are
        expected to be ordered in time.
    target_column : str
        Name of the column to lag.
    lags : int, default 1
        Number of lag columns to create. Values below 1 add no columns, and
        only the NaN-row filtering is applied.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the lag columns appended and NaN rows removed.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of ``dataset_time_series``.
    """
    target = dataset_time_series[target_column]
    lagged_columns = {
        f'{target_column}_lag_{lag}': target.shift(lag)
        for lag in range(1, lags + 1)
    }
    # assign() works on a copy, so the caller's frame does not silently gain
    # the lag columns as a side effect.
    return dataset_time_series.assign(**lagged_columns).dropna()

# Extend the frame with the lagged target columns (the helper also drops
# every row that still contains a NaN).
dataset_time_series = create_features(dataset_time_series, target_column, lags)

# Every column other than the target is used as a predictor.
feature_columns = [
    column_name
    for column_name in dataset_time_series.columns
    if column_name != target_column
]

# Feature matrix and target vector
X = dataset_time_series[feature_columns]
y = dataset_time_series[target_column]

# Hold out the last 20% of rows; shuffle=False keeps the row order intact,
# so the test set is the tail of the frame.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

# Base learners, all seeded for repeatable fits
rf = RandomForestRegressor(n_estimators=100, random_state=42)
xgb = XGBRegressor(n_estimators=100, random_state=42)
cat = CatBoostRegressor(n_estimators=100, random_state=42, verbose=0)
hgb = HistGradientBoostingRegressor(random_state=42)

# Stacked ensemble: the four base learners feed a random-forest meta-learner
estimators = list(zip(('rf', 'xgb', 'cat', 'hgb'), (rf, xgb, cat, hgb)))

stacking_regressor = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(n_estimators=100, random_state=42),
)

# Display name -> estimator, in the order they are fitted and reported
models = dict(
    RandomForest=rf,
    XGBoost=xgb,
    CatBoost=cat,
    HistGradientBoosting=hgb,
    StackingRegressor=stacking_regressor,
)

# Fit each model on the training rows and report its RMSE on the held-out rows
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
    print(f'{name} RMSE: {rmse:.4f}')


RandomForest RMSE: 2.8611
XGBoost RMSE: 2.6709
CatBoost RMSE: 3.3112
HistGradientBoosting RMSE: 3.2730
StackingRegressor RMSE: 2.8410
