In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import pandas as pd



In [None]:
# Load the preprocessed dataset (path is relative to the working directory).
data = pd.read_csv("Preprocessed_Solar_Power_Data.csv")

# Column / lag configuration, defined once so the lag columns that get created
# and the lag columns that get selected as features cannot drift apart.
# NOTE(review): the lags are built from TOTALDEMAND while the target is
# ALLSKY_SFC_SW_DWN -- confirm this is intended and not a leftover from the
# TOTALDEMAND experiments.
LAG_SOURCE_COL = "TOTALDEMAND"
TARGET_COL = "ALLSKY_SFC_SW_DWN"
MAX_LAG = 95  # lags 1..95, identical to the previous range(1, 96)
lag_names = [f"lag_{lag}" for lag in range(1, MAX_LAG + 1)]

# Build all shifted copies first, then attach them with a single concat.
lagged_cols = {
    f"lag_{lag}": data[LAG_SOURCE_COL].shift(lag)
    for lag in range(1, MAX_LAG + 1)
}
lagged_df = pd.DataFrame(lagged_cols, index=data.index)
data = pd.concat([data, lagged_df], axis=1)

# shift() leaves NaN in the leading rows of every lag column; dropna() removes
# those rows (and any other row holding a NaN in any column).
data = data.dropna().reset_index(drop=True)

# Features requested for the model: weather measurements, cyclic time
# encodings and the lag columns created above.
# The one-hot weekday columns WEEKDAY_0 .. WEEKDAY_6 are currently not used.
requested_features = [
    "T2M", "T2MDEW", "T2MWET", "RH2M", "PS", "WS2M",
    "Basel Wind Gust", "Basel Wind Direction [10 m]",
    "Basel Precipitation Total", "Basel Wind Speed [10 m]",
    "Basel Cloud Cover Total",
    "HOUR_SIN", "HOUR_COS", "WEEKDAY_SIN", "WEEKDAY_COS",
] + lag_names

# Keep only the columns that exist in the data, but report what was skipped
# instead of dropping it silently, so a renamed or missing CSV column is
# noticed rather than quietly shrinking the feature set.
missing_features = [c for c in requested_features if c not in data.columns]
if missing_features:
    print(
        f"Warning: {len(missing_features)} requested feature column(s) "
        f"not found and skipped: {missing_features}"
    )
feature_cols = [c for c in requested_features if c in data.columns]

X = data[feature_cols]
y = data[TARGET_COL]

# Random forest configuration. Two tuned parameter sets exist; only the one
# for the ALLSKY_SFC_SW_DWN target is active.
#
# Tuned for TOTALDEMAND (inactive):
#     n_estimators=10, max_depth=None, min_samples_split=10,
#     min_samples_leaf=1, max_features=10, bootstrap=True, random_state=42
regr = RandomForestRegressor(
    # Tuned for ALLSKY_SFC_SW_DWN (active).
    random_state=42,
    bootstrap=True,
    max_features="log2",
    min_samples_leaf=1,
    min_samples_split=5,
    max_depth=None,
    n_estimators=50,
)

# Ordered 5-fold splitter used for evaluation. It replaces the earlier
# shuffled hold-out split, kept here for reference:
#   X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
tscv = TimeSeriesSplit(n_splits=5)
################### HYPERPARAMETER TESTING ZONE! #######################################################################
# Flip to True to re-run the randomized hyperparameter search. It is off by
# default; an explicit flag is used instead of wrapping the code in a string
# literal so the search code stays syntax-checked.
RUN_HYPERPARAMETER_SEARCH = False

# Candidate values sampled by the randomized search.
param_dist = {
    'max_features': ["sqrt", "log2", 1, 5, 10],
    'n_estimators': [10, 20, 50, 100, 200],
    'max_depth': [None, 1, 5, 10, 100],
    'min_samples_leaf': [1, 5, 10, 100],
    # scikit-learn requires an integer min_samples_split to be >= 2; the
    # previous candidate value 1 is rejected at fit time, so 2 is used instead.
    'min_samples_split': [2, 5, 10, 100],
}

if RUN_HYPERPARAMETER_SEARCH:
    random_search = RandomizedSearchCV(
        estimator=regr,
        param_distributions=param_dist,
        n_iter=20,                         # number of random combinations to try
        cv=tscv,                           # the TimeSeriesSplit defined above
        scoring='neg_mean_squared_error',  # maximizing negated MSE minimizes MSE
        n_jobs=-1,                         # parallelize across all cores
        verbose=2,
        random_state=42
    )
    # Fit
    random_search.fit(X, y)

    # Best parameters & score (the sign is flipped back to report plain MSE)
    print("Best parameters:", random_search.best_params_)
    print("Best CV score (MSE):", -random_search.best_score_)
#####################################################################################################################


# Evaluate the model on every split produced by tscv: fit on the fold's
# training rows, predict its test rows, and print the error metrics.
for fold, (train_index, test_index) in enumerate(tscv.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # The same estimator object is fit again on each fold's training rows.
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    # RMSE is derived from the MSE already computed above rather than
    # scoring the predictions a second time.
    rmse = np.sqrt(mse)

    print(f"fold: {fold + 1}")
    print(f"MSE:  {mse:.4f}")
    print(f"MAE:  {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"R2:   {r2:.4f}")
    print("\n")

fold: 1
MSE:  10725.6766
MAE:  60.9188
RMSE: 103.5648
R2:   0.8629


fold: 2
MSE:  10093.9406
MAE:  58.5595
RMSE: 100.4686
R2:   0.8616


fold: 3
MSE:  10361.7915
MAE:  57.0009
RMSE: 101.7929
R2:   0.8587


fold: 4
MSE:  8433.1516
MAE:  53.9079
RMSE: 91.8322
R2:   0.8812


fold: 5
MSE:  9177.7477
MAE:  55.6888
RMSE: 95.8006
R2:   0.8884


