In [10]:
import pandas as pd
from simplify_deployment.organism import Organism
import plotly.express as px
from sklearn.linear_model import LinearRegression
from pathlib import Path
from tempfile import TemporaryDirectory
from simplify_deployment.data_wrangling import create_target, create_X
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
from sklearn.metrics import mean_squared_error

In [11]:
def create_datasets(
    path_to_save_folds: Path,
    path_minute_data: Path = Path("/home/thomas/repos/simplify_deployment/data/simplify_1_0/s1_minute_data.parquet"),
    path_qh_data: Path = Path("/home/thomas/repos/simplify_deployment/data/simplify_1_0/s1_quarter_data.parquet"),
    n_splits = 12,
    max_train_size = 16 * 7 * 24 * 60,
    test_size = 4 * 7 * 24 * 60,
):
    """Write time-series cross-validation folds to parquet files.

    The feature frame returned by ``create_X`` is resampled to a regular
    one-minute grid (gaps forward-filled) and split with
    ``sklearn.model_selection.TimeSeriesSplit``. For every fold four files are
    written into ``path_to_save_folds``::

        X_train_fold_{fold}.parquet   y_train_fold_{fold}.parquet
        X_test_fold_{fold}.parquet    y_test_fold_{fold}.parquet

    The target of each subset is derived from that subset's own feature slice
    via ``create_target``.

    Parameters
    ----------
    path_to_save_folds:
        Folder that receives the fold files. A ``str`` is accepted as well;
        the folder is created when it does not exist yet.
    path_minute_data, path_qh_data:
        Input parquet files, forwarded unchanged to ``create_X``.
    n_splits:
        Number of folds produced by ``TimeSeriesSplit``.
    max_train_size, test_size:
        Fold sizes in rows. Because the frame is on a one-minute grid, one row
        is one minute (defaults: 16 weeks of training, 4 weeks of test).

    Returns
    -------
    None
        The function is used for its side effect of writing the files.
    """
    # Accept plain strings too and make sure the destination exists, so the
    # first ``to_parquet`` call cannot fail on a missing folder.
    path_to_save_folds = Path(path_to_save_folds)
    path_to_save_folds.mkdir(parents=True, exist_ok=True)

    X_minute = create_X(
        path_minute_data=path_minute_data,
        path_qh_data=path_qh_data,
    )

    # Regular one-minute grid; missing minutes take the last known values.
    X_minute = X_minute.asfreq("1min").ffill()

    tscv = TimeSeriesSplit(
        n_splits=n_splits,
        max_train_size=max_train_size,
        gap=0,
        test_size=test_size,
    )

    for fold, (train_index, test_index) in enumerate(tscv.split(X_minute)):
        # Train and test are handled identically; only the row selection and
        # the file-name tag differ. Write order matches the original code:
        # X_train, y_train, X_test, y_test.
        for subset, row_index in (("train", train_index), ("test", test_index)):
            X_subset = X_minute.iloc[row_index]
            X_subset.to_parquet(
                path_to_save_folds / f"X_{subset}_fold_{fold}.parquet",
            )

            # NOTE(review): the target is computed from the sliced frame, so
            # any look-ahead/look-back inside create_target stops at the fold
            # boundary - confirm this is intended.
            y_subset = create_target(
                X_subset,
            )
            y_subset.to_frame().to_parquet(
                path_to_save_folds / f"y_{subset}_fold_{fold}.parquet",
            )


In [12]:
# Sweep over the training-window length and record the pooled out-of-sample
# RMSE of "one linear model per hour of day" for each window length.

# One row of the minute-resolution data is one minute.
MINUTES_PER_WEEK = 7 * 24 * 60
# Single source of truth for the number of folds: used both when the folds are
# written and when they are read back, so the two cannot drift apart.
# NOTE(review): a best-genome file must exist on disk for every fold index.
N_SPLITS = 12

rmse_dict = {
    "train_size": [],
    "rmse": [],
}
# range() excludes its stop value, so the sweep covers 1..15 weeks; the
# 16-week window is not evaluated. NOTE(review): confirm this is intended.
for train_size in range(MINUTES_PER_WEEK, 16 * MINUTES_PER_WEEK, MINUTES_PER_WEEK):
    # Fold files are only needed while this train size is evaluated, so they
    # live in a temporary folder that is removed afterwards.
    with TemporaryDirectory() as folder:
        folder = Path(folder)
        create_datasets(
            path_to_save_folds=folder,
            n_splits=N_SPLITS,
            max_train_size=train_size,
        )
        # Reset per train size: predictions of all folds and hours are pooled
        # for one RMSE value. Later cells read this list from the kernel.
        prediction_list = []
        for fold in range(N_SPLITS):
            # Raw
            X_train_raw = pd.read_parquet(folder/f"X_train_fold_{fold}.parquet")
            X_test_raw = pd.read_parquet(folder/f"X_test_fold_{fold}.parquet")
            y_train_raw = pd.read_parquet(folder/f"y_train_fold_{fold}.parquet")
            y_test_raw = pd.read_parquet(folder/f"y_test_fold_{fold}.parquet")

            # Data for model: every fold uses its own stored best genome.
            org = Organism.from_yaml(
                path_config="/home/thomas/repos/simplify_deployment/data/data_science/50_gen_s1/lag_25_s1_config.yaml",
                path_genome=f"/home/thomas/repos/simplify_deployment/data/data_science/50_gen_s1/best_genome/lag_25_s1_50_gen_fold_{fold}_best_genome.yaml"
            )
            y_train, X_train = org.create_y_X(
                y_train_raw,
                X_train_raw,
            )
            y_test, X_test = org.create_y_X(
                y_test_raw,
                X_test_raw,
            )

            # Fit and evaluate a separate linear model for each hour of day.
            for hour in range(24):
                X_train_hour_n = X_train.loc[X_train.index.hour == hour,:]
                y_train_hour_n = y_train.loc[y_train.index.hour == hour]

                X_test_hour_n = X_test.loc[X_test.index.hour == hour,:]
                y_test_hour_n = y_test.loc[y_test.index.hour == hour]

                model = LinearRegression()
                model.fit(X_train_hour_n, y_train_hour_n)
                prediction_list.append(
                    pd.DataFrame(
                            {
                                "y_pred_one_model_per_hour": model.predict(X_test_hour_n),
                                "y_true": y_test_hour_n.values,
                            },
                            index = X_test_hour_n.index,
                    )
                )
                print(f"Fold {fold} hour {hour} train size {train_size} done.")
        # Pool all folds and hours, then compute one RMSE for this train size.
        prediction = pd.concat(
            prediction_list,
            axis = 0,
        )
        rmse = np.sqrt(mean_squared_error(y_true=prediction["y_true"], y_pred = prediction["y_pred_one_model_per_hour"]))
        print(f"For train size {train_size} the rmse is {rmse} MW.")
        rmse_dict["train_size"].append(train_size)
        rmse_dict["rmse"].append(rmse)

rmse_df = pd.DataFrame(rmse_dict)
print(rmse_df)

Fold 0 hour 0 train size 10080 done.
Fold 0 hour 1 train size 10080 done.
Fold 0 hour 2 train size 10080 done.
Fold 0 hour 3 train size 10080 done.
Fold 0 hour 4 train size 10080 done.
Fold 0 hour 5 train size 10080 done.
Fold 0 hour 6 train size 10080 done.
Fold 0 hour 7 train size 10080 done.
Fold 0 hour 8 train size 10080 done.
Fold 0 hour 9 train size 10080 done.
Fold 0 hour 10 train size 10080 done.
Fold 0 hour 11 train size 10080 done.
Fold 0 hour 12 train size 10080 done.
Fold 0 hour 13 train size 10080 done.
Fold 0 hour 14 train size 10080 done.
Fold 0 hour 15 train size 10080 done.
Fold 0 hour 16 train size 10080 done.
Fold 0 hour 17 train size 10080 done.
Fold 0 hour 18 train size 10080 done.
Fold 0 hour 19 train size 10080 done.
Fold 0 hour 20 train size 10080 done.
Fold 0 hour 21 train size 10080 done.
Fold 0 hour 22 train size 10080 done.
Fold 0 hour 23 train size 10080 done.
Fold 1 hour 0 train size 10080 done.
Fold 1 hour 1 train size 10080 done.
Fold 1 hour 2 train size

  rmse = np.sqrt(np.mean((prediction["y_true"] - prediction["y_pred_one_model_per_hour"]**2)))


Fold 0 hour 0 train size 20160 done.
Fold 0 hour 1 train size 20160 done.
Fold 0 hour 2 train size 20160 done.
Fold 0 hour 3 train size 20160 done.
Fold 0 hour 4 train size 20160 done.
Fold 0 hour 5 train size 20160 done.
Fold 0 hour 6 train size 20160 done.
Fold 0 hour 7 train size 20160 done.
Fold 0 hour 8 train size 20160 done.
Fold 0 hour 9 train size 20160 done.
Fold 0 hour 10 train size 20160 done.
Fold 0 hour 11 train size 20160 done.
Fold 0 hour 12 train size 20160 done.
Fold 0 hour 13 train size 20160 done.
Fold 0 hour 14 train size 20160 done.
Fold 0 hour 15 train size 20160 done.
Fold 0 hour 16 train size 20160 done.
Fold 0 hour 17 train size 20160 done.
Fold 0 hour 18 train size 20160 done.
Fold 0 hour 19 train size 20160 done.
Fold 0 hour 20 train size 20160 done.
Fold 0 hour 21 train size 20160 done.
Fold 0 hour 22 train size 20160 done.
Fold 0 hour 23 train size 20160 done.
Fold 1 hour 0 train size 20160 done.
Fold 1 hour 1 train size 20160 done.
Fold 1 hour 2 train size

KeyboardInterrupt: 

In [11]:
# Stack the per-(fold, hour) prediction frames into one frame ordered by
# timestamp and persist it.
# NOTE(review): depends on `prediction_list` left in the kernel by the sweep
# cell, which rebuilds that list for every train size - so only the most
# recently processed train size ends up in this file.
prediction = pd.concat(prediction_list, axis=0).sort_index(ascending=True)
prediction.to_parquet("/home/thomas/repos/simplify_deployment/data/data_science/50_gen_s1/one_model_per_hour_24_times_more_train.parquet")

In [None]:
# Display the pooled predictions (columns "y_pred_one_model_per_hour" and "y_true").
prediction

In [None]:
# Display the model features returned by org.create_y_X for the fold that the
# sweep loop processed last (value left in the kernel by that cell).
X_train

In [None]:
# Display the model target returned by org.create_y_X for the fold that the
# sweep loop processed last (value left in the kernel by that cell).
y_train

In [None]:
# Load and display a stored prediction file for comparison. Note this is
# "one_model_per_hour.parquet", not the "..._24_times_more_train.parquet" file
# written earlier in this notebook - presumably the output of an earlier run.
pd.read_parquet("/home/thomas/repos/simplify_deployment/data/data_science/50_gen_s1/one_model_per_hour.parquet")

In [None]:
# Line plot of the "hour_sin" and "hour_cos" feature columns against the index
# of X_train (the frame left in the kernel by the sweep cell).
# NOTE(review): presumably the cyclical encoding of the hour of day - confirm.
fig = px.line(
    X_train,
    x = X_train.index,
    y = ["hour_sin", "hour_cos"],
)
fig.show()