In [216]:
import pandas as pd
import numpy as np
from simplify_deployment.organism import Organism
from sklearn.linear_model import LinearRegression
from simplify_deployment.data_wrangling import create_target, create_X
from pathlib import Path
from simplify_deployment.genetic_algorithm import genetic_algorithm
from sklearn.metrics import mean_squared_error

In [217]:
# Directory holding the simplify 1.0 parquet extracts. Defined once so the
# four file paths below share a single, easily changed location.
DATA_DIR = Path("/home/thomas/repos/simplify_deployment/data/simplify_1_0")

# Train data: create_X receives the minute-level and quarter-level files,
# create_target derives the target from the resulting frame.
X_train_si_1 = create_X(
    str(DATA_DIR / "minute_data_train.parquet"),
    str(DATA_DIR / "quarter_data_train.parquet"),
)
y_train_si_1 = create_target(X_train_si_1)

# Test data: same pipeline applied to the held-out files.
X_test_si_1 = create_X(
    str(DATA_DIR / "minute_data_test.parquet"),
    str(DATA_DIR / "quarter_data_test.parquet"),
)
y_test_si_1 = create_target(X_test_si_1)

# Performance of Full model

In [218]:
# Rebuild the "full" organism from its config file and the genome file that
# was stored for the simplify 1.0 train set.
org_full = Organism.from_yaml(
    "/home/thomas/repos/simplify_deployment/src/simplify_deployment/config/lag_25.yaml",
    "/home/thomas/repos/simplify_deployment/src/simplify_deployment/genomes/lag_25_best_genome_for_s1_train.yaml",
)

# Let the same organism produce the (target, features) pair for each split.
y_train_full, X_train_full = org_full.create_y_X(y_train_si_1, X_train_si_1)
y_test_full, X_test_full = org_full.create_y_X(y_test_si_1, X_test_si_1)

In [219]:
# Fit a linear regression on the full organism's training features
# (LinearRegression.fit returns the fitted estimator itself).
model_full = LinearRegression().fit(X_train_full, y_train_full)

# Collect truth and prediction side by side, indexed like the target, so the
# frames can later be joined with the other models' predictions.
predictions_train_full = pd.DataFrame(
    {
        "y_true": y_train_full,
        "y_pred_full": model_full.predict(X_train_full),
    },
    index=y_train_full.index,
)
predictions_test_full = pd.DataFrame(
    {
        "y_true": y_test_full,
        "y_pred_full": model_full.predict(X_test_full),
    },
    index=y_test_full.index,
)


## Train si only model

In [220]:
path_config_si_only = Path(
    "/home/thomas/repos/simplify_deployment/src/simplify_deployment/config/lag_25_si_only.yaml"
)

# A helper organism with an empty genome is created only to ask how many
# variables this config makes available.
org = Organism(path_config=path_config_si_only)
org._init_empty_genome()
n_vars = org.get_n_variables_possible()

# Both the inclusion chance and the mutation chance are set to 1 / n_vars.
# NOTE(review): the search is stochastic and no seed is set in this notebook,
# so the selected genome may differ between runs - confirm whether
# genetic_algorithm offers a way to seed it.
# The best-genome output file is also passed in as one of the extra organisms.
org_si_only = genetic_algorithm(
    path_config=path_config_si_only,
    X=X_train_si_1,
    y=y_train_si_1,
    population_size=200,
    n_generations=25,
    n_untouched=1,
    number_of_deaths=50,
    reproduction_chance_second_over_first=0.85,
    chance_of_random_variable_to_be_in_organism=1 / n_vars,
    mutation_chance=1 / n_vars,
    extra_organisms=[
        "/home/thomas/repos/simplify_deployment/src/simplify_deployment/genomes/lag_25_latest_4_si.yaml",
        "/home/thomas/repos/simplify_deployment/src/simplify_deployment/genomes/lag_25_si_only.yaml",
    ],
    path_best_genome="/home/thomas/repos/simplify_deployment/src/simplify_deployment/genomes/lag_25_si_only.yaml",
)



Generation 0:
Best fitness: -127.6
Best organism used 4 variables
The variables used were:


       variable  lag  selected
0  siCumulative   25      True
1  siCumulative   26      True
2  siCumulative   27      True
3  siCumulative   28      True

Generation 1:
Best fitness: -126.3
Best organism used 3 variables
The variables used were:


       variable  lag  selected
0  siCumulative   25      True
1  siCumulative   26      True
3  siCumulative   28      True

Generation 2:
Best fitness: -123.6
Best organism used 7 variables
The variables used were:


       variable  lag  selected
0  siCumulative   25      True
1  siCumulative   26      True


           variable     filter  filter_period_hours   lag  selected
8043   siCumulative  band_pass                 1.00  2356      True
10905  siCumulative  band_pass                 0.25  2362      True
15260  siCumulative   low_pass                12.00  1005      True
16461  siCumulative   low_pass                12.00  2206      True
2003

#### Predict on train and test set

In [None]:
# Let the SI-only organism produce the (target, features) pair for each split.
y_train_si_only, X_train_si_only = org_si_only.create_y_X(y_train_si_1, X_train_si_1)
y_test_si_only, X_test_si_only = org_si_only.create_y_X(y_test_si_1, X_test_si_1)


In [None]:
# Fit a linear regression on the SI-only features
# (LinearRegression.fit returns the fitted estimator itself).
si_only_model = LinearRegression().fit(X_train_si_only, y_train_si_only)

# Predictions on train set; the residual (truth minus prediction) is kept
# because it becomes the target of the residuals model further down.
predictions_train = pd.DataFrame(
    {
        "y_true": y_train_si_only,
        "y_pred_si_only": si_only_model.predict(X_train_si_only),
    },
    index=y_train_si_only.index,
).assign(
    residuals_si_only=lambda frame: frame["y_true"] - frame["y_pred_si_only"],
)
si_only_rmse_train = np.sqrt(
    mean_squared_error(
        predictions_train["y_true"],
        predictions_train["y_pred_si_only"],
    )
)
print(f"Si only train rmse: {si_only_rmse_train}")

# Predictions on test set, built the same way with the train-fitted model.
predictions_test = pd.DataFrame(
    {
        "y_true": y_test_si_only,
        "y_pred_si_only": si_only_model.predict(X_test_si_only),
    },
    index=y_test_si_only.index,
).assign(
    residuals_si_only=lambda frame: frame["y_true"] - frame["y_pred_si_only"],
)
si_only_rmse_test = np.sqrt(
    mean_squared_error(
        predictions_test["y_true"],
        predictions_test["y_pred_si_only"],
    )
)
print(f"Si only test rmse: {si_only_rmse_test}")

Si only train rmse: 127.0881121601537
Si only test rmse: 117.84995909675463


In [None]:
# Show the SI-only truth, prediction and residual columns for the train split.
predictions_train

Unnamed: 0_level_0,y_true,y_pred_si_only,residuals_si_only
utc_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-10-14 08:14:00+00:00,92.580,58.570312,34.009688
2023-10-14 08:29:00+00:00,269.991,41.949656,228.041344
2023-10-14 08:44:00+00:00,273.360,106.237622,167.122378
2023-10-14 08:59:00+00:00,262.520,126.810072,135.709928
2023-10-14 09:14:00+00:00,187.240,102.030455,85.209545
...,...,...,...
2024-02-01 21:44:00+00:00,29.372,45.155629,-15.783629
2024-02-01 21:59:00+00:00,-13.321,8.901384,-22.222384
2024-02-01 22:14:00+00:00,137.455,-5.442724,142.897724
2024-02-01 22:29:00+00:00,266.041,25.391147,240.649853


## Residuals model

In [None]:
# Add the SI-only residuals to the feature frames as a candidate variable.
#
# The residual series is upsampled to a 1-minute grid and back-filled, i.e.
# every minute receives the residual of the next available timestamp.
# NOTE(review): the residual is derived from y_true, so confirm that the lags
# in lag_25_residuals.yaml keep this column from leaking the target.
#
# Any existing "residuals_si_only" column is dropped first so that re-running
# this cell does not produce "_x"/"_y" suffixed duplicates. The merge uses the
# default inner join on the index.
X_train_si_1 = (
    X_train_si_1
    .drop(columns="residuals_si_only", errors="ignore")
    .merge(
        (
            predictions_train["residuals_si_only"]
            .resample("1min")
            .bfill()
        ),
        left_index=True,
        right_index=True,
    )
)
X_test_si_1 = (
    X_test_si_1
    .drop(columns="residuals_si_only", errors="ignore")
    .merge(
        (
            predictions_test["residuals_si_only"]
            .resample("1min")
            .bfill()
        ),
        left_index=True,
        right_index=True,
    )
)

In [None]:
path_config_residuals = Path(
    "/home/thomas/repos/simplify_deployment/src/simplify_deployment/config/lag_25_residuals.yaml"
)

# A helper organism with an empty genome is created only to ask how many
# variables this config makes available.
org = Organism(path_config=path_config_residuals)
org._init_empty_genome()
n_vars = org.get_n_variables_possible()

# Same search settings as the SI-only run, but the target is now the SI-only
# residual on the train split.
# NOTE(review): the search is stochastic and no seed is set in this notebook,
# so the selected genome may differ between runs.
# The best-genome output file is also passed in as one of the extra organisms.
org_residuals = genetic_algorithm(
    path_config=path_config_residuals,
    X=X_train_si_1,
    y=predictions_train["residuals_si_only"],
    population_size=200,
    n_generations=25,
    n_untouched=1,
    number_of_deaths=50,
    reproduction_chance_second_over_first=0.85,
    chance_of_random_variable_to_be_in_organism=1 / n_vars,
    mutation_chance=1 / n_vars,
    extra_organisms=[
        "/home/thomas/repos/simplify_deployment/src/simplify_deployment/genomes/lag_25_residuals_custom.yaml",
        "/home/thomas/repos/simplify_deployment/src/simplify_deployment/genomes/lag_25_residuals_best.yaml",
    ],
    path_best_genome="/home/thomas/repos/simplify_deployment/src/simplify_deployment/genomes/lag_25_residuals_best.yaml",
)


Generation 0:
Best fitness: -121.1
Best organism used 24 variables
The variables used were:


     variable  lag  selected
2  loaD_ID_MW  -20      True
3  loaD_ID_MW   -5      True
4  loaD_ID_MW   10      True
5  loaD_ID_MW   25      True
6  loaD_ID_MW   40      True
7  loaD_ID_MW   55      True
8  loaD_ID_MW   70      True
9  loaD_ID_MW   85      True
2   dsO_ID_MW  -20      True
3   dsO_ID_MW   -5      True
4   dsO_ID_MW   10      True
5   dsO_ID_MW   25      True
6   dsO_ID_MW   40      True
7   dsO_ID_MW   55      True
8   dsO_ID_MW   70      True
9   dsO_ID_MW   85      True
2    xB_ID_MW  -20      True
3    xB_ID_MW   -5      True
4    xB_ID_MW   10      True
5    xB_ID_MW   25      True
6    xB_ID_MW   40      True
7    xB_ID_MW   55      True
8    xB_ID_MW   70      True
9    xB_ID_MW   85      True

Generation 1:
Best fitness: -117.7
Best organism used 24 variables
The variables used were:


     variable  lag  selected
2  loaD_ID_MW  -20      True
3  loaD_ID_MW   -5      Tru

#### Predict on train and test set




In [None]:
# Let the residuals organism produce the (target, features) pair for each
# split; the target here is the SI-only residual, not the original y.
residuals_target_train = predictions_train["residuals_si_only"]
residuals_target_test = predictions_test["residuals_si_only"]

y_train_residuals, X_train_residuals = org_residuals.create_y_X(residuals_target_train, X_train_si_1)
y_test_residuals, X_test_residuals = org_residuals.create_y_X(residuals_target_test, X_test_si_1)


In [None]:
# Fit a linear regression that predicts the SI-only residual
# (LinearRegression.fit returns the fitted estimator itself).
residuals_model = LinearRegression().fit(X_train_residuals, y_train_residuals)

# Predictions on train set
predictions_train_residuals = pd.DataFrame(
    {
        "y_true_residuals": y_train_residuals,
        "y_pred_residuals": residuals_model.predict(X_train_residuals),
    },
    index=y_train_residuals.index,
)
# Predictions on test set, using the train-fitted model
predictions_test_residuals = pd.DataFrame(
    {
        "y_true_residuals": y_test_residuals,
        "y_pred_residuals": residuals_model.predict(X_test_residuals),
    },
    index=y_test_residuals.index,
)



## Compare

In [None]:
# Join the residual-model and full-model predictions onto the SI-only frames
# and build the two-stage prediction: SI-only prediction + predicted residual.
#
# The columns this cell adds are dropped first, so re-running the cell does
# not create "_x"/"_y" suffixed duplicates (which would make the
# "y_pred_residuals" lookup below fail). Both merges use the default inner
# join on the index, so only timestamps present in all three frames remain.
_derived_columns = ["y_pred_residuals", "y_pred_full", "y_pred_final"]

# Train
predictions_train = (
    predictions_train
    .drop(columns=_derived_columns, errors="ignore")
    .merge(
        predictions_train_residuals.drop(columns="y_true_residuals"),
        left_index=True,
        right_index=True,
    )
    .merge(
        predictions_train_full.drop(columns="y_true"),
        left_index=True,
        right_index=True,
    )
)
predictions_train["y_pred_final"] = predictions_train["y_pred_si_only"] + predictions_train["y_pred_residuals"]

# Test
predictions_test = (
    predictions_test
    .drop(columns=_derived_columns, errors="ignore")
    .merge(
        predictions_test_residuals.drop(columns="y_true_residuals"),
        left_index=True,
        right_index=True,
    )
    .merge(
        predictions_test_full.drop(columns="y_true"),
        left_index=True,
        right_index=True,
    )
)
predictions_test["y_pred_final"] = predictions_test["y_pred_si_only"] + predictions_test["y_pred_residuals"]


Rmse on train set for combined prediction: 118.39840068776357
Rmse on test set for combined prediction: 110.14823730660534


In [None]:
def _rmse(frame, prediction_column):
    """Return the root mean squared error of `prediction_column` against the frame's "y_true" column."""
    return np.sqrt(
        mean_squared_error(frame["y_true"], frame[prediction_column])
    )


# Train split: single full model versus the two-stage (SI-only + residual) prediction.
rmse_train_final = _rmse(predictions_train, "y_pred_final")
rmse_train_full = _rmse(predictions_train, "y_pred_full")
print(f"Rmse on train set for full prediction at once: {rmse_train_full}")
print(f"Rmse on train set for combined prediction: {rmse_train_final}")

# Test split: same comparison on the held-out data.
rmse_test_final = _rmse(predictions_test, "y_pred_final")
rmse_test_full = _rmse(predictions_test, "y_pred_full")
print(f"Rmse on test set for full prediction at once: {rmse_test_full}")
print(f"Rmse on test set for combined prediction: {rmse_test_final}")

Rmse on train set for full prediction at once: 113.9630740716065
Rmse on train set for combined prediction: 118.39840068776357
Rmse on test set for full prediction at once: 107.78298484160906
Rmse on test set for combined prediction: 110.14823730660534


In [None]:
# Show the train frame after the residual-model, full-model and combined
# prediction columns have been added.
predictions_train

Unnamed: 0_level_0,y_true,y_pred_si_only,residuals_si_only,y_pred_residuals,y_pred_full,y_pred_final
utc_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-10-14 21:29:00+00:00,248.163,133.834052,114.328948,-4.668636,156.165464,129.165417
2023-10-14 21:44:00+00:00,156.797,152.399148,4.397852,2.206407,155.729622,154.605555
2023-10-14 21:59:00+00:00,207.210,11.773905,195.436095,31.576409,116.511405,43.350314
2023-10-14 22:14:00+00:00,-45.593,105.667835,-151.260835,-17.730054,118.986229,87.937780
2023-10-14 22:29:00+00:00,21.326,-8.143989,29.469989,17.305270,59.356919,9.161281
...,...,...,...,...,...,...
2024-02-01 21:14:00+00:00,126.626,72.656349,53.969651,-15.307293,50.905075,57.349055
2024-02-01 21:29:00+00:00,50.675,75.614752,-24.939752,7.125436,56.810191,82.740188
2024-02-01 21:44:00+00:00,29.372,45.155629,-15.783629,-23.397242,26.293741,21.758387
2024-02-01 21:59:00+00:00,-13.321,8.901384,-22.222384,-25.082595,-28.896084,-16.181211


In [None]:
# NOTE(review): this cell repeats the previous one and shows the same train
# frame; presumably predictions_test was meant to be displayed here - confirm
# or remove the duplicate.
predictions_train

Unnamed: 0_level_0,y_true,y_pred_si_only,residuals_si_only,y_pred_residuals,y_pred_full,y_pred_final
utc_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-10-14 21:29:00+00:00,248.163,133.834052,114.328948,-4.668636,156.165464,129.165417
2023-10-14 21:44:00+00:00,156.797,152.399148,4.397852,2.206407,155.729622,154.605555
2023-10-14 21:59:00+00:00,207.210,11.773905,195.436095,31.576409,116.511405,43.350314
2023-10-14 22:14:00+00:00,-45.593,105.667835,-151.260835,-17.730054,118.986229,87.937780
2023-10-14 22:29:00+00:00,21.326,-8.143989,29.469989,17.305270,59.356919,9.161281
...,...,...,...,...,...,...
2024-02-01 21:14:00+00:00,126.626,72.656349,53.969651,-15.307293,50.905075,57.349055
2024-02-01 21:29:00+00:00,50.675,75.614752,-24.939752,7.125436,56.810191,82.740188
2024-02-01 21:44:00+00:00,29.372,45.155629,-15.783629,-23.397242,26.293741,21.758387
2024-02-01 21:59:00+00:00,-13.321,8.901384,-22.222384,-25.082595,-28.896084,-16.181211
