In [129]:

import logging
import os
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.linear_model import LinearRegression

from simplify_deployment.data_wrangling import create_target, create_X
from simplify_deployment.genetic_algorithm import genetic_algorithm
from simplify_deployment.organism import Organism

In [130]:
# Create logger
# Obtain the logger through logging.getLogger so it is registered with the
# logging manager; a directly instantiated logging.Logger is not, and
# getLogger(__name__) elsewhere would return a different object.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# A directly instantiated Logger has no parent, so nothing was ever forwarded
# to the root logger; keep that isolation to avoid duplicated output.
logger.propagate = False

# Create handler
# getLogger returns the same object every time this cell is re-run, so reuse
# the handler that is already attached instead of stacking a new one.
if logger.handlers:
    handler = logger.handlers[0]
else:
    handler = logging.StreamHandler()
    # Add handler to logger
    logger.addHandler(handler)
handler.setLevel(logging.INFO)

In [131]:
# Paths
# Root of the local checkout. The default is the location used when this
# notebook was written; set the SIMPLIFY_DEPLOYMENT_ROOT environment variable
# to run the notebook against a checkout stored somewhere else.
REPO_ROOT = Path(
    os.environ.get(
        "SIMPLIFY_DEPLOYMENT_ROOT",
        "/home/thomas/repos/simplify_deployment",
    )
)
PACKAGE_DIR = REPO_ROOT / "src" / "simplify_deployment"
DATA_DIR = REPO_ROOT / "data" / "simplify_1_0"
GENOME_DIR = PACKAGE_DIR / "genomes"

# Organism configuration.
path_config = PACKAGE_DIR / "config" / "lag_25.yaml"

# Train / test parquet files (minute and quarter data).
path_train_minute = DATA_DIR / "minute_data_train.parquet"
path_train_quarter = DATA_DIR / "quarter_data_train.parquet"
path_test_minute = DATA_DIR / "minute_data_test.parquet"
path_test_quarter = DATA_DIR / "quarter_data_test.parquet"

# Genomes of the two organisms compared in this notebook.
path_s1_genome = GENOME_DIR / "lag_25_simplify_1_0.yaml"
path_best_genome = GENOME_DIR / "lag_25_best_genome_for_s1_train.yaml"

In [132]:
# Train split: build X from the minute and quarter parquet files, then derive
# the target from that X.
X_minute_train = create_X(path_train_minute, path_train_quarter)
y_minute_train = create_target(X_minute_train)

# Test split: same two steps on the test files.
X_minute_test = create_X(path_test_minute, path_test_quarter)
y_minute_test = create_target(X_minute_test)

# Both organisms share one config and differ only in the genome file.
org_s1 = Organism.from_yaml(path_config, path_s1_genome)
org_best = Organism.from_yaml(path_config, path_best_genome)



In [133]:
# Score both organisms on the train data and collect one fitness value per
# repetition. The loop currently runs a single time.
list_s1, list_best = [], []
for _ in range(1):
    org_best.calculate_fitness(y_minute_train, X_minute_train)
    org_s1.calculate_fitness(y_minute_train, X_minute_train)
    list_best.append(org_best.fitness)
    list_s1.append(org_s1.fitness)

# NOTE(review): the displayed fitness values are negative although the columns
# are named *_rmse - presumably fitness is a negated error; confirm against
# Organism.calculate_fitness.
train_df = pd.DataFrame({"s1_rmse": list_s1, "best_rmse": list_best})
train_df

Unnamed: 0,s1_rmse,best_rmse
0,-122.644023,-115.275712


In [134]:
# Overlaid histogram of the collected fitness values, one colour per column of
# train_df (melt() turns the columns into "variable" / "value" pairs).
fig = px.histogram(data_frame=train_df.melt(), x="value", color="variable", barmode="overlay")
fig.show()

In [135]:
# Each organism builds its own (y, X) pair from the shared minute-level data,
# once for the train split and once for the test split.

# S1 organism
y_train_s1, X_train_s1 = org_s1.create_y_X(y_minute_train, X_minute_train)
y_test_s1, X_test_s1 = org_s1.create_y_X(y_minute_test, X_minute_test)

# Best organism
y_train_best, X_train_best = org_best.create_y_X(y_minute_train, X_minute_train)
y_test_best, X_test_best = org_best.create_y_X(y_minute_test, X_minute_test)

In [153]:
# Fit a plain linear regression on the s1 organism's train features
# (fit() returns the estimator itself, so it can be chained).
model_s1 = LinearRegression().fit(X_train_s1, y_train_s1)

# Pair the model output with the observed target, per split.
predictions_s1_train = pd.DataFrame(
    {"prediction_s1": model_s1.predict(X_train_s1), "real": y_train_s1}
)
predictions_s1_test = pd.DataFrame(
    {"prediction_s1": model_s1.predict(X_test_s1), "real": y_test_s1}
)




In [154]:
# Fit a plain linear regression on the best organism's train features
# (fit() returns the estimator itself, so it can be chained).
model_best = LinearRegression().fit(X_train_best, y_train_best)

# Pair the model output with the observed target, per split.
predictions_best_train = pd.DataFrame(
    {"prediction_best": model_best.predict(X_train_best), "real": y_train_best}
)
predictions_best_test = pd.DataFrame(
    {"prediction_best": model_best.predict(X_test_best), "real": y_test_best}
)


In [156]:
# Put both models' predictions side by side, aligned on the index. "real" is
# dropped from the s1 frame so the merged frame carries it only once (taken
# from the best-organism frame).
all_predictions_train = pd.merge(
    predictions_best_train,
    predictions_s1_train.drop(columns="real"),
    left_index=True,
    right_index=True,
)
all_predictions_test = pd.merge(
    predictions_best_test,
    predictions_s1_test.drop(columns="real"),
    left_index=True,
    right_index=True,
)
all_predictions_train

Unnamed: 0_level_0,prediction_best,real,prediction_s1
utc_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-10-14 21:29:00+00:00,156.165464,248.163,96.186189
2023-10-14 21:44:00+00:00,155.729622,156.797,150.209404
2023-10-14 21:59:00+00:00,116.511405,207.210,38.618272
2023-10-14 22:14:00+00:00,118.986229,-45.593,97.418210
2023-10-14 22:29:00+00:00,59.356919,21.326,0.076356
...,...,...,...
2024-02-01 21:14:00+00:00,50.905075,126.626,23.750165
2024-02-01 21:29:00+00:00,56.810191,50.675,49.412623
2024-02-01 21:44:00+00:00,26.293741,29.372,16.050823
2024-02-01 21:59:00+00:00,-28.896084,-13.321,-47.146607


In [157]:
# Display the merged test-split predictions (best organism, real value, s1).
all_predictions_test

Unnamed: 0_level_0,prediction_best,real,prediction_s1
utc_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-02-03 21:29:00+00:00,33.031160,-49.952,43.865409
2024-02-03 21:44:00+00:00,-35.903649,68.454,-48.542346
2024-02-03 21:59:00+00:00,7.579558,107.928,9.662721
2024-02-03 22:14:00+00:00,43.572445,-77.840,43.788059
2024-02-03 22:29:00+00:00,39.941211,-81.064,39.001145
...,...,...,...
2024-02-29 21:14:00+00:00,-34.836811,-99.608,-44.532238
2024-02-29 21:29:00+00:00,-11.524790,-88.399,-14.790303
2024-02-29 21:44:00+00:00,-98.338700,-2.630,-85.441260
2024-02-29 21:59:00+00:00,-20.111800,-102.969,11.013908


In [160]:
# Reshape to long format: "real" stays as its own column, the two prediction
# columns become "variable" / "value" pairs. ignore_index=False keeps the
# original index on every row.
predictions_train_molten = all_predictions_train.melt(
    id_vars="real",
    ignore_index=False,
)
predictions_test_molten = all_predictions_test.melt(
    id_vars="real",
    ignore_index=False,
)
predictions_train_molten

Unnamed: 0_level_0,real,variable,value
utc_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-10-14 21:29:00+00:00,248.163,prediction_best,156.165464
2023-10-14 21:44:00+00:00,156.797,prediction_best,155.729622
2023-10-14 21:59:00+00:00,207.210,prediction_best,116.511405
2023-10-14 22:14:00+00:00,-45.593,prediction_best,118.986229
2023-10-14 22:29:00+00:00,21.326,prediction_best,59.356919
...,...,...,...
2024-02-01 21:14:00+00:00,126.626,prediction_s1,23.750165
2024-02-01 21:29:00+00:00,50.675,prediction_s1,49.412623
2024-02-01 21:44:00+00:00,29.372,prediction_s1,16.050823
2024-02-01 21:59:00+00:00,-13.321,prediction_s1,-47.146607


In [161]:
def calculate_rmse(df):
    """Return the root-mean-square error between two columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "real" column (observed values) and a "value" column
        (predicted values).

    Returns
    -------
    float
        Square root of the mean squared difference ``real - value``.
    """
    residuals = df["real"] - df["value"]
    mean_squared_error = np.mean(residuals**2)
    return np.sqrt(mean_squared_error)
# Train RMSE per prediction column. calculate_rmse only reads "real" and
# "value", so the grouping column is left out of the applied frames;
# include_groups=True is deprecated in pandas 2.2 and rejected in pandas 3.
predictions_train_molten.groupby("variable").apply(calculate_rmse, include_groups=False)





variable
prediction_best    113.963074
prediction_s1      117.654182
dtype: float64

In [162]:
# Test RMSE per prediction column. calculate_rmse only reads "real" and
# "value", so the grouping column is left out of the applied frames;
# include_groups=True is deprecated in pandas 2.2 and rejected in pandas 3.
predictions_test_molten.groupby("variable").apply(calculate_rmse, include_groups=False)





variable
prediction_best    107.782985
prediction_s1      109.705427
dtype: float64