In [98]:
import pandas as pd
import numpy as np
from simplify_deployment.organism import Organism
from pathlib import Path
from simplify_deployment.data_wrangling import create_target, create_X
from sklearn.linear_model import LinearRegression

In [99]:
# Input parquet files for each split (one "minute_data_*" and one
# "quarter_data_*" file per split).
# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a single configurable data directory.
minute_train_path = Path(
    "/home/thomas/repos/simplify_deployment/data/simplify_1_0/minute_data_train.parquet"
)
quarter_train_path = Path(
    "/home/thomas/repos/simplify_deployment/data/simplify_1_0/quarter_data_train.parquet"
)
minute_test_path = Path(
    "/home/thomas/repos/simplify_deployment/data/simplify_1_0/minute_data_test.parquet"
)
quarter_test_path = Path(
    "/home/thomas/repos/simplify_deployment/data/simplify_1_0/quarter_data_test.parquet"
)

# Train split: build the feature frame first, then derive the target from it.
X_minute_train = create_X(minute_train_path, quarter_train_path)
y_minute_train = create_target(X_minute_train)

# Test split: constructed exactly like the train split, from the test files.
X_minute_test = create_X(minute_test_path, quarter_test_path)
y_minute_test = create_target(X_minute_test)

In [100]:
# YAML files the organism is built from: a genome and a config.
# (File names suggest an SI-only genome with lag 25 -- confirm against the
# genome/config contents.)
genome_yaml = Path(
    "/home/thomas/repos/simplify_deployment/src/simplify_deployment/data_science/genomes/lag_25_best_genome_only_si.yaml"
)
config_yaml = Path(
    "/home/thomas/repos/simplify_deployment/src/simplify_deployment/data_science/config/lag_25.yaml"
)

org_si_only = Organism.from_yaml(path_genome=genome_yaml, path_config=config_yaml)

# The return value is discarded; the score is read back from the organism's
# `fitness` attribute on the next line.
org_si_only.calculate_fitness(y_minute_train, X_minute_train)
print(f"Train fitness is {org_si_only.fitness}")

Train fitness is -121.35110947113462


In [101]:
# Turn the minute-level target/feature frames into the (y, X) pair that is fed
# to the regression model below.
# NOTE(review): presumably this applies the transformations encoded in the
# organism's genome -- confirm in Organism.create_y_X.
y_model_train, X_model_train = org_si_only.create_y_X(y_minute_train, X_minute_train)

# Same transformation applied to the held-out test split.
y_model_test, X_model_test = org_si_only.create_y_X(y_minute_test, X_minute_test)

In [102]:
# Fit an ordinary least-squares model on the training design matrix.
model = LinearRegression()
model.fit(X_model_train, y_model_train)

# Collect observed values, in-sample predictions and their difference
# (residual = y_true - y_pred), indexed like the training target.
predictions_train = pd.DataFrame(
    {"y_true": y_model_train, "y_pred": model.predict(X_model_train)},
    index=y_model_train.index,
).assign(residuals=lambda frame: frame["y_true"] - frame["y_pred"])

# Round every timestamp down to the start of its 15-minute interval.
predictions_train.index = predictions_train.index.floor("15min")

# Disabled export, kept for reference:
# predictions_train.to_parquet("/home/thomas/repos/simplify_deployment/src/simplify_deployment/data_science/data/residuals_train.parquet")
predictions_train

Unnamed: 0_level_0,y_true,y_pred,residuals
utc_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-10-14 22:30:00+00:00,-164.409,42.836815,-207.245815
2023-10-14 22:45:00+00:00,-18.675,-7.833485,-10.841515
2023-10-14 23:00:00+00:00,128.203,-26.860779,155.063779
2023-10-14 23:15:00+00:00,213.423,-1.900903,215.323903
2023-10-14 23:30:00+00:00,-59.949,116.032032,-175.981032
...,...,...,...
2024-02-01 21:30:00+00:00,29.372,65.186245,-35.814245
2024-02-01 21:45:00+00:00,-13.321,26.714592,-40.035592
2024-02-01 22:00:00+00:00,137.455,0.277723,137.177277
2024-02-01 22:15:00+00:00,266.041,28.852995,237.188005


In [103]:
# Score the held-out split with the model fitted on the training data and
# collect observed values, predictions and residuals (y_true - y_pred).
predictions_test = pd.DataFrame(
    {"y_true": y_model_test, "y_pred": model.predict(X_model_test)},
    index=y_model_test.index,
).assign(residuals=lambda frame: frame["y_true"] - frame["y_pred"])

# Round every timestamp down to the start of its 15-minute interval.
predictions_test.index = predictions_test.index.floor("15min")

# Disabled export, kept for reference:
# predictions_test.to_parquet("/home/thomas/repos/simplify_deployment/src/simplify_deployment/data_science/data/residuals_test.parquet")
predictions_test

Unnamed: 0_level_0,y_true,y_pred,residuals
utc_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-02-03 22:30:00+00:00,-35.649,-49.709099,14.060099
2024-02-03 22:45:00+00:00,-2.498,5.614185,-8.112185
2024-02-03 23:00:00+00:00,68.240,-26.580893,94.820893
2024-02-03 23:15:00+00:00,131.478,-9.076675,140.554675
2024-02-03 23:30:00+00:00,145.381,57.536276,87.844724
...,...,...,...
2024-02-29 21:30:00+00:00,-2.630,-87.183120,84.553120
2024-02-29 21:45:00+00:00,-102.969,-0.026005,-102.942995
2024-02-29 22:00:00+00:00,-312.698,6.958920,-319.656920
2024-02-29 22:15:00+00:00,-157.582,-157.500287,-0.081713


In [104]:
# Quarter-hour source files (the same files passed to create_X earlier).
quarter_train_file = "/home/thomas/repos/simplify_deployment/data/simplify_1_0/quarter_data_train.parquet"
quarter_test_file = "/home/thomas/repos/simplify_deployment/data/simplify_1_0/quarter_data_test.parquet"

# Attach the model residuals to the quarter-hour data by matching on the index.
# NOTE(review): this is an inner join (the merge default, spelled out here), so
# quarter-hour rows without a matching residual timestamp are dropped.
qh_train = pd.merge(
    pd.read_parquet(quarter_train_file),
    predictions_train["residuals"],
    how="inner",
    left_index=True,
    right_index=True,
)
qh_test = pd.merge(
    pd.read_parquet(quarter_test_file),
    predictions_test["residuals"],
    how="inner",
    left_index=True,
    right_index=True,
)

In [105]:
# Write the training quarter-hour frame (with its residuals column) to disk.
qh_train_residuals_file = "/home/thomas/repos/simplify_deployment/src/simplify_deployment/data_science/data/qh_data_train_residuals.parquet"
qh_train.to_parquet(qh_train_residuals_file)

In [106]:
# Write the test quarter-hour frame (with its residuals column) to disk.
qh_test_residuals_file = "/home/thomas/repos/simplify_deployment/src/simplify_deployment/data_science/data/qh_data_test_residuals.parquet"
qh_test.to_parquet(qh_test_residuals_file)