In [19]:
import pandas as pd
import numpy as np
from sklearn.inspection import permutation_importance
from simplify_deployment.organism import Organism
from pathlib import Path
from sklearn.linear_model import LinearRegression
import plotly.express as px
from functools import reduce
from sklearn.metrics import mean_squared_error

In [20]:
# Fold number to analyse. It is interpolated into the best-genome file name
# and into the X/y train/test parquet file names loaded in the cells below.
fold = 1

In [21]:
# Build the organism from the run configuration plus the genome file stored
# under best_genome/ for the selected fold.
config_path = Path("/home/thomas/repos/simplify_deployment/data/data_science/50_gen_s1/lag_25_s1_config.yaml")
best_genome_path = Path(f"/home/thomas/repos/simplify_deployment/data/data_science/50_gen_s1/best_genome/lag_25_s1_50_gen_fold_{fold}_best_genome.yaml")
org = Organism.from_yaml(path_config=config_path, path_genome=best_genome_path)

In [22]:
# Load the feature (X) and target (y) frames of the selected fold.
# The directory and the file-name pattern are written once, so the four reads
# cannot drift apart when the data location or naming changes.
folds_dir = Path("/home/thomas/repos/simplify_deployment/data/data_science/50_gen_s1/folds")
X_train, X_test, y_train, y_test = (
    pd.read_parquet(folds_dir / f"{split_name}_fold_{fold}.parquet")
    for split_name in ("X_train", "X_test", "y_train", "y_test")
)

In [23]:
# Let the organism derive the regression target and feature matrix from the
# raw training frames.
y_train_model, X_train_model = org.create_y_X(y_train, X_train)

# Same transformation for the test frames.
# NOTE(review): train and test are passed to create_y_X separately; if the
# intent was to join them first so that no test rows are lost, that step is
# not present here — confirm.
y_test_model, X_test_model = org.create_y_X(y_test, X_test)

# Label the test target "y_true"; later cells look it up under that name.
y_test_model = y_test_model.rename("y_true")

In [24]:
# Fit a linear regression on the organism's training features.
model = LinearRegression()
model.fit(
    X_train_model,
    y_train_model,
)

# Permutation importance shuffles feature columns at random. Seeding it makes
# the ranking, and every result derived from it further down, reproducible
# across kernel restarts.
permutation_settings = {
    "scoring": "neg_root_mean_squared_error",
    "n_repeats": 100,
    "random_state": 0,
}

# Importance measured on the data the model was fitted on.
importances_train = permutation_importance(
    estimator=model,
    X=X_train_model,
    y=y_train_model,
    **permutation_settings,
)
# Importance measured on the test fold with the same fitted model.
importances_test = permutation_importance(
    estimator=model,
    X=X_test_model,
    y=y_test_model,
    **permutation_settings,
)



In [25]:
# Mean permutation importance per feature on the training data, ordered from
# most to least important.
train_importance_columns = {
    "variable": X_train_model.columns,
    "importance": importances_train["importances_mean"],
}
importance_df_train = pd.DataFrame(train_importance_columns).sort_values(
    by="importance",
    ascending=False,
)
importance_df_train

Unnamed: 0,variable,importance
18,dsO_ID_MW_lag_10,5.169124e+02
19,dsO_ID_MW_lag_25,4.177875e+02
0,siCumulative_lag_25,3.093869e+02
1,siCumulative_lag_26,2.004231e+02
24,xB_ID_MW_lag_25,1.802449e+02
...,...,...
35,siCumulative_band_pass_1_0_h_lag_1952,5.961457e-05
75,siCumulative_second_derivative_lag_213,5.862118e-05
70,siCumulative_first_derivative_lag_1520,3.319611e-05
80,siCumulative_second_derivative_lag_1597,3.185804e-06


In [26]:
# Bar chart of the training-set permutation importances in ranked order.
# The title tells this figure apart from the otherwise identical test-set one.
fig = px.bar(
    importance_df_train,
    x="variable",
    y="importance",
    title="Permutation importance per variable (train)",
)
fig.show()

In [27]:
# Mean permutation importance per feature on the test data, ordered from
# most to least important.
test_importance_columns = {
    "variable": X_test_model.columns,
    "importance": importances_test["importances_mean"],
}
importance_df_test = pd.DataFrame(test_importance_columns).sort_values(
    by="importance",
    ascending=False,
)
importance_df_test

Unnamed: 0,variable,importance
18,dsO_ID_MW_lag_10,575.953494
19,dsO_ID_MW_lag_25,475.406452
0,siCumulative_lag_25,245.516914
1,siCumulative_lag_26,162.674335
24,xB_ID_MW_lag_25,156.977158
...,...,...
69,siCumulative_first_derivative_lag_45,-0.027315
63,siCumulative_high_pass_1_0_h_lag_825,-0.027566
29,siCumulative_band_pass_24_0_h_lag_2691,-0.034093
8,siCumulative_lag_693,-0.039883


In [28]:
# Bar chart of the test-set permutation importances in ranked order.
# The title tells this figure apart from the otherwise identical train-set one.
fig = px.bar(
    importance_df_test,
    x="variable",
    y="importance",
    title="Permutation importance per variable (test)",
)
fig.show()

In [29]:
# Collects one named prediction Series per fitted model. The following cells
# append to this list, so re-run this cell before re-running them; otherwise
# the same predictions are appended a second time.
predictions_list = []

In [30]:
# Reference model: same configuration file, but with the genome stored in
# lag_25_simplify_1_0.yaml (labelled "s1" in the results further down).
s1_config_path = Path("/home/thomas/repos/simplify_deployment/data/data_science/50_gen_s1/lag_25_s1_config.yaml")
s1_genome_path = Path("/home/thomas/repos/simplify_deployment/data/data_science/50_gen_s1/lag_25_simplify_1_0.yaml")
org_s1 = Organism.from_yaml(path_config=s1_config_path, path_genome=s1_genome_path)

# Target and feature matrices as defined by the S1 organism.
y_train_s1, X_train_s1 = org_s1.create_y_X(y_train, X_train)
y_test_s1, X_test_s1 = org_s1.create_y_X(y_test, X_test)

# Fit on the S1 training features and keep the test predictions, indexed like
# the S1 test matrix, alongside the other models' predictions.
model = LinearRegression()
model.fit(X_train_s1, y_train_s1)
s1_predictions = pd.Series(
    model.predict(X_test_s1),
    index=X_test_s1.index,
    name="y_pred_s1_variables",
)
predictions_list.append(s1_predictions)


In [31]:
# For every n from 1 to the number of ranked variables, refit the linear model
# on the n variables with the largest training-set permutation importance and
# store its test predictions as "y_pred_<n>".
model = LinearRegression()
n_ranked_variables = importance_df_train.shape[0]
for n_variables in range(1, n_ranked_variables + 1):
    # Named `selected_variables` rather than `vars` so that the builtin
    # `vars()` is not shadowed for the rest of the kernel session.
    # keep="all" retains ties, so tied importances can select more than
    # n_variables names.
    selected_variables = importance_df_train.nlargest(
        n=n_variables,
        columns="importance",
        keep="all",
    )["variable"]
    X_train_n_vars = X_train_model.loc[:, selected_variables]
    X_test_n_vars = X_test_model.loc[:, selected_variables]
    model.fit(
        X_train_n_vars,
        y_train_model,
    )
    predictions_list.append(
        pd.Series(
            model.predict(X_test_n_vars),
            index=X_test_n_vars.index,
            name=f"y_pred_{n_variables}",
        )
    )
    print(f"{n_variables} variables predicted.")


1 variables predicted.
2 variables predicted.
3 variables predicted.
4 variables predicted.
5 variables predicted.
6 variables predicted.
7 variables predicted.
8 variables predicted.
9 variables predicted.
10 variables predicted.
11 variables predicted.
12 variables predicted.
13 variables predicted.
14 variables predicted.
15 variables predicted.
16 variables predicted.
17 variables predicted.
18 variables predicted.
19 variables predicted.
20 variables predicted.
21 variables predicted.
22 variables predicted.
23 variables predicted.
24 variables predicted.
25 variables predicted.
26 variables predicted.
27 variables predicted.
28 variables predicted.
29 variables predicted.
30 variables predicted.
31 variables predicted.
32 variables predicted.
33 variables predicted.
34 variables predicted.
35 variables predicted.
36 variables predicted.
37 variables predicted.
38 variables predicted.
39 variables predicted.
40 variables predicted.
41 variables predicted.
42 variables predicted.
4

In [32]:
# Put the true test target and every prediction Series side by side, keeping
# only index labels present in all of them (inner join on the index).
# A single concat aligns everything in one pass instead of performing one
# pairwise index merge per prediction column.
pred_df = pd.concat(
    [y_test_model, *predictions_list],
    axis=1,
    join="inner",
)
pred_df

Unnamed: 0,y_true,y_pred_s1_variables,y_pred_1,y_pred_2,y_pred_3,y_pred_4,y_pred_5,y_pred_6,y_pred_7,y_pred_8,...,y_pred_75,y_pred_76,y_pred_77,y_pred_78,y_pred_79,y_pred_80,y_pred_81,y_pred_82,y_pred_83,y_pred_84
2022-12-29 22:44:00+00:00,22.845,35.952602,21.256924,14.631152,23.101569,25.649102,32.909487,29.890681,73.584561,71.144391,...,51.142661,51.216642,51.484581,51.539279,51.493582,51.429965,51.234018,51.254407,51.267510,51.261902
2022-12-29 22:59:00+00:00,174.852,86.711506,22.740301,13.554301,72.057874,63.829260,71.037318,65.973604,110.782125,108.704792,...,59.167964,59.196103,59.004268,59.003211,59.051925,59.103952,59.088464,59.098873,59.110039,59.103217
2022-12-29 23:14:00+00:00,169.275,180.102850,27.392502,-4.552078,36.485116,82.192847,89.035166,165.322527,182.304074,179.097184,...,153.078113,153.140797,153.264038,153.201076,153.279822,153.361371,153.245721,153.231115,153.238227,153.230094
2022-12-29 23:29:00+00:00,172.610,62.870989,28.852917,20.233795,54.360805,63.594883,65.685552,66.273240,86.290718,83.270836,...,101.512594,101.449259,101.340821,101.363550,101.340492,101.295258,101.201979,101.167563,101.179786,101.170872
2022-12-29 23:44:00+00:00,176.217,87.978834,30.081411,23.230562,103.945316,64.166692,66.200925,59.519832,84.980438,82.373922,...,87.259373,87.296130,87.114110,87.084615,87.038737,86.994931,86.825161,86.853455,86.861265,86.852635
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-01-24 21:29:00+00:00,-20.921,-63.888085,-40.955366,-29.773469,-59.348942,-59.365344,-55.820647,-51.266635,-61.879617,-62.941778,...,-90.757401,-90.798058,-90.910142,-90.810076,-90.910553,-90.969595,-90.905163,-90.869943,-90.862867,-90.867057
2023-01-24 21:44:00+00:00,-17.655,-42.203108,-41.917495,-37.533533,-70.159214,-62.112052,-58.843453,-43.334327,-44.207391,-44.861277,...,-49.577772,-49.879032,-49.930238,-49.842094,-49.921659,-49.893515,-49.882968,-49.807865,-49.817497,-49.823132
2023-01-24 21:59:00+00:00,57.908,-76.601096,-41.860088,-44.892587,-35.411350,-64.732879,-62.391116,-63.516483,-58.074329,-58.122123,...,-54.095272,-54.235008,-54.123800,-54.043389,-53.795597,-53.763904,-53.665879,-53.648009,-53.645511,-53.651589
2023-01-24 22:14:00+00:00,6.918,-41.617466,-41.233212,-48.369338,-11.344528,-32.987074,-30.657320,-37.521134,-31.002235,-31.696580,...,-56.058029,-56.142560,-56.045583,-56.112511,-56.130457,-56.104130,-55.990991,-55.988870,-55.976949,-55.984620


In [33]:
# Test RMSE of every prediction column, with the number of variables behind
# the model and the variable that was added last to reach that count.
rmse_list = []
for column in pred_df.columns[1:]:  # the first column is y_true itself
    rmse = np.sqrt(
        mean_squared_error(
            y_true=pred_df["y_true"],
            y_pred=pred_df[column],
        )
    )
    if column == "y_pred_s1_variables":
        # Take the S1 variable count from the matrix the S1 model was fitted
        # on instead of a hard-coded number, so it stays correct when the S1
        # genome or configuration changes.
        n_vars = X_train_s1.shape[1]
        source = "s1"
        last_added_variable = "S1"
    else:
        # Column names have the form "y_pred_<n>".
        n_vars = int(column.split("y_pred_")[1])
        source = "ga"
        # importance_df_train is sorted by descending importance, so the row
        # at position n-1 holds the n-th ranked variable.
        last_added_variable = importance_df_train["variable"].iloc[n_vars - 1]
    rmse_list.append(
        {
            "source": source,
            "n_vars": n_vars,
            "rmse": rmse,
            "last_added_variable": last_added_variable,
        }
    )
rmse_df = pd.DataFrame(rmse_list)

In [34]:
# Display the RMSE table: one row per prediction column of pred_df.
rmse_df

Unnamed: 0,source,n_vars,rmse,last_added_variable
0,s1,28,115.925440,S1
1,ga,1,135.352409,dsO_ID_MW_lag_10
2,ga,2,132.091360,dsO_ID_MW_lag_25
3,ga,3,120.297331,siCumulative_lag_25
4,ga,4,119.983746,siCumulative_lag_26
...,...,...,...,...
80,ga,80,111.391638,siCumulative_band_pass_1_0_h_lag_1952
81,ga,81,111.390450,siCumulative_second_derivative_lag_213
82,ga,82,111.390890,siCumulative_first_derivative_lag_1520
83,ga,83,111.390738,siCumulative_second_derivative_lag_1597


In [35]:
# Test RMSE against the number of variables used, coloured by model source
# ("s1" reference versus "ga" top-n models).
fig_rmse = px.scatter(
    rmse_df,
    x="n_vars",
    y="rmse",
    color="source",
    # Given as a list of column names: some older plotly releases do not
    # accept a bare string for hover_data.
    hover_data=["last_added_variable"],
    title="Test RMSE versus number of variables",
)
fig_rmse.show()

In [36]:
# Show the training-set importance ranking again, for reference next to the
# RMSE results.
importance_df_train

Unnamed: 0,variable,importance
18,dsO_ID_MW_lag_10,5.169124e+02
19,dsO_ID_MW_lag_25,4.177875e+02
0,siCumulative_lag_25,3.093869e+02
1,siCumulative_lag_26,2.004231e+02
24,xB_ID_MW_lag_25,1.802449e+02
...,...,...
35,siCumulative_band_pass_1_0_h_lag_1952,5.961457e-05
75,siCumulative_second_derivative_lag_213,5.862118e-05
70,siCumulative_first_derivative_lag_1520,3.319611e-05
80,siCumulative_second_derivative_lag_1597,3.185804e-06
