In [None]:
import pandas as pd
import numpy as np
from sklearn.inspection import permutation_importance
from simplify_deployment.organism import Organism
from pathlib import Path
from sklearn.linear_model import LinearRegression
import plotly.express as px

In [None]:
# Rebuild the organism from its YAML configuration and a stored genome.
# (The genome file name suggests the best genome of fold 11 after 50
# generations -- not verified here.)
config_path = Path("/home/thomas/repos/simplify_deployment/data/data_science/lag_25_s1_config.yaml")
genome_path = Path("/home/thomas/repos/simplify_deployment/data/data_science/lag_25_s1_50_gen_fold_11_best_genome.yaml")
org = Organism.from_yaml(path_config=config_path, path_genome=genome_path)

In [None]:
# Read the four parquet files of fold 11 in one pass; the order of the paths
# matches the order of the names on the left-hand side.
X_train, X_test, y_train, y_test = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "/home/thomas/repos/simplify_deployment/data/data_science/X_train_fold_11.parquet",
        "/home/thomas/repos/simplify_deployment/data/data_science/X_test_fold_11.parquet",
        "/home/thomas/repos/simplify_deployment/data/data_science/y_train_fold_11.parquet",
        "/home/thomas/repos/simplify_deployment/data/data_science/y_test_fold_11.parquet",
    )
)

In [None]:
# Let the organism derive the target and feature frames that the linear model
# is fitted on.
y_train_model, X_train_model = org.create_y_X(y_train, X_train)

# NOTE(review): the previous comment here said the test data is first glued
# together with the rest "so we don't lose any data", but only the test frames
# are passed in -- confirm whether create_y_X needs preceding rows.
y_test_model, X_test_model = org.create_y_X(y_test, X_test)

In [None]:
# Fit a plain linear regression on the organism's features, then measure how
# much the score degrades when each feature column is shuffled.
#
# permutation_importance shuffles columns randomly; without a fixed seed the
# importances (and therefore the saved tables and the variable ranking used
# further down) change on every run. Seeding makes the notebook reproducible.
PERMUTATION_SEED = 0
PERMUTATION_REPEATS = 100

model = LinearRegression()
model.fit(
    X_train_model,
    y_train_model,
)

# Importance measured on the data the model was fitted on.
importances_train = permutation_importance(
    estimator=model,
    X=X_train_model,
    y=y_train_model,
    scoring="neg_root_mean_squared_error",
    n_repeats=PERMUTATION_REPEATS,
    random_state=PERMUTATION_SEED,
)

# Importance measured on the held-out test data, using the same fitted model.
importances_test = permutation_importance(
    estimator=model,
    X=X_test_model,
    y=y_test_model,
    scoring="neg_root_mean_squared_error",
    n_repeats=PERMUTATION_REPEATS,
    random_state=PERMUTATION_SEED,
)



In [None]:
# Tabulate the mean permutation importance per training feature, most
# important first, persist the table and display it.
importance_df_train = (
    pd.DataFrame(
        {
            "variable": X_train_model.columns,
            "importance": importances_train["importances_mean"],
        }
    )
    .sort_values(by="importance", ascending=False)
)
importance_df_train.to_parquet(
    "/home/thomas/repos/simplify_deployment/data/data_science/importance_df_train.parquet"
)
importance_df_train

In [None]:
# Bar chart of the training-set importances. A title and axis labels are set
# so the figure can be told apart from the test-set chart when skimming.
fig = px.bar(
    importance_df_train,
    x="variable",
    y="importance",
    title="Permutation importance per variable (train set)",
    labels={
        "variable": "Variable",
        "importance": "Mean permutation importance",
    },
)
fig.show()

In [None]:
# Tabulate the mean permutation importance per feature as measured on the
# test data, most important first, persist the table and display it.
importance_df_test = (
    pd.DataFrame(
        {
            "variable": X_test_model.columns,
            "importance": importances_test["importances_mean"],
        }
    )
    .sort_values(by="importance", ascending=False)
)
importance_df_test.to_parquet(
    "/home/thomas/repos/simplify_deployment/data/data_science/importance_df_test.parquet"
)
importance_df_test

In [None]:
# Bar chart of the test-set importances. A title and axis labels are set so
# the figure can be told apart from the training-set chart when skimming.
fig = px.bar(
    importance_df_test,
    x="variable",
    y="importance",
    title="Permutation importance per variable (test set)",
    labels={
        "variable": "Variable",
        "importance": "Mean permutation importance",
    },
)
fig.show()

In [None]:
# Accumulates one named pd.Series of test-set predictions per fitted model.
# The cells below only append to it, so re-run this cell before re-running
# them; otherwise the list collects duplicate entries.
predictions_list = []

In [None]:
# Reference model: same config as above but with the "simplify_1_0" genome.
# Build its train/test frames, fit a linear regression and store the test
# predictions as the first entry of predictions_list.
org_s1 = Organism.from_yaml(
    path_config=Path("/home/thomas/repos/simplify_deployment/data/data_science/lag_25_s1_config.yaml"),
    path_genome=Path("/home/thomas/repos/simplify_deployment/data/data_science/lag_25_simplify_1_0.yaml"),
)
y_train_s1, X_train_s1 = org_s1.create_y_X(y_train, X_train)
y_test_s1, X_test_s1 = org_s1.create_y_X(y_test, X_test)

# Note: this rebinds `model`, replacing the estimator fitted earlier.
model = LinearRegression().fit(X_train_s1, y_train_s1)

y_pred_s1 = pd.Series(
    model.predict(X_test_s1),
    index=X_test_s1.index,
    name="y_pred_s1_variables",
)
predictions_list.append(y_pred_s1)


In [None]:
# For n = 1 .. number of ranked variables: refit the linear model on the n
# variables with the highest training-set importance and store its test-set
# predictions under the name "y_pred_<n>_variables".
#
# NOTE(review): keep="all" retains every row tied with the n-th largest
# importance, so a fit may use more than n_variables columns -- confirm that
# this is intended.
# `vars` shadows the Python builtin; the name is kept because later cells
# read it.
model = LinearRegression()
n_ranked = importance_df_train.shape[0]
for n_variables in range(1, n_ranked + 1):
    top_rows = importance_df_train.nlargest(
        n=n_variables,
        columns="importance",
        keep="all",
    )
    vars = top_rows["variable"]
    X_train_n_vars = X_train_model.loc[:, vars]
    X_test_n_vars = X_test_model.loc[:, vars]
    model.fit(X_train_n_vars, y_train_model)
    y_pred_n_vars = pd.Series(
        model.predict(X_test_n_vars),
        index=X_test_n_vars.index,
        name=f"y_pred_{n_variables}_variables",
    )
    predictions_list.append(y_pred_n_vars)
    print(f"{n_variables} variables predicted.")


In [None]:
# Debug inspection: training features restricted to `vars`, i.e. the selection
# left over from the last iteration of the loop above.
X_train_model.loc[:,vars]

In [None]:
# Debug inspection: the variable names selected in the last loop iteration.
# On a fresh kernel (loop not yet run) this shows the Python builtin `vars`.
vars