In [120]:
import pandas as pd
import numpy as np
from sklearn.inspection import permutation_importance
from simplify_deployment.organism import Organism
from pathlib import Path
from sklearn.linear_model import LinearRegression
import plotly.express as px
from functools import reduce
from sklearn.metrics import mean_squared_error
import plotly.figure_factory as ff


In [121]:
# Root folder holding the experiment config, the per-fold best genomes and the
# per-fold train/test parquet splits read below.
# NOTE(review): still an absolute, machine-specific path; it is now defined in
# exactly one place so it can be relocated by editing a single line.
DATA_DIR = Path("/home/thomas/repos/simplify_deployment/data/data_science/50_gen_s1")
N_FOLDS = 12
# Number of shuffles per feature used by permutation_importance.
N_REPEATS = 10
# Fixed seed so the permutation shuffles (and thus the importances) are
# identical on every Restart & Run All.
SEED = 0


def _rank_importances(columns, result):
    """Build a per-feature importance table, most important feature first.

    Parameters
    ----------
    columns : sequence of feature names, in the column order of the design
        matrix that was passed to ``permutation_importance``.
    result : the object returned by ``permutation_importance``; only its
        ``"importances_mean"`` entry is read.

    Returns
    -------
    pandas.DataFrame with columns ``variable`` and ``importance``, sorted by
    ``importance`` in descending order.
    """
    ranked = pd.DataFrame(
        {
            "variable": columns,
            "importance": result["importances_mean"],
        }
    )
    return ranked.sort_values(by="importance", ascending=False)


# fold index -> ranked importance table, for the test and train split resp.
importance_test_dict = {}
importance_train_dict = {}
for fold in range(N_FOLDS):
    print(f"Starting fold {fold}")
    # Organism built from the shared config plus this fold's best genome file.
    org = Organism.from_yaml(
        path_config=DATA_DIR / "lag_25_s1_config.yaml",
        path_genome=DATA_DIR / "best_genome" / f"lag_25_s1_50_gen_fold_{fold}_best_genome.yaml",
    )
    folds_dir = DATA_DIR / "folds"
    X_train = pd.read_parquet(folds_dir / f"X_train_fold_{fold}.parquet")
    X_test = pd.read_parquet(folds_dir / f"X_test_fold_{fold}.parquet")
    y_train = pd.read_parquet(folds_dir / f"y_train_fold_{fold}.parquet")
    y_test = pd.read_parquet(folds_dir / f"y_test_fold_{fold}.parquet")

    # Create train and test for the model with best vars
    y_train_model, X_train_model = org.create_y_X(
        y_train,
        X_train,
    )
    y_test_model, X_test_model = org.create_y_X(
        y_test,
        X_test,
    )
    y_test_model = y_test_model.rename("y_true")

    # The model is fitted on the training split only; the test split is used
    # solely for scoring in the permutation step below.
    model = LinearRegression()
    model.fit(
        X_train_model,
        y_train_model,
    )

    # Calculate importances train
    importances_train = permutation_importance(
        estimator=model,
        X=X_train_model,
        y=y_train_model,
        scoring="neg_root_mean_squared_error",
        n_repeats=N_REPEATS,
        random_state=SEED,
    )
    importance_df_train = _rank_importances(X_train_model.columns, importances_train)
    importance_train_dict[fold] = importance_df_train

    # Calculate importances test
    importances_test = permutation_importance(
        estimator=model,
        X=X_test_model,
        y=y_test_model,
        scoring="neg_root_mean_squared_error",
        n_repeats=N_REPEATS,
        random_state=SEED,
    )
    importance_df_test = _rank_importances(X_test_model.columns, importances_test)
    importance_test_dict[fold] = importance_df_test
    print(f"Fold {fold} done.")

Starting fold 0
Fold 0 done.
Starting fold 1
Fold 1 done.
Starting fold 2
Fold 2 done.
Starting fold 3
Fold 3 done.
Starting fold 4
Fold 4 done.
Starting fold 5
Fold 5 done.
Starting fold 6
Fold 6 done.
Starting fold 7
Fold 7 done.
Starting fold 8
Fold 8 done.
Starting fold 9
Fold 9 done.
Starting fold 10
Fold 10 done.
Starting fold 11
Fold 11 done.


In [122]:
# One set of variable names per fold, taken from the training-split
# importance tables of folds 0..11.
importance_sets_train = [
    set(fold_table["variable"])
    for fold_table in map(importance_train_dict.__getitem__, range(12))
]
# Variable names that occur in all 12 fold tables.
importance_sets_train[0].intersection(*importance_sets_train[1:])

{'dsO_ID_MW_lag_-20',
 'dsO_ID_MW_lag_10',
 'dsO_ID_MW_lag_25',
 'siCumulative_lag_25',
 'siCumulative_lag_26',
 'xB_ID_MW_lag_-20',
 'xB_ID_MW_lag_10',
 'xB_ID_MW_lag_25'}

In [123]:
# One set of variable names per fold, taken from the test-split
# importance tables of folds 0..11.
importance_sets_test = [
    set(fold_table["variable"])
    for fold_table in map(importance_test_dict.__getitem__, range(12))
]
# Variable names that occur in all 12 fold tables.
importance_sets_test[0].intersection(*importance_sets_test[1:])

{'dsO_ID_MW_lag_-20',
 'dsO_ID_MW_lag_10',
 'dsO_ID_MW_lag_25',
 'siCumulative_lag_25',
 'siCumulative_lag_26',
 'xB_ID_MW_lag_-20',
 'xB_ID_MW_lag_10',
 'xB_ID_MW_lag_25'}

In [124]:
# Stack the 12 test-split importance tables and count how many rows each
# variable name has across them.
top = (
    pd.concat([importance_test_dict[i] for i in range(12)], axis=0)
    .groupby("variable")
    .size()
    .to_frame(name="count")
    .sort_values(by="variable")
    .reset_index()
)
top["variable"] = top["variable"].astype(str)
# Names have the form "<base>_lag_<n>": split them into the base name and lag.
top[["variable", "lag"]] = top["variable"].str.split("_lag_", expand=True)
# Reorder the columns, make the lag numeric so it sorts numerically rather
# than as text, then order by base name and lag.
top = (
    top[["variable", "lag", "count"]]
    .astype({"lag": "int"})
    .sort_values(by=["variable", "lag"], ascending=[True, True])
)
top


Unnamed: 0,variable,lag,count
0,dsO_ID_MW,-20,12
1,dsO_ID_MW,-5,7
2,dsO_ID_MW,10,12
7,dsO_ID_MW,25,12
10,dsO_ID_MW,40,7
...,...,...,...
784,xB_ID_MW,85,5
776,xB_ID_MW,100,1
777,xB_ID_MW,145,2
779,xB_ID_MW,190,1


In [125]:
# Rows whose count is 4 or higher. Despite the variable name, the
# threshold is inclusive.
count_at_least_4 = top["count"].ge(4)
more_than_4 = top.loc[count_at_least_4, :]
more_than_4

Unnamed: 0,variable,lag,count
0,dsO_ID_MW,-20,12
1,dsO_ID_MW,-5,7
2,dsO_ID_MW,10,12
7,dsO_ID_MW,25,12
10,dsO_ID_MW,40,7
11,dsO_ID_MW,55,8
12,dsO_ID_MW,70,10
13,dsO_ID_MW,85,8
14,loaD_ID_MW,-20,7
15,loaD_ID_MW,-5,8


In [126]:
# Rows whose count is strictly below 4.
count_below_4 = top["count"].lt(4)
less_than_4 = top.loc[count_below_4, :]
less_than_4

Unnamed: 0,variable,lag,count
3,dsO_ID_MW,100,1
5,dsO_ID_MW,220,1
8,dsO_ID_MW,250,1
9,dsO_ID_MW,295,1
4,dsO_ID_MW,1000,1
...,...,...,...
774,xB_ID_MW,-50,1
776,xB_ID_MW,100,1
777,xB_ID_MW,145,2
779,xB_ID_MW,190,1


In [127]:
# Base variable whose rows in `less_than_4` are inspected below.
var = "siCumulative_band_pass_24_0_h"

# NOTE(review): seed kept from the original cell; nothing in this cell visibly
# draws random numbers - confirm whether it is still needed.
np.random.seed(1)

# Filter on `var` (previously the literal was repeated and `var` went unused).
# `.assign` returns a new frame, so the constant helper column is added without
# writing into a slice of `less_than_4`, which triggered SettingWithCopyWarning.
var_df = less_than_4.loc[less_than_4["variable"] == var, :].assign(pseudo=0)

# Dendrogram over the two columns (pseudo, lag). `pseudo` is constant, so the
# rows differ only in their lag value; leaves are labelled with the lag.
fig = ff.create_dendrogram(
    var_df[["pseudo", "lag"]],
    labels=list(var_df["lag"].astype(str)),
)
fig.update_layout(width=800, height=500)
fig.show()


In [128]:
# Display the filtered rows (including the constant `pseudo` column) that were
# passed to the dendrogram.
var_df

Unnamed: 0,variable,lag,count,pseudo
183,siCumulative_band_pass_24_0_h,52,1,0
189,siCumulative_band_pass_24_0_h,99,1,0
152,siCumulative_band_pass_24_0_h,142,1,0
165,siCumulative_band_pass_24_0_h,222,1,0
168,siCumulative_band_pass_24_0_h,239,1,0
177,siCumulative_band_pass_24_0_h,286,1,0
178,siCumulative_band_pass_24_0_h,326,1,0
179,siCumulative_band_pass_24_0_h,346,1,0
180,siCumulative_band_pass_24_0_h,380,1,0
181,siCumulative_band_pass_24_0_h,394,1,0
