# Decision tree model explanation
---------------

Compute feature importances for GBDT models: LightGBM and XGBoost

In [None]:
import sys,os
import torch
import matplotlib.pyplot as plt
import random
import numpy as np
from researchpkg.industry_classification.config import *
import numpy as np
from researchpkg.industry_classification.dataset.sec_datamodule import *
import seaborn as sns
sns.set(font_scale=1)
from researchpkg.industry_classification.utils.sics_loader import load_sic_codes
import lightgbm
from researchpkg.industry_classification.models.utils import NN_Utils
from researchpkg.industry_classification.models.decision_trees.lgbm import LgbmForSicClassification

In [None]:
#Color map
from matplotlib.colors import LinearSegmentedColormap

# 1. Load the dataset

In [None]:
# Dataset folder of the experiment, resolved under SEC_ROOT_DATA_DIR.
dataset_dir = os.path.join(SEC_ROOT_DATA_DIR,"count30_sic1agg_including_is_2023")
# checkpoint_path  = os.path.join(experiment_dir,"model.lgbm")
# Forwarded to the datasets as `max_tag_depth`; also reused as the tree
# `max_depth` in the training cell.
MAX_TAG_DEPTH = 5
# Forwarded to the datasets as `normalization_type`.
NORMALIZATION = "local"

In [None]:
# Reference table of SIC codes; the bare expression displays it in the notebook.
sics_ref =load_sic_codes()
sics_ref

In [None]:
from researchpkg.industry_classification.dataset.sec_gbdt_dataset import SecGBDTDataset

# Number of SIC digits used to build the classification target.
sic_digits = 1
# SIC code -> industry title lookup.
sics_dict = sics_ref.set_index("sic").industry_title.to_dict()

# Load the accounts and registrants indexes of the dataset.
accounts_index, registrants_index, _ = SecDataset.load_index(
    dataset_dir, sic_digits=sic_digits
)
sic_labels = sorted(registrants_index[f"sic{sic_digits}"].unique().tolist())

# One dataset per split, all built with the same preprocessing options.
train_dataset, val_dataset, test_dataset = (
    SecGBDTDataset(
        dataset_dir,
        split,
        sic_digits=sic_digits,
        normalization_type=NORMALIZATION,
        max_tag_depth=MAX_TAG_DEPTH,
    )
    for split in (DatasetType.TRAIN, DatasetType.VAL, DatasetType.TEST)
)

# 2. Model 

## 2.1. Fit the model

In [None]:
from researchpkg.industry_classification.utils.experiment_utils import ExperimentUtils
import os
import shutil

# Scratch directory: remove whatever a previous run left behind.
experiment_dir = "/tmp/experiment_dir"
if os.path.exists(experiment_dir):
    shutil.rmtree(experiment_dir)

# Feature names are the account tags, in index order.
features_name = accounts_index["tag"].tolist()

# Translate the encoded targets present in the training split back to SIC codes.
sic_reverse_index = {
    class_id: sic for sic, class_id in train_dataset.sic_id_index.items()
}
labels = [
    sic_reverse_index[class_id]
    for class_id in np.unique(train_dataset.Y).tolist()
]
n_labels = len(labels)

sic_code_df = load_sic_codes()[["sic", "industry_title"]]


In [None]:
# --- Hyper-parameters -------------------------------------------------------
# NOTE(review): the tree depth is set from MAX_TAG_DEPTH (a dataset option);
# confirm the coupling is intentional.
max_depth = MAX_TAG_DEPTH
n_estimators= 100
num_leaves = 60

# Smaller configuration, handy for quick debugging runs:
# max_depth=4
# num_leaves=16
# n_estimators=1

learning_rate= 0.05
n_jobs=8
boosting_type="gbdt"
seed=42
experiment_name="model_explain"
# NOTE(review): hard-coded to CUDA; presumably requires a GPU — confirm.
accelerator="cuda"
global_exp_name = "count30_sic1agg_including_is_2023"
normalization=NORMALIZATION

# --- Paths --------------------------------------------------------------------
ExperimentUtils.check_global_experiment_name(global_exp_name)
dataset_dir = os.path.join(SEC_ROOT_DATA_DIR, f"{global_exp_name}")
experiments_dir = os.path.join(LOGS_DIR, f"experiments_{global_exp_name}")
# --- Data: features / targets of each split -------------------------------------
X_train, Y_train = train_dataset.X, train_dataset.Y
X_val, Y_val = val_dataset.X, val_dataset.Y
X_test, Y_test = test_dataset.X, test_dataset.Y

# Keep only validation rows whose label appears at least once in Y_train.
all_y_train_labels = np.unique(Y_train)
index = np.isin(Y_val, all_y_train_labels)
X_val = X_val[index]
Y_val = Y_val[index]

# Same filtering for the test split.
index_test = np.isin(Y_test, all_y_train_labels)
X_test = X_test[index_test]
Y_test = Y_test[index_test]


# Explicit class weights (disabled; the model uses class_weight="balanced").
# class_weights = SecDataset.calculate_class_weights(Y_train.tolist(),beta=0.1)
# class_weights = SecDataset.calculate_class_weights(Y_train.tolist())

# --- Model --------------------------------------------------------------------
model = LgbmForSicClassification(
    n_accounts=len(features_name),
    n_classes=n_labels,
    num_leaves=num_leaves,
    max_depth=max_depth,
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    n_jobs=n_jobs,
    features_name=features_name,
    class_names=labels,
    boosting_type=boosting_type,
    seed=seed,
    class_weight="balanced",
)

    
# Experiment name encodes the model name, target granularity, scaling and depth.
experiment_name = f"{model.__name__}{experiment_name}_sic{sic_digits}_balanced"
experiment_name = f"{experiment_name}_scaling.{normalization}"

if max_depth:
    experiment_name = f"{experiment_name}_max_depth{max_depth}"

# --- Experiment directory -----------------------------------------------------
# Rebinds `experiment_dir`, replacing the scratch path set in the previous cell.
experiment_dir = os.path.join(experiments_dir, experiment_name)


if not ExperimentUtils.check_experiment(experiment_dir):
    # Initialize the experiment with the model and training configuration.
    ExperimentUtils.initialize_experiment(
        experiment_dir,
        dataset_dir,
        model.hparams,
        training_config={
            "num_jobs": n_jobs,
            "learning_rate": learning_rate,
            "seed": seed,
            "device": accelerator,
            "ngpus": torch.cuda.device_count() if accelerator == "cuda" else 0,
        },
    )

# model.train_top_k(X_train, Y_train, X_val, Y_val, experiment_dir=experiment_dir,
#             accelerator=accelerator,top_k=3
#             )

# --- Fit on train (with the filtered validation split), then evaluate on test ---
model.train(
    X_train,
    Y_train,
    X_val,
    Y_val,
    experiment_dir=experiment_dir,
    accelerator=accelerator,
)

model.test(X_test, Y_test, experiment_dir=experiment_dir)

In [None]:
import pyaml
from yaml import Loader

# Predictions on the filtered validation split; reused by the analysis cells below.
Y_val_pred = model.predict(X_val)

# Reload the results file of the experiment (presumably written by
# `model.test` above — confirm). The `with` block closes the file handle
# once parsing is done.
res_file = os.path.join(experiment_dir, "results.yaml")
with open(res_file, "r") as results_stream:
    res = pyaml.yaml.load(results_stream, Loader=Loader)
res

In [None]:
def plot_confusion_matrix(all_y_true, all_y_pred):
    """Compute the confusion matrix twice: raw counts, then normalized.

    Both calls go through ``NN_Utils.compute_confusion_matrix`` with the class
    names of the global ``model``. Their results are not returned; presumably
    the helper renders the figures itself (confirm in NN_Utils).

    :param all_y_true: ground-truth class ids
    :param all_y_pred: predicted class ids
    """
    for extra_options in ({}, {"normalize": True}):
        NN_Utils.compute_confusion_matrix(
            all_y_true, all_y_pred, model.class_names, **extra_options
        )


plot_confusion_matrix(Y_val, Y_val_pred)

In [None]:
# # Confusion matrix on train samples
# Y_train_pred = model.predict(X_train)
# plt.tight_layout()
# plot_confusion_matrix(Y_train, Y_train_pred)

# 3.  Compute features importance

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

def plotImp(model, top_k=20, fig_size=(40, 20), importance_type='split'):
    """Bar-plot the ``top_k`` most important features of a fitted model.

    :param model: wrapper exposing the fitted LightGBM estimator as ``model.model``
    :param top_k: number of features to keep, by decreasing importance
    :param fig_size: matplotlib figure size
    :param importance_type: forwarded to ``Booster.feature_importance``
    """
    booster = model.model.booster_
    feature_imp = pd.DataFrame(
        {
            'Value': booster.feature_importance(importance_type=importance_type),
            'Feature': booster.feature_name(),
        }
    )
    ranked = feature_imp.sort_values(by="Value", ascending=False)
    top_features = ranked.iloc[0:top_k]

    plt.figure(figsize=fig_size)
    sns.barplot(x="Value", y="Feature", data=top_features)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.show()

In [None]:
# Top-20 features by 'split' importance.
plotImp(model,top_k=20,fig_size=(15,8),importance_type='split')

In [None]:
# Top-20 features by 'gain' importance.
plotImp(model,top_k=20,fig_size=(30,20),importance_type='gain')

# 4. Plot decision tree

In [None]:
# model.model.booster_.trees_to_dataframe().tree_index.nunique()

In [None]:
#!pip install graphviz
# Draw one tree of the booster (tree_index=6). `show_info` entries must be
# LightGBM's documented keys: 'internal_value' is spelled with an underscore.
lightgbm.plot_tree(
    model.model.booster_,
    figsize=(40, 20),
    dpi=500,
    show_info=['internal_value'],
    orientation="vertical",
    tree_index=6,
)


# 5. Lime explainer

In [None]:
#!pip install lime
import lime.lime_tabular

In [None]:
# Re-bind the training arrays straight from the dataset for the LIME cells.
X_train = train_dataset.X
Y_train = train_dataset.Y

In [None]:
# Reload the indexes and derive the class labels and the feature names used
# by the explainers below. Note that `labels` is rebound here to the sorted
# SIC values found in the registrants index.
accounts_index, registrants_index,_ = \
        SecDataset.load_index(dataset_dir,sic_digits=sic_digits)
labels  = sorted(registrants_index[f"sic{sic_digits}"].unique().tolist())
feature_names = accounts_index.tag.tolist()

In [None]:
# LIME explainer fitted on the training feature matrix.
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train,
    feature_names=feature_names,
    class_names=labels,
    # `training_labels` expects one target per row of X_train, not the list
    # of class names.
    training_labels=Y_train,
    kernel_width=3,
)


def predict_fn(x):
    """Return the class probabilities of the fitted model for the rows of ``x``.

    NOTE(review): the inputs are passed through ``np.exp`` before prediction,
    which presumably assumes the features handed to LIME are log-scaled —
    confirm this matches the current dataset normalization.
    """
    return model.model.predict_proba(np.exp(x)).astype(float)

In [None]:
# Explain the first training row.
chosen = X_train[0]
# NOTE(review): num_features=490 is hard-coded; presumably the number of
# features — confirm against len(feature_names).
exp = explainer.explain_instance(chosen, predict_fn,num_features=490)
# exp.show_in_notebook(show_all=False,show_predicted_value=False)

# 6. Analysis Results on Class 2

In [None]:
# Validation and training splits at the 2-digit SIC level. They are built
# without the normalization / tag-depth options used for the 1-digit datasets.
dataset_sic_2= SecGBDTDataset(dataset_dir, DatasetType.VAL,sic_digits=2)
train_dataset_sic_2 = SecGBDTDataset(dataset_dir, DatasetType.TRAIN,sic_digits=2)

In [None]:
# Decode the 1-digit targets of the validation split back to SIC codes.
sic1_id_index = val_dataset.sic_id_index
rev_sic1_id_index = {class_id: sic for sic, class_id in sic1_id_index.items()}
all_sics1 = np.array(
    [rev_sic1_id_index[class_id] for class_id in val_dataset.all_data_dict["target"]]
)

# Same decoding at the 2-digit level, for the validation split...
sic2_id_index = dataset_sic_2.sic_id_index
rev_sic2_id_index = {class_id: sic for sic, class_id in sic2_id_index.items()}
all_sics2 = np.array(
    [rev_sic2_id_index[class_id] for class_id in dataset_sic_2.all_data_dict["target"]]
)

# ...and for the training split.
# NOTE(review): decoded with the validation split's id index; assumes both
# splits share the same sic -> id mapping — confirm.
all_train_sics2 = np.array(
    [
        rev_sic2_id_index[class_id]
        for class_id in train_dataset_sic_2.all_data_dict["target"]
    ]
)

# Short (French) titles of the 1-digit and 2-digit SIC classes.
sic1_ref = load_sics_ref(sic_digits=1).set_index("sic1").short_title_fr.to_dict()
sic2_ref = load_sics_ref(sic_digits=2).set_index("sic2").short_title_fr.to_dict()

## 6.1. Predictions histograms

In [None]:
## Per-sic2 statistics for the sic2 classes whose leading digit is 2
classes_correct_rate = {}
classes_pred_as_3 = {}
classes_nb_samples = {}

df_stats_2 = pd.DataFrame(
    columns=["sic2", "recall", "pred_as_3", "nb_samples_val", "nb_samples_train"]
)

# NOTE(review): `all_sics2` comes from the unfiltered validation dataset while
# `Y_val` / `Y_val_pred` were filtered in the training cell; the boolean masks
# below assume both have the same length — confirm.
for s in np.unique(all_train_sics2[all_train_sics2 // 10 == 2]):
    index_s = all_sics2 == s
    preds_s = Y_val_pred[index_s]

    correct_index = preds_s == Y_val[index_s]
    pred_as3_index = preds_s == sic1_id_index[3]

    # An empty or all-False mask gives 0 instead of NaN.
    acc = np.mean(correct_index) if correct_index.sum() > 0 else 0
    ratio_pred_3 = np.mean(pred_as3_index) if pred_as3_index.sum() > 0 else 0
    nb_val_samples = np.sum(index_s)
    nb_train_samples = np.sum(all_train_sics2 == s)

    df_stats_2.loc[len(df_stats_2)] = [
        f"{s} - {sic2_ref[s]}",
        acc,
        ratio_pred_3,
        nb_val_samples,
        nb_train_samples,
    ]

# 2x2 grid: one bar chart per statistic, with rotated x labels.
fig, axs = plt.subplots(2, 2, figsize=(20, 20))
for ax, stat in zip(
    axs.flat, ["recall", "pred_as_3", "nb_samples_val", "nb_samples_train"]
):
    ax.tick_params(axis='x', rotation=80)
    sns.barplot(x="sic2", y=stat, data=df_stats_2, ax=ax)
    ax.set_title(stat)

# Add some spacing between subplots, then render.
plt.tight_layout()
plt.show()


In [None]:
 ### Class 3

In [None]:
## Per-sic2 statistics for the sic2 classes whose leading digit is 3
classes_correct_rate = {}
classes_pred_as_2 = {}
classes_nb_samples = {}

df_stats_3 = pd.DataFrame(
    columns=["sic2", "recall", "pred_as_2", "nb_samples_val", "nb_samples_train"]
)

# NOTE(review): `all_sics2` comes from the unfiltered validation dataset while
# `Y_val` / `Y_val_pred` were filtered in the training cell; the boolean masks
# below assume both have the same length — confirm.
for s in np.unique(all_train_sics2[all_train_sics2 // 10 == 3]):
    index_s = all_sics2 == s
    preds_s = Y_val_pred[index_s]

    correct_index = preds_s == Y_val[index_s]
    pred_as2_index = preds_s == sic1_id_index[2]

    # An empty or all-False mask gives 0 instead of NaN.
    acc = np.mean(correct_index) if correct_index.sum() > 0 else 0
    ratio_pred_2 = np.mean(pred_as2_index) if pred_as2_index.sum() > 0 else 0
    nb_val_samples = np.sum(index_s)
    nb_train_samples = np.sum(all_train_sics2 == s)

    df_stats_3.loc[len(df_stats_3)] = [
        f"{s} - {sic2_ref[s]}",
        acc,
        ratio_pred_2,
        nb_val_samples,
        nb_train_samples,
    ]

# 2x2 grid: one bar chart per statistic, with rotated x labels.
fig, axs = plt.subplots(2, 2, figsize=(20, 20))
for ax, stat in zip(
    axs.flat, ["recall", "pred_as_2", "nb_samples_val", "nb_samples_train"]
):
    ax.tick_params(axis='x', rotation=80)
    sns.barplot(x="sic2", y=stat, data=df_stats_3, ax=ax)
    ax.set_title(stat)

# Add some spacing between subplots, then render.
plt.tight_layout()
plt.show()


## 6.2 Shapley values
---------------------------
- 22 : Textile Mills
- 25 : Furniture & Fixtures

In [None]:
# SIC classes under study at the 1-digit and 2-digit levels.
target_sic1_classes = [2,3]
# NOTE(review): the name is misspelled ("tartet"); left unchanged because
# other cells may reference it.
tartet_sic2_classes = [22,25]

# Boolean masks over the validation samples, by true 1-digit class.
index_2 = all_sics1==2
index_3 =  all_sics1==3

### 6.2.1 Instance Shap values

In [None]:
import shap

In [None]:
# Show the 1-digit SIC -> short title lookup.
print("sic1_ref :",sic1_ref)

In [None]:
# NOTE(review): `sics_ref` comes from load_sic_codes(), which is used elsewhere
# with "sic" / "industry_title" columns; confirm it also carries "sic1" and
# "short_title_fr".
sics_ref[["sic1","short_title_fr"]].set_index("sic1")

In [None]:
import numpy as np
import shap
import random

def generate_shap_force_plot(explainer, samples, target_id, feature_names):
    """Display a SHAP force plot for one randomly drawn row of ``samples``.

    :param explainer: SHAP explainer; it is called on the drawn row and its
        ``expected_value`` is indexed by ``target_id``
    :param samples: 2-D array of feature rows (indexed as ``samples[i, :]``)
    :param target_id: index of the model output (class) to explain
    :param feature_names: names shown next to the feature values
    :raises ValueError: if ``samples`` has no rows
    """
    n_rows = samples.shape[0]
    if n_rows == 0:
        # Fail early with an explicit message rather than deep inside
        # random / shap.
        raise ValueError("Cannot draw a SHAP force plot: `samples` is empty.")

    row_idx = random.randint(0, n_rows - 1)

    # Only the drawn row is plotted, so only that row is explained.
    row_explanation = explainer(samples[row_idx:row_idx + 1])
    shap_value = row_explanation[:, :, target_id].values[0]
    expected_value = explainer.expected_value[target_id]

    shap.initjs()
    plot = shap.force_plot(
        base_value=expected_value,
        shap_values=shap_value,
        features=samples[row_idx, :],
        feature_names=feature_names,
        link="logit",
        figsize=(10, 10),
    )
    display(plot)

In [None]:
# 1. sic2 = 25 samples predicted as sic1 class 2 (good predictions)
mask_25_ok = (all_sics2 == 25) & (Y_val_pred == sic1_id_index[2])
all_samples_25_ok = X_val[mask_25_ok]

explainer = shap.TreeExplainer(model.model)
shap_values_25_ok = explainer(all_samples_25_ok)

# One random sample per plot: first against class 2, then against class 3.
for sic1 in (2, 3):
    generate_shap_force_plot(
        explainer, all_samples_25_ok, sic1_id_index[sic1], feature_names
    )

In [None]:
# 2. sic2 = 25 samples predicted as sic1 class 3 (misclassifications)
mask_25_ko = (all_sics2 == 25) & (Y_val_pred == sic1_id_index[3])
all_samples_25_ko = X_val[mask_25_ko]

explainer = shap.TreeExplainer(model.model)
shap_values_25_ko = explainer(all_samples_25_ko)

# One random sample per plot: first against class 2, then against class 3.
for sic1 in (2, 3):
    generate_shap_force_plot(
        explainer, all_samples_25_ko, sic1_id_index[sic1], feature_names
    )

### 6.2.2. Global Average shap values

In [None]:
# Diverging colormap built from three RGB anchors: red -> white -> blue.
cmap_colors = [(1,0,0),(1,1,1),(0.,0.,1)]
cmap_name = 'cmap'
cmap = LinearSegmentedColormap.from_list(cmap_name, cmap_colors, N=256)

In [None]:
# Validation features plus the true / predicted sic1 class and the sic2 class.
df = pd.DataFrame(columns=feature_names, data=X_val)
df["sic1_true"] = [rev_sic1_id_index[y] for y in Y_val]
df["sic1_pred"] = [rev_sic1_id_index[y] for y in Y_val_pred]
df["sic2"] = all_sics2


def plot_average_shap_values(index, sort_column):
    """
    Display the average SHAP values of the rows of ``df`` selected by ``index``.

    The table holds one row per feature: summary statistics of the feature
    values plus the mean SHAP value towards sic1 classes 2 and 3. Rows are
    ordered by decreasing absolute value of ``sort_column``.

    :param index: row selector passed to ``df.loc``
    :param sort_column: column used for the ordering
    :return: the styled table (a pandas Styler), which is also displayed
    """
    explainer = shap.TreeExplainer(model.model)
    value_samples = df.loc[index][feature_names]
    # NOTE(review): indexing by class id assumes `shap_values` returns one
    # array per class — confirm against the installed shap version.
    shap_values = explainer.shap_values(value_samples)
    mean_shap = {
        sic1: shap_values[sic1_id_index[sic1]].mean(axis=0) for sic1 in [2, 3]
    }

    shap_table = pd.DataFrame(
        {
            "Tag": feature_names,
            "shap_sic1_2": mean_shap[2],
            "shap_sic1_3": mean_shap[3],
        }
    )

    stat_names = ["mean", "std", "min", "max", "count", "50%"]
    stats_table = value_samples.describe()[feature_names].transpose()[stat_names]
    stats_table.index.name = "Tag"

    merged = pd.merge(stats_table, shap_table, on="Tag")
    merged = merged.sort_values(by=sort_column, ascending=False, key=abs)

    styled = merged.style
    for column in ("shap_sic1_2", "shap_sic1_3", "mean"):
        styled = styled.background_gradient(cmap, subset=column)

    display(styled)
    return styled

In [None]:
# 1. Samples predicted as class 2, ordered by their class-2 SHAP values
plot_average_shap_values(df.sic1_pred==2 , sort_column="shap_sic1_2")

In [None]:
# 2. Samples predicted as class 3, ordered by their class-3 SHAP values
plot_average_shap_values(df.sic1_pred==3 , sort_column="shap_sic1_3")

In [None]:
# 3. Class 3 ok (true class 3, predicted 3)
# NOTE(review): ordered by the class-2 SHAP column; confirm "shap_sic1_3" was
# not intended here.
plot_average_shap_values((df.sic1_true==3)&(df.sic1_pred==3) , sort_column="shap_sic1_2")

In [None]:
# 4. Class 2 ok (true class 2, predicted 2)
plot_average_shap_values((df.sic1_true==2)&(df.sic1_pred==2) , sort_column="shap_sic1_2")

In [None]:
# 5. Class 2 ko (true class 2, predicted as something else)
# The function already displays the table with this gradient applied; the
# chained call re-applies it to the returned Styler, which is this cell's value.
plot_average_shap_values((df.sic1_true==2)&(df.sic1_pred!=2) , sort_column="shap_sic1_2").background_gradient(cmap, subset="shap_sic1_3")

In [None]:
## 6. Class 25 (all validation samples of sic2 = 25)
# The chained gradient repeats one the function already applies.
plot_average_shap_values(df.sic2==25 , sort_column="shap_sic1_2").background_gradient(cmap, subset="shap_sic1_2")

In [None]:
## 7. Class 25 ok (sic2 = 25 predicted as sic1 class 2)
# The chained gradient repeats one the function already applies.
plot_average_shap_values((df.sic2==25)&(df.sic1_pred==2) , sort_column="shap_sic1_2").background_gradient(cmap, subset="shap_sic1_2")

In [None]:
## 8. Class 25 ko (sic2 = 25 not predicted as sic1 class 2)
# The chained gradient repeats one the function already applies.
plot_average_shap_values((df.sic2==25)&(df.sic1_pred!=2) , sort_column="shap_sic1_2").background_gradient(cmap, subset="shap_sic1_2")

### 6.3. Tags statistics Box plot (like in sec_data_analysis.ipynb)

In [None]:
import seaborn.objects as so
import matplotlib as mpl
from ipywidgets import interactive,fixed, interact_manual,interact, FloatSlider, SelectMultiple

In [None]:
# Tags ordered according to the LightGBM feature-importance plot (top tags).
# The names are matched against the `tag` column of the melted dataframe, so
# each one must be spelled exactly like the account tag.
MOST_IMPORTANT_TAGS = [
    "InventoryNet",
    "CommonStockValue",
    "PropertyPlantAndEquipmentNet",
    "Goodwill",
    "TreasuryStockValue",
    "AccumulatedOtherComprehensiveIncomeLossNetOfTax",
    "AdditionalPaidInCapital",
    "AccountsReceivableNetCurrent",
    "IntangibleAssetsNetExcludingGoodwill",
    "AccountsPayableCurrent",
    "Liabilities",
    "DeferredRevenueCurrent",
    "EmployeeRelatedLiabilitiesCurrent",
    "AllowanceForDoubtfulAccountsReceivableCurrent",
    "MinorityInterest",
    "RetainedEarningsAccumulatedDeficit",
    # "AccumulatedDepreciationDepletionAndAmortizationPropertyPlantAndEquipment",
    "AssetsCurrent",
    "PropertyPlantAndEquipmentGross",
]
              
              

In [None]:
def is_outlier(points, pct=90):
    """
    Flag the points whose absolute value reaches the ``pct``-th percentile.

    Parameters:
    -----------
        points : Observations, as a pandas Series, a numpy array or any
            array-like of numbers.
        pct : Percentile, on the 0-100 scale of ``np.percentile``. A point is
            an outlier when its absolute value is greater than or equal to
            the ``pct``-th percentile of the absolute values.

    Returns:
    --------
        mask : Boolean mask with the same shape as ``points`` (a Series when
            ``points`` is a Series), True for outliers.
    """
    # np.abs works on Series and array-likes alike and preserves a Series index.
    magnitudes = np.abs(points)
    threshold = np.percentile(magnitudes, pct)

    # Values equal to the threshold are flagged too, so the largest point is
    # always an outlier (even with pct=100).
    return magnitudes >= threshold
    
def _with_ok_ko_variants(df, group_column, group_value):
    """Return [all, correctly predicted, mispredicted] rows of one SIC group.

    Each frame carries a ``sic`` column used as the plotting category: the
    group value itself, ``"<value>_ok"`` and ``"<value>_ko"`` respectively.
    """
    group_df = df[df[group_column] == group_value].copy()

    group_ok = group_df[group_df.sic1_true == group_df.sic1_pred].copy()
    group_ko = group_df[group_df.sic1_true != group_df.sic1_pred].copy()
    group_ok["sic"] = group_ok[group_column].apply(lambda x: f"{x}_ok")
    group_ko["sic"] = group_ko[group_column].apply(lambda x: f"{x}_ko")

    # Only the "all rows" frame has its group column renamed to `sic`.
    group_df.rename(columns={group_column: "sic"}, inplace=True)
    return [group_df, group_ok, group_ko]


def show_singletag_stats(df, tag, pct_outlier=95, whis=1.5, sic1_whitelist=(2, 3),
                         sic2_whitelist=(22, 25)):
    """
    Display the value distribution of one account (tag) per SIC group.

    For every whitelisted sic1 and sic2 class, three categories are shown: all
    samples, the correctly predicted ones ("_ok") and the mispredicted ones
    ("_ko"). The left half of the figure holds one histogram per category, the
    right half a box plot computed without the outliers.

    :param df: melted dataframe with columns ``tag``, ``net_change``,
        ``sic1_true``, ``sic1_pred`` and ``sic2``
    :param tag: name of the account whose distribution is displayed
    :param pct_outlier: percentile (0-100 scale) forwarded to ``is_outlier``;
        flagged points are left out of the box plot
    :param whis: whisker length forwarded to ``sns.boxplot``
    :param sic1_whitelist: sic1 classes to display (grouped on ``sic1_true``)
    :param sic2_whitelist: sic2 classes to display
    """
    df = df[df.tag == tag]

    all_df = []
    for sic1 in sic1_whitelist:
        all_df.extend(_with_ok_ko_variants(df, "sic1_true", sic1))
    for sic2 in sic2_whitelist:
        all_df.extend(_with_ok_ko_variants(df, "sic2", sic2))

    if not all_df:
        # Nothing selected: pd.concat would raise on an empty list.
        display(f"**{tag}: no SIC class selected")
        return

    df = pd.concat(all_df)

    fig = mpl.figure.Figure(figsize=(30, 20), dpi=100, layout="constrained")
    sf1, sf2 = fig.subfigures(1, 2)

    # 1. Histograms, one row per category
    so.Plot(df, x="net_change") \
        .add(so.Bars(), so.Hist())\
        .facet(row="sic")\
        .share(x=False)\
        .share(y=False)\
        .on(sf1).plot()

    # 2. Box plot per category, outliers removed
    ax = sf2.add_axes([0.1, 0.1, 0.8, 0.8])
    sns.boxplot(x="sic",
                y="net_change",
                hue="sic",
                ax=ax,
                whis=whis,
                data=df[(~is_outlier(df.net_change, pct=pct_outlier))],
                # log_scale=True,
                palette="tab10",
                )
    display(fig)
    display(f"**{tag}")

def show_tag_box_plot(df):
    """
    Interactive viewer of the per-tag value distributions.

    ``df`` is unpivoted to one row per (sample, tag) and
    ``show_singletag_stats`` is wired to widgets selecting the tag, the
    outlier percentile, the whisker length and the sic1 / sic2 classes.

    :param df: dataframe with one column per feature name plus ``sic1_pred``,
        ``sic1_true`` and ``sic2``
    """
    # First unpivot: one row per (sample, tag) with the value in `net_change`.
    df = df.melt(value_vars=feature_names, value_name="net_change", var_name="tag",
                 id_vars=["sic1_pred", "sic1_true", "sic2"])

    def view_fn(tag, pct_outlier, whis=1.5, sic1_whitelist=(), sic2_whitelist=()):
        return show_singletag_stats(df, tag=tag, pct_outlier=pct_outlier, whis=whis,
                                    sic1_whitelist=sic1_whitelist,
                                    sic2_whitelist=sic2_whitelist)

    sic1_options = sorted(df.sic1_true.unique().tolist())
    sic2_options = sorted(df.sic2.unique().tolist())

    # Pre-select only classes that exist in the data: SelectMultiple rejects
    # initial values that are not among its options.
    sic1_selected = [sic for sic in (2, 3) if sic in sic1_options]
    sic2_selected = [sic for sic in (22, 25, 33, 36) if sic in sic2_options]

    # Every keyword below matches a parameter of `view_fn`.
    interact(view_fn, tag=MOST_IMPORTANT_TAGS,
             pct_outlier=[95, 90, 80, 100],
             whis=FloatSlider(min=1, max=5, step=0.5, value=1.5),
             sic1_whitelist=SelectMultiple(
                 options=sic1_options,
                 value=sic1_selected,
                 description='Sic1',
                 disabled=False),
             sic2_whitelist=SelectMultiple(
                 options=sic2_options,
                 value=sic2_selected,
                 description='Sic2',
                 disabled=False))

In [None]:
# Launch the interactive viewer on the validation dataframe.
show_tag_box_plot(df)
# Non-interactive alternative: one figure per tag.
# df_melted = df.melt(value_vars=feature_names,value_name="net_change",var_name="tag",id_vars=["sic1_pred","sic1_true","sic2"])
# for tag in MOST_IMPORTANT_TAGS :
#     show_singletag_stats(df_melted, tag, sic1_whitelist = [2,3],
#                          sic2_whitelist=[22,25])

In [None]:
def compare_features_summary(samples_1, samples_2, feature_name):
    """
    Compare the distribution of one feature between two sets of samples.

    Displays the ``describe()`` summaries of the feature side by side (first
    column for ``samples_1``, second for ``samples_2``), then draws a
    histogram of the feature coloured by set.

    :param samples_1: first set of samples, columns ordered like the global
        ``feature_names``
    :param samples_2: second set of samples, same layout
    :param feature_name: name of the feature to compare
    """
    df1 = pd.DataFrame(columns=feature_names, data=samples_1)
    df2 = pd.DataFrame(columns=feature_names, data=samples_2)

    df_desc = pd.concat(
        [df1.describe()[[feature_name]], df2.describe()[[feature_name]]],
        axis=1,
    )
    display(df_desc)

    # Tag each row with its set of origin so the histogram can split on it.
    df1["set"] = 1
    df2["set"] = 2

    df_all = pd.concat([df1[[feature_name, "set"]], df2[[feature_name, "set"]]])
    sns.histplot(x=feature_name, data=df_all, hue="set")
    
