In [None]:
## In-depth characterization of single variants

In [None]:
import shap
import polars as pl
from src.ncboost_functions import get_feature_list
from src.ncboost_functions import load_models

# Root directory of the NCBoost development tree; the model and data paths
# below are both built from it.
l_path = '/data-cbl/bcaron/NCBoost/ncboost_dev'

# Set up shap's notebook JavaScript (used by the interactive plots).
shap.initjs()

# Location of the saved models, then load them through the project helper.
model_name = 'ncboost_models'
model_path = f'{l_path}/models/{model_name}'
model_dict = {}  # NOTE(review): not referenced by any cell visible here
models = load_models(model_path)

# Region indicator columns that are appended to the model feature list.
region_list = ['upstream', 'downstream', 'intronic', 'UTR5', 'UTR3', 'intergenic']

# get_feature_list() returns four feature groups; together with the region
# columns they form the full set of columns selected for the explainer.
A, B, C, D = get_feature_list()
features = A + B + C + D + region_list

# Non-feature columns (identifiers, labels, other scores).
# NOTE(review): not referenced by any cell visible here.
variables = [
    'chr', 'pos', 'ref', 'alt', 'region',
    'closest_gene_name', 'closest_gene_ENSG',
    'label', 'partition', 'matching_index',
    'CADD', 'ReMM',
]

# Scored testing set: tab-separated, 'NA' read as null, with explicit dtypes
# for the 'chr' and 'ZooUCE' columns.
variants = pl.read_csv(
    source=f'{l_path}/data/testing/testing_data_scored.tsv',
    separator='\t',
    null_values='NA',
    schema_overrides={'chr': str, 'ZooUCE': float},
)


In [None]:
# Keep only the row(s) of the scored testing set whose rsid matches the
# variant of interest, and display them.
rsid_of_interest = 'rs777661576'
l_variant = variants.filter(pl.col('rsid').eq(rsid_of_interest))
l_variant.head()

In [None]:
# Fail with an explicit message when the rsid lookup matched nothing, rather
# than with the bare IndexError that `[0]` on an empty column would raise.
if l_variant.height == 0:
    raise ValueError("l_variant is empty: the requested rsid was not found in `variants`")

# The 'partition' value of the selected variant is the key into `models`;
# the tree explainer is built on that model.
l_model = l_variant['partition'][0]
explainer = shap.TreeExplainer(models[l_model])

# Feature columns for every variant that shares that partition value.
dtest = variants.filter(pl.col('partition') == l_model).select(features)

In [None]:
# shap documents numpy arrays / pandas DataFrames as inputs, and the other
# plotting cells in this notebook pass `dtest.to_pandas()`; convert once here
# so the SHAP values and the plot are computed from the same pandas frame.
dtest_pd = dtest.to_pandas()
shap_values = explainer.shap_values(dtest_pd)

# Bar summary of the SHAP values, limited to the 20 top-ranked features.
shap.summary_plot(shap_values,
                  features=dtest_pd,
                  feature_names=dtest.columns,
                  plot_type="bar",
                  max_display=20,
                  plot_size=[5,6])


In [None]:
# Force plot for a single row of dtest, chosen by position.
# NOTE(review): position 10 is a fixed row of the partition, not necessarily
# the variant selected by rsid earlier in the notebook — confirm it is the
# intended row.
row_idx = 10
shap.force_plot(
    explainer.expected_value,
    shap_values[row_idx, :],
    dtest.to_pandas().iloc[row_idx, :],
)


In [None]:
# Summary plot with shap's default plot type, using the pandas view of dtest
# as the feature values.
dtest_features_pd = dtest.to_pandas()
shap.summary_plot(shap_values, features=dtest_features_pd)


In [None]:
# Explain the model's predictions by calling the explainer directly on the
# pandas view of dtest.
# NOTE: this rebinds `shap_values`, which earlier held the result of
# explainer.shap_values(...), to whatever explainer(...) returns; cells above
# that are re-run after this one will see the new object.
dtest_as_pandas = dtest.to_pandas()
shap_values = explainer(dtest_as_pandas)


In [None]:
# Waterfall plot for the entry at position 1 of `shap_values`
# (the object returned by calling the explainer in the preceding cell).
entry_explanation = shap_values[1]
shap.plots.waterfall(entry_explanation)