In [1]:
# Parameters
# `cell_type` selects what this notebook processes: it is interpolated into the
# SHAP batch file-name pattern, the AnnData file name, and the output CSV names.
# NOTE(review): the "# Parameters" header looks like an injected-parameters cell
# (papermill-style) — confirm before editing the value by hand.
cell_type = "Mono"


In [2]:
import sys
import os

import anndata as ad

import pandas as pd
import numpy as np

from pyprojroot import here

import re

from tqdm import tqdm

In [3]:
def load_sorted_shap_values_fnames(
    cell_type: str = '',
    run_name: str = 'run1'):
    """Return the SHAP-value batch file paths for one run and cell type.

    Files in the SHAP results directory whose names start with
    ``{run_name}_{cell_type}_shap_values_<N>`` are selected, where ``<N>`` is
    one or more digits. The result is ordered by the integer value of ``<N>``
    (so batch 2 comes before batch 10).

    Parameters
    ----------
    cell_type : str
        Cell-type label embedded in the file names. Matched literally.
    run_name : str
        Run prefix embedded in the file names. Matched literally.

    Returns
    -------
    list
        Paths (directory joined with file name) sorted by batch number; empty
        if no file name matches.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` if the results directory is unreadable
        or missing.
    """
    dirpath = here("inflammabucket_bkp/03_downstream_analysis/05_SHAP/results/04_shap/shap_vals")

    # Escape the caller-supplied parts so regex metacharacters in a cell type
    # or run name (e.g. '+', '.', parentheses) are matched literally instead
    # of silently selecting the wrong files.
    fname_pattern = re.compile(
        f'^{re.escape(run_name)}_{re.escape(cell_type)}_shap_values_' + r'(\d+)')

    # Run the regex once per file name and keep (batch number, name) pairs.
    candidates = ((fname_pattern.search(fname), fname) for fname in os.listdir(dirpath))
    batches = [(int(match.group(1)), fname) for match, fname in candidates if match]

    # Numeric sort on the batch number only; list.sort is stable, so files that
    # share a batch number keep their directory-listing order.
    batches.sort(key=lambda item: item[0])

    return [os.path.join(dirpath, fname) for _, fname in batches]

In [4]:
# Load the 'shap_values' array from every batch file, in batch order.
shap_val_list = []
for fname in tqdm(load_sorted_shap_values_fnames(cell_type)):
    # Indexing by key implies np.load returns a lazy NpzFile archive, which
    # keeps its file handle open until closed. The context manager releases the
    # handle as soon as the array has been read into memory, instead of leaking
    # one open file per batch.
    with np.load(fname) as batch:
        shap_val_list.append(batch['shap_values'])


  0%|          | 0/105 [00:00<?, ?it/s]

  1%|          | 1/105 [00:03<06:24,  3.70s/it]

  2%|▏         | 2/105 [00:07<06:30,  3.79s/it]

  3%|▎         | 3/105 [00:11<06:28,  3.80s/it]

  4%|▍         | 4/105 [00:15<06:24,  3.81s/it]

  5%|▍         | 5/105 [00:19<06:21,  3.82s/it]

  6%|▌         | 6/105 [00:22<06:17,  3.82s/it]

  7%|▋         | 7/105 [00:26<06:14,  3.82s/it]

  8%|▊         | 8/105 [00:30<06:10,  3.82s/it]

  9%|▊         | 9/105 [00:34<06:10,  3.85s/it]

 10%|▉         | 10/105 [00:38<06:10,  3.90s/it]

 10%|█         | 11/105 [00:42<06:08,  3.92s/it]

 11%|█▏        | 12/105 [00:46<06:07,  3.95s/it]

 12%|█▏        | 13/105 [00:50<06:02,  3.94s/it]

 13%|█▎        | 14/105 [00:54<05:57,  3.93s/it]

 14%|█▍        | 15/105 [00:58<05:53,  3.93s/it]

 15%|█▌        | 16/105 [01:02<05:49,  3.93s/it]

 16%|█▌        | 17/105 [01:05<05:44,  3.92s/it]

 17%|█▋        | 18/105 [01:09<05:40,  3.91s/it]

 18%|█▊        | 19/105 [01:13<05:36,  3.92s/it]

 19%|█▉        | 20/105 [01:17<05:33,  3.92s/it]

 20%|██        | 21/105 [01:21<05:27,  3.90s/it]

 21%|██        | 22/105 [01:25<05:23,  3.90s/it]

 22%|██▏       | 23/105 [01:29<05:19,  3.89s/it]

 23%|██▎       | 24/105 [01:33<05:15,  3.90s/it]

 24%|██▍       | 25/105 [01:37<05:11,  3.89s/it]

 25%|██▍       | 26/105 [01:41<05:06,  3.89s/it]

 26%|██▌       | 27/105 [01:44<05:02,  3.88s/it]

 27%|██▋       | 28/105 [01:48<04:59,  3.88s/it]

 28%|██▊       | 29/105 [01:52<04:56,  3.90s/it]

 29%|██▊       | 30/105 [01:56<04:51,  3.89s/it]

 30%|██▉       | 31/105 [02:00<04:47,  3.89s/it]

 30%|███       | 32/105 [02:04<04:43,  3.89s/it]

 31%|███▏      | 33/105 [02:08<04:40,  3.90s/it]

 32%|███▏      | 34/105 [02:12<04:35,  3.88s/it]

 33%|███▎      | 35/105 [02:15<04:30,  3.87s/it]

 34%|███▍      | 36/105 [02:19<04:25,  3.85s/it]

 35%|███▌      | 37/105 [02:23<04:21,  3.85s/it]

 36%|███▌      | 38/105 [02:27<04:19,  3.87s/it]

 37%|███▋      | 39/105 [02:31<04:15,  3.87s/it]

 38%|███▊      | 40/105 [02:35<04:11,  3.87s/it]

 39%|███▉      | 41/105 [02:39<04:07,  3.86s/it]

 40%|████      | 42/105 [02:42<04:03,  3.86s/it]

 41%|████      | 43/105 [02:46<03:59,  3.86s/it]

 42%|████▏     | 44/105 [02:50<03:55,  3.85s/it]

 43%|████▎     | 45/105 [02:54<03:51,  3.86s/it]

 44%|████▍     | 46/105 [02:58<03:48,  3.87s/it]

 45%|████▍     | 47/105 [03:02<03:43,  3.86s/it]

 46%|████▌     | 48/105 [03:06<03:39,  3.86s/it]

 47%|████▋     | 49/105 [03:09<03:35,  3.86s/it]

 48%|████▊     | 50/105 [03:13<03:32,  3.86s/it]

 49%|████▊     | 51/105 [03:17<03:32,  3.93s/it]

 50%|████▉     | 52/105 [03:21<03:26,  3.90s/it]

 50%|█████     | 53/105 [03:25<03:22,  3.89s/it]

 51%|█████▏    | 54/105 [03:29<03:17,  3.88s/it]

 52%|█████▏    | 55/105 [03:33<03:14,  3.88s/it]

 53%|█████▎    | 56/105 [03:37<03:09,  3.86s/it]

 54%|█████▍    | 57/105 [03:41<03:04,  3.84s/it]

 55%|█████▌    | 58/105 [03:44<03:00,  3.84s/it]

 56%|█████▌    | 59/105 [03:48<02:59,  3.91s/it]

 57%|█████▋    | 60/105 [03:52<02:55,  3.89s/it]

 58%|█████▊    | 61/105 [03:56<02:50,  3.89s/it]

 59%|█████▉    | 62/105 [04:00<02:47,  3.89s/it]

 60%|██████    | 63/105 [04:04<02:42,  3.88s/it]

 61%|██████    | 64/105 [04:08<02:39,  3.88s/it]

 62%|██████▏   | 65/105 [04:12<02:35,  3.88s/it]

 63%|██████▎   | 66/105 [04:16<02:31,  3.89s/it]

 64%|██████▍   | 67/105 [04:19<02:27,  3.88s/it]

 65%|██████▍   | 68/105 [04:23<02:22,  3.86s/it]

 66%|██████▌   | 69/105 [04:27<02:18,  3.85s/it]

 67%|██████▋   | 70/105 [04:31<02:14,  3.85s/it]

 68%|██████▊   | 71/105 [04:35<02:10,  3.85s/it]

 69%|██████▊   | 72/105 [04:39<02:06,  3.84s/it]

 70%|██████▉   | 73/105 [04:42<02:03,  3.85s/it]

 70%|███████   | 74/105 [04:46<01:59,  3.84s/it]

 71%|███████▏  | 75/105 [04:50<01:55,  3.84s/it]

 72%|███████▏  | 76/105 [04:54<01:51,  3.84s/it]

 73%|███████▎  | 77/105 [04:58<01:47,  3.83s/it]

 74%|███████▍  | 78/105 [05:02<01:43,  3.82s/it]

 75%|███████▌  | 79/105 [05:05<01:39,  3.82s/it]

 76%|███████▌  | 80/105 [05:09<01:35,  3.82s/it]

 77%|███████▋  | 81/105 [05:13<01:31,  3.81s/it]

 78%|███████▊  | 82/105 [05:17<01:28,  3.83s/it]

 79%|███████▉  | 83/105 [05:21<01:24,  3.82s/it]

 80%|████████  | 84/105 [05:25<01:20,  3.84s/it]

 81%|████████  | 85/105 [05:28<01:17,  3.86s/it]

 82%|████████▏ | 86/105 [05:32<01:13,  3.87s/it]

 83%|████████▎ | 87/105 [05:36<01:09,  3.86s/it]

 84%|████████▍ | 88/105 [05:40<01:05,  3.86s/it]

 85%|████████▍ | 89/105 [05:44<01:01,  3.86s/it]

 86%|████████▌ | 90/105 [05:48<00:57,  3.86s/it]

 87%|████████▋ | 91/105 [05:52<00:54,  3.89s/it]

 88%|████████▊ | 92/105 [05:56<00:50,  3.88s/it]

 89%|████████▊ | 93/105 [05:59<00:46,  3.87s/it]

 90%|████████▉ | 94/105 [06:03<00:42,  3.88s/it]

 90%|█████████ | 95/105 [06:07<00:38,  3.90s/it]

 91%|█████████▏| 96/105 [06:11<00:35,  3.89s/it]

 92%|█████████▏| 97/105 [06:15<00:31,  3.88s/it]

 93%|█████████▎| 98/105 [06:19<00:27,  3.86s/it]

 94%|█████████▍| 99/105 [06:23<00:23,  3.87s/it]

 95%|█████████▌| 100/105 [06:27<00:19,  3.86s/it]

 96%|█████████▌| 101/105 [06:30<00:15,  3.86s/it]

 97%|█████████▋| 102/105 [06:34<00:11,  3.85s/it]

 98%|█████████▊| 103/105 [06:38<00:07,  3.84s/it]

 99%|█████████▉| 104/105 [06:42<00:03,  3.83s/it]

100%|██████████| 105/105 [06:45<00:00,  3.74s/it]

100%|██████████| 105/105 [06:45<00:00,  3.87s/it]




In [5]:
# Open the merged AnnData file for this cell type with backed='r', so it is
# opened read-only in backed mode rather than loaded into memory up front.
adata = ad.read_h5ad(
    here(f'inflammabucket_bkp/03_downstream_analysis/05_SHAP/data/{cell_type}_adataMerged_SPECTRAgenes.log1p.h5ad'),
    backed='r',
)

In [6]:
## Gene symbols and Disease labels
# NOTE(review): read_pickle can execute arbitrary code while loading; only use
# it on files from a trusted source.
symbols_df = pd.read_pickle(here('inflammabucket_bkp/03_downstream_analysis/04_selected_gene_list.pkl'))
# Gene symbols looked up by adata.var_names, so they follow the AnnData variable order.
symbols_sorted = symbols_df.loc[adata.var_names]['symbol'].values

# Disease labels, later used as column names for the per-sample SHAP averages.
# NOTE(review): presumably this order matches the class axis of the saved SHAP
# arrays — confirm against the code that produced them.
DISEASES = [
    'BRCA', 'CD', 'COPD', 'COVID', 'CRC',
    'HBV', 'HIV', 'HNSCC', 'MS', 'NPC',
    'PS', 'PSA', 'RA', 'SLE', 'UC',
    'asthma', 'cirrhosis', 'flu', 'healthy', 'sepsis',
]

# One independent, initially empty list per disease label.
diseaseDict = {disease: [] for disease in DISEASES}

In [7]:
# Stack the per-batch SHAP arrays along the first axis into a single array.
shape_values_matrix = np.concatenate(shap_val_list, axis=0)

In [8]:
# Average the SHAP values over the cells of each sample and collect, per
# disease, one single-column frame (genes x that sample).
#
# Start from fresh, empty per-disease lists: the loop below appends, so without
# this reset re-running the cell would add every sample a second time to lists
# already filled by the previous run.
diseaseDict = {d: [] for d in DISEASES}

for idx, values in tqdm(adata.obs.groupby('sampleID', observed=True).indices.items()):
    # `values` holds the positional row indices of this sample's cells in adata.obs.
    # NOTE(review): assumes the rows of shape_values_matrix are in the same
    # order as adata.obs — confirm against the code that produced the batches.
    geneXdisease_sample_i = pd.DataFrame(
        shape_values_matrix[values].mean(0),  # mean over the selected cells
        index=symbols_sorted,
        columns=DISEASES,
    )
    for d in DISEASES:
        # Column is named after the sample ID so the frames can later be joined side by side.
        diseaseDict[d].append(pd.DataFrame.from_dict({idx: geneXdisease_sample_i[d]}))

  0%|          | 0/817 [00:00<?, ?it/s]

  1%|          | 5/817 [00:00<00:17, 45.60it/s]

  1%|          | 10/817 [00:00<00:17, 46.39it/s]

  2%|▏         | 15/817 [00:00<00:18, 42.44it/s]

  2%|▏         | 20/817 [00:00<00:19, 41.57it/s]

  3%|▎         | 25/817 [00:00<00:23, 33.19it/s]

  4%|▎         | 29/817 [00:00<00:28, 27.87it/s]

  4%|▍         | 33/817 [00:01<00:30, 26.08it/s]

  4%|▍         | 36/817 [00:01<00:29, 26.57it/s]

  5%|▌         | 42/817 [00:01<00:22, 34.03it/s]

  6%|▌         | 48/817 [00:01<00:20, 37.85it/s]

  6%|▋         | 53/817 [00:01<00:22, 34.18it/s]

  7%|▋         | 58/817 [00:01<00:21, 35.96it/s]

  8%|▊         | 62/817 [00:01<00:21, 35.38it/s]

  8%|▊         | 66/817 [00:01<00:20, 35.96it/s]

  9%|▊         | 71/817 [00:02<00:20, 36.06it/s]

  9%|▉         | 76/817 [00:02<00:19, 37.74it/s]

 10%|▉         | 81/817 [00:02<00:18, 40.35it/s]

 11%|█         | 86/817 [00:02<00:17, 41.17it/s]

 11%|█▏        | 92/817 [00:02<00:16, 43.87it/s]

 13%|█▎        | 103/817 [00:02<00:11, 60.57it/s]

 14%|█▍        | 114/817 [00:02<00:09, 73.07it/s]

 15%|█▍        | 122/817 [00:02<00:09, 74.77it/s]

 17%|█▋        | 135/817 [00:02<00:07, 87.43it/s]

 18%|█▊        | 144/817 [00:03<00:08, 83.41it/s]

 19%|█▊        | 153/817 [00:03<00:09, 72.08it/s]

 20%|█▉        | 161/817 [00:03<00:09, 68.85it/s]

 21%|██        | 169/817 [00:03<00:12, 53.67it/s]

 21%|██▏       | 175/817 [00:03<00:11, 54.01it/s]

 22%|██▏       | 181/817 [00:03<00:12, 50.89it/s]

 23%|██▎       | 187/817 [00:03<00:12, 51.87it/s]

 24%|██▎       | 194/817 [00:04<00:11, 54.69it/s]

 24%|██▍       | 200/817 [00:04<00:11, 53.67it/s]

 25%|██▌       | 206/817 [00:04<00:11, 53.13it/s]

 26%|██▌       | 212/817 [00:04<00:11, 52.53it/s]

 27%|██▋       | 218/817 [00:04<00:10, 54.47it/s]

 27%|██▋       | 224/817 [00:04<00:11, 51.92it/s]

 28%|██▊       | 230/817 [00:04<00:12, 46.25it/s]

 29%|██▉       | 235/817 [00:04<00:13, 42.04it/s]

 29%|██▉       | 240/817 [00:05<00:14, 40.17it/s]

 30%|██▉       | 245/817 [00:05<00:13, 41.09it/s]

 31%|███       | 251/817 [00:05<00:12, 44.29it/s]

 31%|███▏      | 256/817 [00:05<00:12, 43.51it/s]

 32%|███▏      | 263/817 [00:05<00:11, 49.30it/s]

 33%|███▎      | 269/817 [00:05<00:10, 52.00it/s]

 34%|███▍      | 277/817 [00:05<00:09, 59.22it/s]

 35%|███▍      | 284/817 [00:05<00:08, 60.87it/s]

 36%|███▌      | 293/817 [00:05<00:07, 65.57it/s]

 37%|███▋      | 300/817 [00:06<00:09, 55.98it/s]

 37%|███▋      | 306/817 [00:06<00:10, 50.32it/s]

 38%|███▊      | 312/817 [00:06<00:09, 51.85it/s]

 39%|███▉      | 319/817 [00:06<00:08, 55.67it/s]

 40%|████      | 327/817 [00:06<00:08, 59.76it/s]

 41%|████      | 334/817 [00:06<00:08, 58.66it/s]

 42%|████▏     | 340/817 [00:06<00:08, 55.88it/s]

 42%|████▏     | 346/817 [00:06<00:08, 54.39it/s]

 43%|████▎     | 355/817 [00:07<00:08, 57.64it/s]

 44%|████▍     | 361/817 [00:07<00:07, 57.93it/s]

 45%|████▍     | 367/817 [00:07<00:09, 49.46it/s]

 46%|████▌     | 376/817 [00:07<00:07, 58.29it/s]

 47%|████▋     | 383/817 [00:07<00:08, 52.73it/s]

 48%|████▊     | 390/817 [00:07<00:07, 56.37it/s]

 48%|████▊     | 396/817 [00:07<00:08, 51.48it/s]

 49%|████▉     | 404/817 [00:08<00:07, 57.05it/s]

 50%|█████     | 410/817 [00:08<00:08, 47.76it/s]

 51%|█████▏    | 419/817 [00:08<00:07, 56.21it/s]

 52%|█████▏    | 427/817 [00:08<00:06, 60.55it/s]

 53%|█████▎    | 434/817 [00:08<00:06, 59.85it/s]

 54%|█████▍    | 442/817 [00:08<00:05, 63.85it/s]

 55%|█████▌    | 450/817 [00:08<00:05, 65.15it/s]

 56%|█████▋    | 460/817 [00:08<00:04, 71.90it/s]

 57%|█████▋    | 468/817 [00:09<00:05, 63.07it/s]

 58%|█████▊    | 475/817 [00:09<00:05, 63.39it/s]

 60%|█████▉    | 487/817 [00:09<00:04, 77.14it/s]

 61%|██████    | 496/817 [00:09<00:04, 66.91it/s]

 62%|██████▏   | 504/817 [00:09<00:05, 62.30it/s]

 63%|██████▎   | 511/817 [00:09<00:05, 59.23it/s]

 63%|██████▎   | 518/817 [00:09<00:04, 60.22it/s]

 64%|██████▍   | 525/817 [00:09<00:04, 61.27it/s]

 65%|██████▌   | 532/817 [00:10<00:04, 61.13it/s]

 66%|██████▌   | 539/817 [00:10<00:04, 57.92it/s]

 67%|██████▋   | 545/817 [00:10<00:04, 58.01it/s]

 68%|██████▊   | 554/817 [00:10<00:03, 66.41it/s]

 69%|██████▊   | 561/817 [00:10<00:03, 66.08it/s]

 70%|██████▉   | 568/817 [00:10<00:03, 67.12it/s]

 71%|███████   | 576/817 [00:10<00:03, 70.65it/s]

 72%|███████▏  | 585/817 [00:10<00:03, 72.38it/s]

 73%|███████▎  | 593/817 [00:10<00:03, 67.84it/s]

 73%|███████▎  | 600/817 [00:11<00:03, 66.78it/s]

 74%|███████▍  | 607/817 [00:11<00:03, 65.63it/s]

 75%|███████▌  | 614/817 [00:11<00:03, 63.14it/s]

 76%|███████▌  | 621/817 [00:11<00:03, 54.46it/s]

 77%|███████▋  | 628/817 [00:11<00:04, 41.60it/s]

 77%|███████▋  | 633/817 [00:12<00:07, 24.80it/s]

 78%|███████▊  | 637/817 [00:12<00:06, 25.77it/s]

 78%|███████▊  | 641/817 [00:12<00:07, 25.04it/s]

 79%|███████▉  | 646/817 [00:12<00:05, 28.58it/s]

 80%|███████▉  | 651/817 [00:12<00:05, 32.42it/s]

 80%|████████  | 655/817 [00:12<00:05, 28.87it/s]

 81%|████████  | 663/817 [00:13<00:03, 38.79it/s]

 82%|████████▏ | 670/817 [00:13<00:03, 44.53it/s]

 83%|████████▎ | 676/817 [00:13<00:03, 46.57it/s]

 83%|████████▎ | 682/817 [00:13<00:02, 48.04it/s]

 84%|████████▍ | 688/817 [00:13<00:02, 48.49it/s]

 85%|████████▌ | 695/817 [00:13<00:02, 51.90it/s]

 86%|████████▌ | 701/817 [00:13<00:02, 53.34it/s]

 87%|████████▋ | 707/817 [00:13<00:02, 53.37it/s]

 87%|████████▋ | 713/817 [00:13<00:01, 53.14it/s]

 88%|████████▊ | 719/817 [00:14<00:01, 51.10it/s]

 89%|████████▊ | 725/817 [00:14<00:01, 48.55it/s]

 89%|████████▉ | 731/817 [00:14<00:01, 51.07it/s]

 90%|█████████ | 737/817 [00:14<00:01, 50.51it/s]

 91%|█████████ | 743/817 [00:14<00:01, 52.18it/s]

 92%|█████████▏| 749/817 [00:14<00:01, 47.48it/s]

 92%|█████████▏| 754/817 [00:14<00:01, 43.24it/s]

 93%|█████████▎| 759/817 [00:15<00:01, 38.53it/s]

 94%|█████████▍| 766/817 [00:15<00:01, 45.05it/s]

 95%|█████████▍| 773/817 [00:15<00:00, 50.61it/s]

 95%|█████████▌| 779/817 [00:15<00:00, 49.12it/s]

 96%|█████████▌| 785/817 [00:15<00:00, 50.09it/s]

 97%|█████████▋| 791/817 [00:15<00:00, 48.96it/s]

 99%|█████████▊| 806/817 [00:15<00:00, 73.50it/s]

100%|█████████▉| 814/817 [00:15<00:00, 68.93it/s]

100%|██████████| 817/817 [00:15<00:00, 51.57it/s]




In [9]:
# Write one CSV per disease: the per-sample frames are joined column-wise, so
# each file has one row per gene symbol and one column per sample.
# NOTE(review): the inputs above are read from under 'inflammabucket_bkp/'
# while this output path is not — confirm the destination is intended.
shap_avg_dir = here('03_downstream_analysis/08_gene_importance/new_shap_plots/results/SHAP_AVGsamples')

# Create the destination up front so a missing directory does not fail the
# export after the expensive loading and averaging steps have already run.
os.makedirs(shap_avg_dir, exist_ok=True)

for d in tqdm(DISEASES):
    pd.concat(diseaseDict[d], axis=1).to_csv(
        os.path.join(shap_avg_dir, f'SHAP_AVGsample_{cell_type}_{d}.csv'))

  0%|          | 0/20 [00:00<?, ?it/s]

  5%|▌         | 1/20 [00:00<00:14,  1.31it/s]

 10%|█         | 2/20 [00:01<00:14,  1.27it/s]

 15%|█▌        | 3/20 [00:02<00:13,  1.27it/s]

 20%|██        | 4/20 [00:03<00:13,  1.21it/s]

 25%|██▌       | 5/20 [00:03<00:12,  1.25it/s]

 30%|███       | 6/20 [00:04<00:10,  1.30it/s]

 35%|███▌      | 7/20 [00:05<00:10,  1.23it/s]

 40%|████      | 8/20 [00:06<00:09,  1.28it/s]

 45%|████▌     | 9/20 [00:06<00:07,  1.39it/s]

 50%|█████     | 10/20 [00:07<00:07,  1.42it/s]

 55%|█████▌    | 11/20 [00:08<00:07,  1.20it/s]

 60%|██████    | 12/20 [00:09<00:06,  1.23it/s]

 65%|██████▌   | 13/20 [00:10<00:05,  1.25it/s]

 70%|███████   | 14/20 [00:11<00:05,  1.10it/s]

 75%|███████▌  | 15/20 [00:12<00:04,  1.14it/s]

 80%|████████  | 16/20 [00:12<00:03,  1.17it/s]

 85%|████████▌ | 17/20 [00:14<00:02,  1.10it/s]

 90%|█████████ | 18/20 [00:14<00:01,  1.13it/s]

 95%|█████████▌| 19/20 [00:15<00:00,  1.15it/s]

100%|██████████| 20/20 [00:16<00:00,  1.05it/s]

100%|██████████| 20/20 [00:16<00:00,  1.19it/s]


