In [1]:
# Parameters
# Cell-type label for this run. It is interpolated into the SHAP-value
# file-name pattern, the input .h5ad path and the output CSV file names
# used by the cells below.
cell_type = "T_CD4_NonNaive"


In [2]:
import sys
import os

import anndata as ad

import pandas as pd
import numpy as np

from pyprojroot import here

import re

from tqdm import tqdm

In [3]:
def load_sorted_shap_values_fnames(
    cell_type: str = '',
    run_name: str = 'run1',
    dirpath=None):
    """Return the SHAP-value batch files of one run / cell type in batch order.

    A file is selected when its name starts with
    ``{run_name}_{cell_type}_shap_values_<digits>``; ``<digits>`` is read as
    the batch number.

    Parameters
    ----------
    cell_type : str
        Cell-type label expected in the file name. Matched literally.
    run_name : str
        Run prefix expected in the file name. Matched literally.
    dirpath : str or path-like, optional
        Directory to scan. When omitted, the project's ``shap_vals`` results
        directory (resolved with ``here``) is used.

    Returns
    -------
    list of str
        ``dirpath`` joined with each matching file name, ordered by
        increasing batch number (ties broken by file name). Empty when
        nothing matches.
    """
    if dirpath is None:
        dirpath = here("inflammabucket_bkp/03_downstream_analysis/05_SHAP/results/04_shap/shap_vals")

    # Escape both labels so that characters such as '+' or '.' in a run or
    # cell-type name are matched literally rather than as regex operators.
    fname_pattern = re.compile(
        f'^{re.escape(run_name)}_{re.escape(cell_type)}_shap_values_' + r'(\d+)')

    numbered_files = []
    for fname in os.listdir(dirpath):
        hit = fname_pattern.search(fname)
        if hit is not None:
            numbered_files.append((int(hit.group(1)), fname))

    # os.listdir gives no ordering guarantee; sort numerically on the batch
    # number (so 10 comes after 2) and fall back to the name for ties.
    numbered_files.sort()

    return [os.path.join(dirpath, fname) for _, fname in numbered_files]

In [4]:
# Load the 'shap_values' array of every batch file, in batch order.
shap_val_list = []
for fname in tqdm(load_sorted_shap_values_fnames(cell_type)):
    # np.load on an .npz archive returns a lazily-read NpzFile that holds an
    # open file handle; the context manager closes it once the array has
    # been read into memory.
    with np.load(fname) as npz_file:
        shap_val_list.append(npz_file['shap_values'])


  0%|          | 0/72 [00:00<?, ?it/s]

  1%|▏         | 1/72 [00:03<04:02,  3.42s/it]

  3%|▎         | 2/72 [00:07<04:06,  3.52s/it]

  4%|▍         | 3/72 [00:10<04:05,  3.55s/it]

  6%|▌         | 4/72 [00:14<04:01,  3.56s/it]

  7%|▋         | 5/72 [00:17<03:58,  3.55s/it]

  8%|▊         | 6/72 [00:21<03:54,  3.55s/it]

 10%|▉         | 7/72 [00:24<03:51,  3.56s/it]

 11%|█         | 8/72 [00:28<03:47,  3.55s/it]

 12%|█▎        | 9/72 [00:31<03:43,  3.55s/it]

 14%|█▍        | 10/72 [00:35<03:39,  3.55s/it]

 15%|█▌        | 11/72 [00:39<03:37,  3.57s/it]

 17%|█▋        | 12/72 [00:42<03:33,  3.56s/it]

 18%|█▊        | 13/72 [00:46<03:29,  3.55s/it]

 19%|█▉        | 14/72 [00:49<03:25,  3.55s/it]

 21%|██        | 15/72 [00:53<03:22,  3.54s/it]

 22%|██▏       | 16/72 [00:56<03:20,  3.57s/it]

 24%|██▎       | 17/72 [01:00<03:16,  3.57s/it]

 25%|██▌       | 18/72 [01:04<03:13,  3.58s/it]

 26%|██▋       | 19/72 [01:07<03:09,  3.58s/it]

 28%|██▊       | 20/72 [01:11<03:06,  3.59s/it]

 29%|██▉       | 21/72 [01:14<03:02,  3.57s/it]

 31%|███       | 22/72 [01:18<02:57,  3.55s/it]

 32%|███▏      | 23/72 [01:21<02:54,  3.56s/it]

 33%|███▎      | 24/72 [01:25<02:50,  3.55s/it]

 35%|███▍      | 25/72 [01:28<02:45,  3.53s/it]

 36%|███▌      | 26/72 [01:32<02:42,  3.54s/it]

 38%|███▊      | 27/72 [01:35<02:38,  3.51s/it]

 39%|███▉      | 28/72 [01:39<02:35,  3.52s/it]

 40%|████      | 29/72 [01:42<02:31,  3.52s/it]

 42%|████▏     | 30/72 [01:46<02:27,  3.52s/it]

 43%|████▎     | 31/72 [01:49<02:24,  3.52s/it]

 44%|████▍     | 32/72 [01:53<02:21,  3.53s/it]

 46%|████▌     | 33/72 [01:57<02:17,  3.53s/it]

 47%|████▋     | 34/72 [02:00<02:14,  3.53s/it]

 49%|████▊     | 35/72 [02:04<02:10,  3.53s/it]

 50%|█████     | 36/72 [02:07<02:06,  3.52s/it]

 51%|█████▏    | 37/72 [02:11<02:03,  3.52s/it]

 53%|█████▎    | 38/72 [02:14<02:00,  3.55s/it]

 54%|█████▍    | 39/72 [02:18<01:56,  3.53s/it]

 56%|█████▌    | 40/72 [02:21<01:52,  3.51s/it]

 57%|█████▋    | 41/72 [02:25<01:49,  3.53s/it]

 58%|█████▊    | 42/72 [02:28<01:45,  3.52s/it]

 60%|█████▉    | 43/72 [02:32<01:41,  3.51s/it]

 61%|██████    | 44/72 [02:35<01:38,  3.51s/it]

 62%|██████▎   | 45/72 [02:39<01:34,  3.49s/it]

 64%|██████▍   | 46/72 [02:42<01:30,  3.50s/it]

 65%|██████▌   | 47/72 [02:46<01:27,  3.51s/it]

 67%|██████▋   | 48/72 [02:49<01:24,  3.51s/it]

 68%|██████▊   | 49/72 [02:53<01:20,  3.52s/it]

 69%|██████▉   | 50/72 [02:56<01:17,  3.52s/it]

 71%|███████   | 51/72 [03:00<01:13,  3.52s/it]

 72%|███████▏  | 52/72 [03:03<01:10,  3.53s/it]

 74%|███████▎  | 53/72 [03:07<01:07,  3.53s/it]

 75%|███████▌  | 54/72 [03:10<01:03,  3.54s/it]

 76%|███████▋  | 55/72 [03:14<01:00,  3.55s/it]

 78%|███████▊  | 56/72 [03:18<00:56,  3.54s/it]

 79%|███████▉  | 57/72 [03:21<00:52,  3.53s/it]

 81%|████████  | 58/72 [03:25<00:49,  3.53s/it]

 82%|████████▏ | 59/72 [03:28<00:45,  3.53s/it]

 83%|████████▎ | 60/72 [03:32<00:42,  3.53s/it]

 85%|████████▍ | 61/72 [03:35<00:38,  3.50s/it]

 86%|████████▌ | 62/72 [03:39<00:34,  3.50s/it]

 88%|████████▊ | 63/72 [03:42<00:31,  3.50s/it]

 89%|████████▉ | 64/72 [03:46<00:27,  3.48s/it]

 90%|█████████ | 65/72 [03:49<00:24,  3.50s/it]

 92%|█████████▏| 66/72 [03:53<00:21,  3.51s/it]

 93%|█████████▎| 67/72 [03:56<00:17,  3.50s/it]

 94%|█████████▍| 68/72 [04:00<00:14,  3.51s/it]

 96%|█████████▌| 69/72 [04:03<00:10,  3.52s/it]

 97%|█████████▋| 70/72 [04:07<00:07,  3.52s/it]

 99%|█████████▊| 71/72 [04:10<00:03,  3.52s/it]

100%|██████████| 72/72 [04:11<00:00,  2.72s/it]

100%|██████████| 72/72 [04:11<00:00,  3.49s/it]




In [5]:
# Open the merged AnnData object of this cell type with backed='r'.
adata = ad.read_h5ad(
    here(f'inflammabucket_bkp/03_downstream_analysis/05_SHAP/data/{cell_type}_adataMerged_SPECTRAgenes.log1p.h5ad'),
    backed='r',
)

In [6]:
## Gene symbols and Disease labels
# NOTE(review): read_pickle unpickles the file, which can run arbitrary code;
# only load this list from a trusted location.
symbols_df = pd.read_pickle(here('inflammabucket_bkp/03_downstream_analysis/04_selected_gene_list.pkl'))

# Gene symbols re-ordered to follow the order of adata.var_names.
symbols_sorted = symbols_df.loc[adata.var_names].symbol.values

# Disease labels, used further down as column names of the per-sample
# gene x disease tables.
# NOTE(review): the order presumably has to match the class order of the
# SHAP output — confirm against the code that produced the SHAP values.
DISEASES = [
    'BRCA', 'CD', 'COPD', 'COVID', 'CRC',
    'HBV', 'HIV', 'HNSCC', 'MS', 'NPC',
    'PS', 'PSA', 'RA', 'SLE', 'UC',
    'asthma', 'cirrhosis', 'flu', 'healthy', 'sepsis',
]

# One independent, initially empty list per disease.
diseaseDict = {disease: [] for disease in DISEASES}

In [7]:
# Stack the per-batch SHAP arrays along their first axis into one array.
shape_values_matrix = np.concatenate(shap_val_list, axis=0)

In [8]:
# Average the SHAP values over the cells of each sample and collect, per
# disease, one single-column table (genes x that sample).

# Start from empty lists every time this cell runs; otherwise a second
# execution would append every sample again and duplicate the columns of
# the exported tables.
diseaseDict = {d: [] for d in DISEASES}

# The SHAP array is indexed below with row positions taken from adata.obs,
# so both must describe the same number of rows.
if shape_values_matrix.shape[0] != len(adata.obs):
    raise ValueError(
        f'SHAP matrix has {shape_values_matrix.shape[0]} rows but adata has '
        f'{len(adata.obs)} observations; the SHAP batches do not line up with adata.obs.')

# Mapping sampleID -> integer row positions of that sample's cells.
rows_per_sample = adata.obs.groupby('sampleID', observed=True).indices

for sample_id, row_positions in tqdm(rows_per_sample.items()):
    # Mean over the cell axis leaves a genes x diseases table.
    geneXdisease_sample_i = pd.DataFrame(
        shape_values_matrix[row_positions].mean(0),
        index=symbols_sorted,
        columns=DISEASES,
    )
    for d in DISEASES:
        diseaseDict[d].append(pd.DataFrame({sample_id: geneXdisease_sample_i[d]}))

  0%|          | 0/816 [00:00<?, ?it/s]

  1%|          | 5/816 [00:00<00:19, 41.91it/s]

  1%|          | 10/816 [00:00<00:21, 37.71it/s]

  2%|▏         | 14/816 [00:00<00:21, 37.94it/s]

  2%|▏         | 19/816 [00:00<00:19, 41.78it/s]

  3%|▎         | 27/816 [00:00<00:14, 54.34it/s]

  4%|▍         | 33/816 [00:00<00:14, 54.68it/s]

  5%|▌         | 41/816 [00:00<00:13, 58.89it/s]

  6%|▌         | 49/816 [00:00<00:11, 64.27it/s]

  7%|▋         | 56/816 [00:01<00:12, 60.13it/s]

  8%|▊         | 63/816 [00:01<00:11, 62.85it/s]

  9%|▊         | 70/816 [00:01<00:12, 62.06it/s]

  9%|▉         | 77/816 [00:01<00:12, 61.41it/s]

 10%|█         | 84/816 [00:01<00:12, 58.99it/s]

 11%|█▏        | 92/816 [00:01<00:11, 64.30it/s]

 13%|█▎        | 110/816 [00:01<00:07, 94.46it/s]

 16%|█▌        | 128/816 [00:01<00:05, 116.75it/s]

 17%|█▋        | 140/816 [00:02<00:08, 78.59it/s] 

 18%|█▊        | 150/816 [00:02<00:08, 80.64it/s]

 20%|█▉        | 162/816 [00:02<00:07, 87.17it/s]

 21%|██        | 172/816 [00:02<00:08, 78.04it/s]

 22%|██▏       | 183/816 [00:02<00:07, 85.29it/s]

 24%|██▎       | 193/816 [00:02<00:07, 86.01it/s]

 25%|██▌       | 207/816 [00:02<00:06, 98.77it/s]

 27%|██▋       | 218/816 [00:02<00:06, 93.71it/s]

 28%|██▊       | 231/816 [00:03<00:05, 100.84it/s]

 30%|██▉       | 242/816 [00:03<00:05, 96.54it/s] 

 31%|███       | 253/816 [00:03<00:05, 98.50it/s]

 32%|███▏      | 265/816 [00:03<00:05, 102.55it/s]

 34%|███▍      | 276/816 [00:03<00:05, 92.59it/s] 

 35%|███▌      | 288/816 [00:03<00:05, 99.27it/s]

 37%|███▋      | 299/816 [00:03<00:05, 100.71it/s]

 38%|███▊      | 310/816 [00:03<00:05, 88.19it/s] 

 40%|███▉      | 323/816 [00:03<00:05, 98.24it/s]

 41%|████      | 336/816 [00:04<00:04, 105.03it/s]

 43%|████▎     | 347/816 [00:04<00:05, 88.93it/s] 

 44%|████▍     | 359/816 [00:04<00:04, 96.42it/s]

 46%|████▌     | 373/816 [00:04<00:04, 103.80it/s]

 47%|████▋     | 384/816 [00:04<00:04, 92.91it/s] 

 50%|████▉     | 406/816 [00:04<00:03, 124.20it/s]

 51%|█████▏    | 420/816 [00:04<00:03, 117.55it/s]

 53%|█████▎    | 433/816 [00:05<00:03, 113.12it/s]

 55%|█████▍    | 445/816 [00:05<00:03, 96.77it/s] 

 56%|█████▌    | 456/816 [00:05<00:03, 93.60it/s]

 57%|█████▋    | 466/816 [00:05<00:04, 85.34it/s]

 58%|█████▊    | 475/816 [00:05<00:04, 80.78it/s]

 59%|█████▉    | 484/816 [00:05<00:04, 80.08it/s]

 60%|██████    | 493/816 [00:05<00:04, 78.11it/s]

 62%|██████▏   | 502/816 [00:05<00:03, 79.79it/s]

 63%|██████▎   | 511/816 [00:06<00:03, 82.41it/s]

 64%|██████▎   | 520/816 [00:06<00:03, 79.93it/s]

 65%|██████▍   | 529/816 [00:06<00:03, 76.25it/s]

 66%|██████▌   | 537/816 [00:06<00:03, 73.03it/s]

 67%|██████▋   | 546/816 [00:06<00:03, 75.86it/s]

 68%|██████▊   | 554/816 [00:06<00:03, 71.69it/s]

 69%|██████▉   | 565/816 [00:06<00:03, 78.02it/s]

 70%|███████   | 573/816 [00:06<00:03, 76.36it/s]

 71%|███████   | 581/816 [00:06<00:03, 73.75it/s]

 72%|███████▏  | 591/816 [00:07<00:02, 77.12it/s]

 73%|███████▎  | 599/816 [00:07<00:02, 77.58it/s]

 74%|███████▍  | 607/816 [00:07<00:02, 76.92it/s]

 75%|███████▌  | 615/816 [00:07<00:02, 74.89it/s]

 76%|███████▋  | 624/816 [00:07<00:02, 76.90it/s]

 77%|███████▋  | 632/816 [00:07<00:03, 60.40it/s]

 78%|███████▊  | 639/816 [00:07<00:03, 56.53it/s]

 79%|███████▉  | 646/816 [00:07<00:02, 57.98it/s]

 80%|████████  | 653/816 [00:08<00:03, 47.33it/s]

 81%|████████  | 661/816 [00:08<00:02, 51.84it/s]

 82%|████████▏ | 667/816 [00:08<00:03, 48.02it/s]

 82%|████████▏ | 673/816 [00:08<00:03, 47.16it/s]

 83%|████████▎ | 678/816 [00:08<00:02, 47.75it/s]

 84%|████████▎ | 683/816 [00:08<00:02, 47.62it/s]

 84%|████████▍ | 688/816 [00:08<00:02, 48.16it/s]

 85%|████████▌ | 694/816 [00:09<00:02, 48.02it/s]

 86%|████████▌ | 700/816 [00:09<00:02, 49.57it/s]

 87%|████████▋ | 706/816 [00:09<00:02, 46.71it/s]

 87%|████████▋ | 711/816 [00:09<00:02, 41.45it/s]

 88%|████████▊ | 716/816 [00:09<00:02, 41.20it/s]

 88%|████████▊ | 721/816 [00:09<00:02, 39.99it/s]

 89%|████████▉ | 726/816 [00:09<00:02, 37.41it/s]

 89%|████████▉ | 730/816 [00:09<00:02, 37.04it/s]

 90%|█████████ | 735/816 [00:10<00:02, 38.99it/s]

 91%|█████████ | 741/816 [00:10<00:01, 41.99it/s]

 91%|█████████▏| 746/816 [00:10<00:01, 41.90it/s]

 92%|█████████▏| 751/816 [00:10<00:01, 40.18it/s]

 93%|█████████▎| 756/816 [00:10<00:01, 37.22it/s]

 93%|█████████▎| 761/816 [00:10<00:01, 38.20it/s]

 94%|█████████▍| 766/816 [00:10<00:01, 40.77it/s]

 94%|█████████▍| 771/816 [00:10<00:01, 40.23it/s]

 95%|█████████▌| 777/816 [00:11<00:00, 42.98it/s]

 96%|█████████▌| 782/816 [00:11<00:00, 40.36it/s]

 96%|█████████▋| 787/816 [00:11<00:00, 35.74it/s]

 97%|█████████▋| 794/816 [00:11<00:00, 42.62it/s]

 99%|█████████▉| 806/816 [00:11<00:00, 61.09it/s]

100%|█████████▉| 813/816 [00:11<00:00, 59.86it/s]

100%|██████████| 816/816 [00:11<00:00, 69.23it/s]




In [9]:
# Write one CSV per disease: the per-sample columns collected in
# diseaseDict[d] are joined side by side (axis=1).
# NOTE(review): the inputs above are read from under 'inflammabucket_bkp/'
# while this output path has no such prefix — confirm that is intended.
shap_avg_dir = here('03_downstream_analysis/08_gene_importance/new_shap_plots/results/SHAP_AVGsamples')

# Create the results directory when it is missing so the export does not
# fail on a fresh checkout.
os.makedirs(shap_avg_dir, exist_ok=True)

for d in tqdm(DISEASES):
    csv_path = os.path.join(shap_avg_dir, f'SHAP_AVGsample_{cell_type}_{d}.csv')
    pd.concat(diseaseDict[d], axis=1).to_csv(csv_path)

  0%|          | 0/20 [00:00<?, ?it/s]

  5%|▌         | 1/20 [00:00<00:12,  1.48it/s]

 10%|█         | 2/20 [00:01<00:12,  1.44it/s]

 15%|█▌        | 3/20 [00:02<00:11,  1.46it/s]

 20%|██        | 4/20 [00:02<00:11,  1.35it/s]

 25%|██▌       | 5/20 [00:03<00:10,  1.45it/s]

 30%|███       | 6/20 [00:04<00:09,  1.48it/s]

 35%|███▌      | 7/20 [00:04<00:08,  1.60it/s]

 40%|████      | 8/20 [00:05<00:07,  1.54it/s]

 45%|████▌     | 9/20 [00:05<00:06,  1.64it/s]

 50%|█████     | 10/20 [00:06<00:05,  1.71it/s]

 55%|█████▌    | 11/20 [00:07<00:05,  1.52it/s]

 60%|██████    | 12/20 [00:07<00:05,  1.53it/s]

 65%|██████▌   | 13/20 [00:08<00:04,  1.50it/s]

 70%|███████   | 14/20 [00:09<00:04,  1.39it/s]

 75%|███████▌  | 15/20 [00:10<00:03,  1.40it/s]

 80%|████████  | 16/20 [00:10<00:02,  1.43it/s]

 85%|████████▌ | 17/20 [00:11<00:02,  1.50it/s]

 90%|█████████ | 18/20 [00:12<00:01,  1.37it/s]

 95%|█████████▌| 19/20 [00:12<00:00,  1.37it/s]

100%|██████████| 20/20 [00:13<00:00,  1.35it/s]

100%|██████████| 20/20 [00:13<00:00,  1.45it/s]


