In [1]:
# Parameters
# Cell type whose SHAP values are processed. The value is interpolated into the
# SHAP batch file names, the AnnData file name and the output CSV names.
# NOTE(review): the "# Parameters" header looks like a papermill-style
# parameters cell, i.e. this value may be overridden at execution time — confirm.
cell_type = "ILC"


In [2]:
import sys
import os

import anndata as ad

import pandas as pd
import numpy as np

from pyprojroot import here

import re

from tqdm import tqdm

In [3]:
def load_sorted_shap_values_fnames(
    cell_type: str = '',
    run_name: str = 'run1',
    dirpath=None):
    """Return the SHAP-value batch files of one run / cell type, ordered by batch number.

    A file is selected when its name starts with
    ``{run_name}_{cell_type}_shap_values_<digits>``; the digits are parsed as
    the batch number and the files are sorted numerically on it (so batch 2
    comes before batch 10).

    Parameters
    ----------
    cell_type : str
        Cell type label embedded in the file names. It is matched literally.
    run_name : str
        Run label that prefixes the file names. It is matched literally.
    dirpath : path-like, optional
        Directory to scan. When omitted, the project's default SHAP results
        directory (resolved with ``here``) is used.

    Returns
    -------
    list of str
        Paths (``dirpath`` joined with the file name) in ascending batch
        order. Empty when no file matches.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when ``dirpath`` cannot be listed.
    """
    if dirpath is None:
        dirpath = here("inflammabucket_bkp/03_downstream_analysis/05_SHAP/results/04_shap/shap_vals")

    # Escape both labels: a cell type containing regex metacharacters
    # (e.g. '.' or '+') must not be interpreted as a pattern.
    fname_pattern = re.compile(
        f'{re.escape(run_name)}_{re.escape(cell_type)}_shap_values_' + r'(\d+)')

    # `match` anchors at the start of the name; each name is tested only once.
    numbered_files = []
    for fname in os.listdir(dirpath):
        hit = fname_pattern.match(fname)
        if hit:
            numbered_files.append((fname, int(hit.group(1))))

    # Numeric sort on the batch number (a plain string sort would put 10 before 2).
    numbered_files.sort(key=lambda pair: pair[1])

    return [os.path.join(dirpath, fname) for fname, _ in numbered_files]

In [4]:
# Read the 'shap_values' array out of every batch file, in batch order.
# Each archive is opened in a `with` block so its file handle is closed as
# soon as the array has been read, instead of staying open once per batch.
# NOTE(review): assumes the batch files are .npz archives (the string-key
# lookup suggests so) — confirm.
shap_val_list = []
for fname in tqdm(load_sorted_shap_values_fnames(cell_type)):
    with np.load(fname) as shap_batch:
        shap_val_list.append(shap_batch['shap_values'])


  0%|          | 0/38 [00:00<?, ?it/s]

  3%|▎         | 1/38 [00:03<02:21,  3.83s/it]

  5%|▌         | 2/38 [00:07<02:21,  3.92s/it]

  8%|▊         | 3/38 [00:11<02:15,  3.86s/it]

 11%|█         | 4/38 [00:16<02:31,  4.46s/it]

 13%|█▎        | 5/38 [00:21<02:24,  4.39s/it]

 16%|█▌        | 6/38 [00:25<02:16,  4.25s/it]

 18%|█▊        | 7/38 [00:29<02:07,  4.11s/it]

 21%|██        | 8/38 [00:34<02:15,  4.51s/it]

 24%|██▎       | 9/38 [00:38<02:05,  4.31s/it]

 26%|██▋       | 10/38 [00:41<01:54,  4.09s/it]

 29%|██▉       | 11/38 [00:46<01:51,  4.13s/it]

 32%|███▏      | 12/38 [00:50<01:45,  4.07s/it]

 34%|███▍      | 13/38 [00:53<01:36,  3.88s/it]

 37%|███▋      | 14/38 [00:56<01:29,  3.74s/it]

 39%|███▉      | 15/38 [01:02<01:35,  4.16s/it]

 42%|████▏     | 16/38 [01:06<01:31,  4.18s/it]

 45%|████▍     | 17/38 [01:11<01:33,  4.47s/it]

 47%|████▋     | 18/38 [01:16<01:31,  4.56s/it]

 50%|█████     | 19/38 [01:20<01:24,  4.45s/it]

 53%|█████▎    | 20/38 [01:23<01:14,  4.14s/it]

 55%|█████▌    | 21/38 [01:27<01:08,  4.02s/it]

 58%|█████▊    | 22/38 [01:31<01:02,  3.90s/it]

 61%|██████    | 23/38 [01:34<00:57,  3.85s/it]

 63%|██████▎   | 24/38 [01:39<00:55,  3.98s/it]

 66%|██████▌   | 25/38 [01:43<00:51,  3.95s/it]

 68%|██████▊   | 26/38 [01:46<00:45,  3.83s/it]

 71%|███████   | 27/38 [01:49<00:40,  3.69s/it]

 74%|███████▎  | 28/38 [01:53<00:35,  3.60s/it]

 76%|███████▋  | 29/38 [01:56<00:32,  3.57s/it]

 79%|███████▉  | 30/38 [02:00<00:27,  3.48s/it]

 82%|████████▏ | 31/38 [02:03<00:23,  3.43s/it]

 84%|████████▍ | 32/38 [02:06<00:20,  3.38s/it]

 87%|████████▋ | 33/38 [02:10<00:16,  3.37s/it]

 89%|████████▉ | 34/38 [02:13<00:13,  3.34s/it]

 92%|█████████▏| 35/38 [02:16<00:09,  3.31s/it]

 95%|█████████▍| 36/38 [02:19<00:06,  3.30s/it]

 97%|█████████▋| 37/38 [02:23<00:03,  3.28s/it]

100%|██████████| 38/38 [02:25<00:00,  2.97s/it]

100%|██████████| 38/38 [02:25<00:00,  3.82s/it]




In [5]:
# Open the merged AnnData file of the selected cell type in read-only backed mode.
adata_path = here(
    f'inflammabucket_bkp/03_downstream_analysis/05_SHAP/data/{cell_type}_adataMerged_SPECTRAgenes.log1p.h5ad'
)
adata = ad.read_h5ad(adata_path, backed='r')

In [6]:
## Gene symbols (in the AnnData variable order) and disease labels
symbols_df = pd.read_pickle(here('inflammabucket_bkp/03_downstream_analysis/04_selected_gene_list.pkl'))
# Look up the symbol of every variable, following the order of adata.var_names
symbols_sorted = symbols_df.loc[adata.var_names]['symbol'].values

DISEASES = [
    'BRCA', 'CD', 'COPD', 'COVID', 'CRC',
    'HBV', 'HIV', 'HNSCC', 'MS', 'NPC',
    'PS', 'PSA', 'RA', 'SLE', 'UC',
    'asthma', 'cirrhosis', 'flu', 'healthy', 'sepsis',
]

# One independent, initially empty accumulator list per disease label
diseaseDict = {disease: [] for disease in DISEASES}

In [7]:
# Stack the per-batch SHAP arrays along their first axis into a single array
shape_values_matrix = np.concatenate(shap_val_list, axis=0)

In [8]:
# Average the SHAP values over the cells of each sample and collect, per
# disease, one single-column frame (genes x sampleID) per sample.

# Start from empty accumulators so that re-running this cell does not append
# every sample a second time (which would duplicate columns in the export).
diseaseDict = {d: [] for d in DISEASES}

# The positional indices taken from adata.obs are used to slice the SHAP
# matrix, so both must describe the same number of cells; otherwise the rows
# would be silently misaligned.
if shape_values_matrix.shape[0] != adata.n_obs:
    raise ValueError(
        f"SHAP matrix has {shape_values_matrix.shape[0]} rows "
        f"but adata has {adata.n_obs} cells"
    )

for idx, values in tqdm(adata.obs.groupby('sampleID', observed=True).indices.items()):
    # Mean over the cells (axis 0) of this sample; the result is labelled with
    # gene symbols as rows and diseases as columns. The constructor raises if
    # the averaged array does not have that shape.
    geneXdisease_sample_i = pd.DataFrame(
        shape_values_matrix[values].mean(0),
        index=symbols_sorted,
        columns=DISEASES,
    )
    for d in DISEASES:
        # Single-column frame named after the sample, ready for column-wise concat
        diseaseDict[d].append(geneXdisease_sample_i[d].to_frame(name=idx))

  0%|          | 0/817 [00:00<?, ?it/s]

  1%|          | 9/817 [00:00<00:09, 89.62it/s]

  2%|▏         | 18/817 [00:00<00:12, 66.23it/s]

  4%|▍         | 36/817 [00:00<00:07, 108.81it/s]

  6%|▌         | 48/817 [00:00<00:07, 109.28it/s]

  8%|▊         | 63/817 [00:00<00:06, 120.48it/s]

 10%|▉         | 78/817 [00:00<00:05, 128.69it/s]

 11%|█▏        | 92/817 [00:00<00:05, 131.73it/s]

 14%|█▍        | 118/817 [00:00<00:04, 169.49it/s]

 17%|█▋        | 136/817 [00:01<00:05, 128.15it/s]

 19%|█▊        | 152/817 [00:01<00:04, 134.85it/s]

 21%|██        | 168/817 [00:01<00:05, 119.79it/s]

 22%|██▏       | 183/817 [00:01<00:05, 126.52it/s]

 25%|██▍       | 201/817 [00:01<00:04, 139.31it/s]

 27%|██▋       | 218/817 [00:01<00:04, 146.78it/s]

 29%|██▉       | 235/817 [00:01<00:03, 152.38it/s]

 31%|███       | 251/817 [00:01<00:03, 145.46it/s]

 33%|███▎      | 268/817 [00:02<00:03, 146.51it/s]

 35%|███▍      | 283/817 [00:02<00:03, 147.00it/s]

 36%|███▋      | 298/817 [00:02<00:03, 146.70it/s]

 38%|███▊      | 313/817 [00:02<00:03, 126.09it/s]

 40%|████      | 330/817 [00:02<00:03, 134.24it/s]

 42%|████▏     | 344/817 [00:02<00:03, 125.99it/s]

 44%|████▎     | 357/817 [00:02<00:03, 119.82it/s]

 46%|████▌     | 377/817 [00:02<00:03, 139.84it/s]

 48%|████▊     | 392/817 [00:02<00:02, 142.27it/s]

 50%|█████     | 411/817 [00:03<00:03, 107.96it/s]

 52%|█████▏    | 428/817 [00:03<00:03, 118.97it/s]

 54%|█████▍    | 442/817 [00:03<00:03, 114.86it/s]

 56%|█████▌    | 455/817 [00:03<00:03, 114.75it/s]

 58%|█████▊    | 472/817 [00:03<00:02, 126.58it/s]

 60%|█████▉    | 490/817 [00:03<00:02, 136.65it/s]

 62%|██████▏   | 505/817 [00:03<00:02, 133.69it/s]

 64%|██████▎   | 519/817 [00:03<00:02, 135.32it/s]

 65%|██████▌   | 533/817 [00:04<00:02, 126.72it/s]

 67%|██████▋   | 548/817 [00:04<00:02, 131.43it/s]

 69%|██████▉   | 562/817 [00:04<00:02, 122.68it/s]

 70%|███████   | 575/817 [00:04<00:02, 117.93it/s]

 72%|███████▏  | 587/817 [00:04<00:01, 116.79it/s]

 73%|███████▎  | 599/817 [00:04<00:02, 105.36it/s]

 75%|███████▍  | 610/817 [00:04<00:02, 92.02it/s] 

 76%|███████▌  | 620/817 [00:05<00:02, 90.07it/s]

 77%|███████▋  | 630/817 [00:05<00:02, 82.32it/s]

 78%|███████▊  | 640/817 [00:05<00:02, 84.29it/s]

 79%|███████▉  | 649/817 [00:05<00:02, 79.06it/s]

 81%|████████  | 658/817 [00:05<00:02, 66.96it/s]

 82%|████████▏ | 667/817 [00:05<00:02, 69.52it/s]

 83%|████████▎ | 675/817 [00:05<00:02, 70.04it/s]

 84%|████████▎ | 683/817 [00:05<00:01, 70.05it/s]

 85%|████████▍ | 693/817 [00:06<00:01, 76.58it/s]

 86%|████████▌ | 704/817 [00:06<00:01, 84.33it/s]

 87%|████████▋ | 713/817 [00:06<00:01, 79.85it/s]

 88%|████████▊ | 722/817 [00:06<00:01, 79.78it/s]

 89%|████████▉ | 731/817 [00:06<00:01, 70.19it/s]

 91%|█████████ | 741/817 [00:06<00:01, 75.90it/s]

 92%|█████████▏| 749/817 [00:06<00:00, 69.56it/s]

 93%|█████████▎| 757/817 [00:06<00:00, 67.54it/s]

 94%|█████████▍| 768/817 [00:07<00:00, 77.25it/s]

 95%|█████████▌| 778/817 [00:07<00:00, 82.40it/s]

 96%|█████████▋| 787/817 [00:07<00:00, 74.43it/s]

 98%|█████████▊| 800/817 [00:07<00:00, 85.86it/s]

 99%|█████████▉| 809/817 [00:07<00:00, 85.25it/s]

100%|██████████| 817/817 [00:07<00:00, 107.73it/s]




In [9]:
# Write one CSV per disease, with the per-sample frames joined as columns.
# The output directory is resolved once and created if missing, so the export
# does not fail on a fresh checkout.
shap_avg_dir = here('03_downstream_analysis/08_gene_importance/new_shap_plots/results/SHAP_AVGsamples')
os.makedirs(shap_avg_dir, exist_ok=True)

for d in tqdm(DISEASES):
    # Concatenate the per-sample columns side by side for this disease
    disease_table = pd.concat(diseaseDict[d], axis=1)
    disease_table.to_csv(os.path.join(shap_avg_dir, f'SHAP_AVGsample_{cell_type}_{d}.csv'))

  0%|          | 0/20 [00:00<?, ?it/s]

  5%|▌         | 1/20 [00:00<00:12,  1.54it/s]

 10%|█         | 2/20 [00:01<00:16,  1.06it/s]

 15%|█▌        | 3/20 [00:03<00:19,  1.13s/it]

 20%|██        | 4/20 [00:04<00:17,  1.12s/it]

 25%|██▌       | 5/20 [00:05<00:16,  1.09s/it]

 30%|███       | 6/20 [00:06<00:14,  1.07s/it]

 35%|███▌      | 7/20 [00:07<00:13,  1.04s/it]

 40%|████      | 8/20 [00:08<00:14,  1.18s/it]

 45%|████▌     | 9/20 [00:09<00:10,  1.01it/s]

 50%|█████     | 10/20 [00:10<00:10,  1.04s/it]

 55%|█████▌    | 11/20 [00:12<00:11,  1.22s/it]

 60%|██████    | 12/20 [00:13<00:10,  1.25s/it]

 65%|██████▌   | 13/20 [00:14<00:09,  1.33s/it]

 70%|███████   | 14/20 [00:16<00:09,  1.52s/it]

 75%|███████▌  | 15/20 [00:18<00:08,  1.65s/it]

 80%|████████  | 16/20 [00:21<00:07,  1.79s/it]

 85%|████████▌ | 17/20 [00:22<00:04,  1.63s/it]

 90%|█████████ | 18/20 [00:24<00:03,  1.68s/it]

 95%|█████████▌| 19/20 [00:25<00:01,  1.60s/it]

100%|██████████| 20/20 [00:27<00:00,  1.62s/it]

100%|██████████| 20/20 [00:27<00:00,  1.36s/it]


