In [1]:
# Parameters
# Cell-type label for this run. It is interpolated into the SHAP batch file
# names, the AnnData file name and the output CSV file names used by the
# cells further down in this notebook.
# NOTE(review): presumably overridden per run by a notebook parameterisation
# tool — confirm before hard-coding another value here.
cell_type = "T_CD4_Naive"


In [2]:
import sys
import os

import anndata as ad

import pandas as pd
import numpy as np

from pyprojroot import here

import re

from tqdm import tqdm

In [3]:
def load_sorted_shap_values_fnames(
    cell_type: str = '',
    run_name: str = 'run1',
    dirpath=None):
    """Return the SHAP batch files of one run/cell type, ordered by batch number.

    A file is selected when its name starts with
    ``{run_name}_{cell_type}_shap_values_<digits>``; ``<digits>`` is the batch
    number used for sorting (numerically, so batch 10 comes after batch 2).

    Parameters
    ----------
    cell_type:
        Cell-type label embedded in the file names. It is matched literally
        (regex metacharacters such as ``+`` or ``.`` are escaped).
    run_name:
        Run prefix embedded in the file names, also matched literally.
    dirpath:
        Directory to scan. When ``None`` (default) the project-relative
        ``.../05_SHAP/results/04_shap/shap_vals`` directory is used.

    Returns
    -------
    list
        ``os.path.join(dirpath, filename)`` for every matching file, in
        ascending batch order. Empty when nothing matches.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when ``dirpath`` cannot be listed.
    """
    if dirpath is None:
        dirpath = here("inflammabucket_bkp/03_downstream_analysis/05_SHAP/results/04_shap/shap_vals")

    # Escape the user-supplied parts so labels containing regex
    # metacharacters cannot match (or miss) the wrong files.
    fname_pattern = re.compile(
        f'^{re.escape(run_name)}_{re.escape(cell_type)}_shap_values_' + r'(\d+)')

    # Run the regex once per file and keep (batch number, file name) pairs.
    numbered_files = []
    for fname in os.listdir(dirpath):
        hit = fname_pattern.search(fname)
        if hit:
            numbered_files.append((int(hit.group(1)), fname))

    # Sort on the batch number only, keeping listing order for ties.
    numbered_files.sort(key=lambda pair: pair[0])

    return [os.path.join(dirpath, fname) for _, fname in numbered_files]

In [4]:
# Load the 'shap_values' array of every batch file, in batch order.
shap_files = load_sorted_shap_values_fnames(cell_type)
if not shap_files:
    # Fail here with a clear message instead of later inside np.concatenate.
    raise FileNotFoundError(f"No SHAP value files found for cell type '{cell_type}'")

shap_val_list = []
for fname in tqdm(shap_files):
    # The context manager closes each archive as soon as its array is read.
    with np.load(fname) as shap_batch:
        shap_val_list.append(shap_batch['shap_values'])


  0%|          | 0/80 [00:00<?, ?it/s]

  1%|▏         | 1/80 [00:02<03:06,  2.36s/it]

  2%|▎         | 2/80 [00:04<03:04,  2.37s/it]

  4%|▍         | 3/80 [00:07<03:03,  2.39s/it]

  5%|▌         | 4/80 [00:09<03:01,  2.39s/it]

  6%|▋         | 5/80 [00:11<02:59,  2.40s/it]

  8%|▊         | 6/80 [00:14<02:57,  2.40s/it]

  9%|▉         | 7/80 [00:16<02:55,  2.40s/it]

 10%|█         | 8/80 [00:19<02:52,  2.40s/it]

 11%|█▏        | 9/80 [00:21<02:50,  2.40s/it]

 12%|█▎        | 10/80 [00:23<02:48,  2.40s/it]

 14%|█▍        | 11/80 [00:26<02:46,  2.41s/it]

 15%|█▌        | 12/80 [00:28<02:46,  2.45s/it]

 16%|█▋        | 13/80 [00:31<02:45,  2.48s/it]

 18%|█▊        | 14/80 [00:33<02:44,  2.49s/it]

 19%|█▉        | 15/80 [00:36<02:42,  2.50s/it]

 20%|██        | 16/80 [00:39<02:40,  2.51s/it]

 21%|██▏       | 17/80 [00:41<02:38,  2.52s/it]

 22%|██▎       | 18/80 [00:44<02:36,  2.52s/it]

 24%|██▍       | 19/80 [00:46<02:33,  2.52s/it]

 25%|██▌       | 20/80 [00:49<02:31,  2.52s/it]

 26%|██▋       | 21/80 [00:51<02:29,  2.53s/it]

 28%|██▊       | 22/80 [00:54<02:25,  2.51s/it]

 29%|██▉       | 23/80 [00:56<02:23,  2.53s/it]

 30%|███       | 24/80 [00:59<02:21,  2.52s/it]

 31%|███▏      | 25/80 [01:01<02:18,  2.51s/it]

 32%|███▎      | 26/80 [01:04<02:15,  2.51s/it]

 34%|███▍      | 27/80 [01:06<02:13,  2.52s/it]

 35%|███▌      | 28/80 [01:09<02:10,  2.50s/it]

 36%|███▋      | 29/80 [01:11<02:06,  2.48s/it]

 38%|███▊      | 30/80 [01:14<02:04,  2.49s/it]

 39%|███▉      | 31/80 [01:16<02:02,  2.49s/it]

 40%|████      | 32/80 [01:19<02:01,  2.53s/it]

 41%|████▏     | 33/80 [01:21<01:58,  2.51s/it]

 42%|████▎     | 34/80 [01:24<01:55,  2.50s/it]

 44%|████▍     | 35/80 [01:26<01:52,  2.49s/it]

 45%|████▌     | 36/80 [01:29<01:49,  2.49s/it]

 46%|████▋     | 37/80 [01:31<01:47,  2.51s/it]

 48%|████▊     | 38/80 [01:34<01:45,  2.51s/it]

 49%|████▉     | 39/80 [01:36<01:42,  2.50s/it]

 50%|█████     | 40/80 [01:39<01:39,  2.50s/it]

 51%|█████▏    | 41/80 [01:41<01:37,  2.50s/it]

 52%|█████▎    | 42/80 [01:44<01:34,  2.50s/it]

 54%|█████▍    | 43/80 [01:46<01:32,  2.51s/it]

 55%|█████▌    | 44/80 [01:49<01:30,  2.52s/it]

 56%|█████▋    | 45/80 [01:51<01:27,  2.49s/it]

 57%|█████▊    | 46/80 [01:54<01:24,  2.50s/it]

 59%|█████▉    | 47/80 [01:56<01:22,  2.51s/it]

 60%|██████    | 48/80 [01:59<01:19,  2.49s/it]

 61%|██████▏   | 49/80 [02:01<01:16,  2.48s/it]

 62%|██████▎   | 50/80 [02:04<01:15,  2.51s/it]

 64%|██████▍   | 51/80 [02:06<01:12,  2.49s/it]

 65%|██████▌   | 52/80 [02:09<01:09,  2.49s/it]

 66%|██████▋   | 53/80 [02:11<01:07,  2.49s/it]

 68%|██████▊   | 54/80 [02:14<01:04,  2.49s/it]

 69%|██████▉   | 55/80 [02:16<01:02,  2.49s/it]

 70%|███████   | 56/80 [02:19<00:59,  2.48s/it]

 71%|███████▏  | 57/80 [02:21<00:56,  2.46s/it]

 72%|███████▎  | 58/80 [02:24<00:54,  2.46s/it]

 74%|███████▍  | 59/80 [02:26<00:51,  2.46s/it]

 75%|███████▌  | 60/80 [02:28<00:49,  2.47s/it]

 76%|███████▋  | 61/80 [02:31<00:46,  2.46s/it]

 78%|███████▊  | 62/80 [02:33<00:44,  2.46s/it]

 79%|███████▉  | 63/80 [02:36<00:41,  2.45s/it]

 80%|████████  | 64/80 [02:38<00:39,  2.45s/it]

 81%|████████▏ | 65/80 [02:41<00:36,  2.44s/it]

 82%|████████▎ | 66/80 [02:43<00:34,  2.45s/it]

 84%|████████▍ | 67/80 [02:46<00:31,  2.44s/it]

 85%|████████▌ | 68/80 [02:48<00:29,  2.45s/it]

 86%|████████▋ | 69/80 [02:50<00:26,  2.45s/it]

 88%|████████▊ | 70/80 [02:53<00:24,  2.45s/it]

 89%|████████▉ | 71/80 [02:55<00:21,  2.44s/it]

 90%|█████████ | 72/80 [02:58<00:19,  2.43s/it]

 91%|█████████▏| 73/80 [03:00<00:17,  2.43s/it]

 92%|█████████▎| 74/80 [03:03<00:14,  2.43s/it]

 94%|█████████▍| 75/80 [03:05<00:12,  2.44s/it]

 95%|█████████▌| 76/80 [03:07<00:09,  2.43s/it]

 96%|█████████▋| 77/80 [03:10<00:07,  2.42s/it]

 98%|█████████▊| 78/80 [03:12<00:04,  2.43s/it]

 99%|█████████▉| 79/80 [03:15<00:02,  2.43s/it]

100%|██████████| 80/80 [03:16<00:00,  1.93s/it]

100%|██████████| 80/80 [03:16<00:00,  2.45s/it]




In [5]:
# Resolve the merged AnnData file of this cell type, then open it with
# backed='r' as requested by the original call.
adata_path = here(f'inflammabucket_bkp/03_downstream_analysis/05_SHAP/data/{cell_type}_adataMerged_SPECTRAgenes.log1p.h5ad')
adata = ad.read_h5ad(adata_path, backed='r')

In [6]:
## Gene symbols and Disease labels
# NOTE: unpickling can execute arbitrary code; only load this file from a trusted source.
symbols_df = pd.read_pickle(here('inflammabucket_bkp/03_downstream_analysis/04_selected_gene_list.pkl'))

# Gene symbols ordered like adata.var_names.
symbols_sorted = symbols_df.loc[adata.var_names, 'symbol'].values

# Labels assigned to the columns of the per-sample gene x disease table.
DISEASES = [
    'BRCA', 'CD', 'COPD', 'COVID', 'CRC',
    'HBV', 'HIV', 'HNSCC', 'MS', 'NPC',
    'PS', 'PSA', 'RA', 'SLE', 'UC',
    'asthma', 'cirrhosis', 'flu', 'healthy', 'sepsis',
]

# One empty accumulator list per disease label.
diseaseDict = {disease: [] for disease in DISEASES}

In [7]:
# Stack all SHAP batches along the first axis into one array.
shape_values_matrix = np.concatenate(shap_val_list, axis=0)

In [8]:
# Average the SHAP values over the cells of each sample, then store one
# single-column frame (genes x sample) per disease label.

# Rebuild the accumulator here so that re-running this cell does not append
# every sample a second time.
diseaseDict = {d: [] for d in DISEASES}

# The group indices below are row positions in adata.obs and are used to
# index the SHAP matrix directly, so both must have the same number of rows.
if shape_values_matrix.shape[0] != adata.n_obs:
    raise ValueError(
        f"SHAP matrix has {shape_values_matrix.shape[0]} rows but adata has {adata.n_obs} cells")

for idx, values in tqdm(adata.obs.groupby('sampleID', observed=True).indices.items()):
    # Mean over the sample's cells; rows are labelled with gene symbols and
    # columns with disease labels (raises if the shapes do not agree).
    geneXdisease_sample_i = pd.DataFrame(
        shape_values_matrix[values].mean(0),
        index=symbols_sorted,
        columns=DISEASES)
    for d in DISEASES:
        diseaseDict[d].append(pd.DataFrame.from_dict({idx: geneXdisease_sample_i[d]}))

  0%|          | 0/814 [00:00<?, ?it/s]

  1%|          | 7/814 [00:00<00:12, 62.66it/s]

  2%|▏         | 14/814 [00:00<00:15, 52.33it/s]

  2%|▏         | 20/814 [00:00<00:15, 50.80it/s]

  3%|▎         | 27/814 [00:00<00:15, 49.58it/s]

  4%|▍         | 34/814 [00:00<00:14, 55.12it/s]

  5%|▍         | 40/814 [00:00<00:13, 56.17it/s]

  6%|▌         | 48/814 [00:00<00:12, 62.28it/s]

  7%|▋         | 55/814 [00:00<00:12, 59.87it/s]

  8%|▊         | 62/814 [00:01<00:12, 60.78it/s]

  8%|▊         | 69/814 [00:01<00:13, 56.22it/s]

  9%|▉         | 77/814 [00:01<00:12, 61.12it/s]

 10%|█         | 84/814 [00:01<00:13, 55.58it/s]

 11%|█▏        | 93/814 [00:01<00:11, 63.59it/s]

 13%|█▎        | 108/814 [00:01<00:08, 85.82it/s]

 15%|█▌        | 125/814 [00:01<00:06, 107.89it/s]

 17%|█▋        | 137/814 [00:02<00:08, 83.68it/s] 

 18%|█▊        | 147/814 [00:02<00:07, 84.76it/s]

 19%|█▉        | 157/814 [00:02<00:07, 86.75it/s]

 21%|██        | 167/814 [00:02<00:08, 75.48it/s]

 22%|██▏       | 183/814 [00:02<00:06, 94.77it/s]

 24%|██▍       | 194/814 [00:02<00:06, 98.48it/s]

 26%|██▌       | 208/814 [00:02<00:05, 108.85it/s]

 27%|██▋       | 220/814 [00:02<00:05, 103.51it/s]

 28%|██▊       | 231/814 [00:02<00:05, 100.63it/s]

 30%|██▉       | 243/814 [00:03<00:05, 104.93it/s]

 31%|███▏      | 255/814 [00:03<00:05, 108.94it/s]

 33%|███▎      | 267/814 [00:03<00:05, 109.16it/s]

 34%|███▍      | 279/814 [00:03<00:06, 84.27it/s] 

 36%|███▌      | 289/814 [00:03<00:06, 83.58it/s]

 37%|███▋      | 299/814 [00:03<00:07, 69.21it/s]

 38%|███▊      | 307/814 [00:04<00:09, 53.00it/s]

 39%|███▊      | 314/814 [00:04<00:09, 52.08it/s]

 39%|███▉      | 321/814 [00:04<00:09, 54.54it/s]

 40%|████      | 329/814 [00:04<00:08, 58.79it/s]

 41%|████▏     | 336/814 [00:04<00:08, 58.40it/s]

 42%|████▏     | 343/814 [00:04<00:07, 59.05it/s]

 43%|████▎     | 350/814 [00:04<00:07, 61.13it/s]

 44%|████▍     | 359/814 [00:04<00:06, 67.58it/s]

 45%|████▌     | 370/814 [00:05<00:05, 77.60it/s]

 46%|████▋     | 378/814 [00:05<00:05, 73.83it/s]

 48%|████▊     | 388/814 [00:05<00:05, 80.82it/s]

 50%|█████     | 408/814 [00:05<00:03, 113.19it/s]

 52%|█████▏    | 420/814 [00:05<00:03, 104.44it/s]

 53%|█████▎    | 431/814 [00:05<00:04, 90.52it/s] 

 54%|█████▍    | 441/814 [00:05<00:04, 82.32it/s]

 55%|█████▌    | 450/814 [00:05<00:04, 75.95it/s]

 56%|█████▋    | 458/814 [00:06<00:05, 67.92it/s]

 57%|█████▋    | 466/814 [00:06<00:06, 51.67it/s]

 58%|█████▊    | 472/814 [00:06<00:06, 49.06it/s]

 59%|█████▊    | 478/814 [00:06<00:07, 46.97it/s]

 60%|█████▉    | 485/814 [00:06<00:06, 51.61it/s]

 60%|██████    | 492/814 [00:06<00:05, 55.40it/s]

 61%|██████▏   | 499/814 [00:06<00:05, 58.69it/s]

 62%|██████▏   | 506/814 [00:07<00:05, 58.44it/s]

 63%|██████▎   | 513/814 [00:07<00:05, 59.71it/s]

 64%|██████▍   | 520/814 [00:07<00:05, 56.20it/s]

 65%|██████▍   | 526/814 [00:07<00:05, 55.08it/s]

 65%|██████▌   | 532/814 [00:07<00:05, 56.23it/s]

 66%|██████▌   | 538/814 [00:07<00:05, 49.43it/s]

 67%|██████▋   | 545/814 [00:07<00:05, 52.78it/s]

 68%|██████▊   | 551/814 [00:07<00:05, 49.94it/s]

 69%|██████▉   | 560/814 [00:08<00:04, 58.86it/s]

 70%|██████▉   | 567/814 [00:08<00:04, 52.91it/s]

 71%|███████   | 574/814 [00:08<00:04, 56.60it/s]

 71%|███████▏  | 580/814 [00:08<00:04, 56.53it/s]

 72%|███████▏  | 589/814 [00:08<00:03, 63.59it/s]

 73%|███████▎  | 596/814 [00:08<00:03, 57.03it/s]

 74%|███████▍  | 602/814 [00:08<00:03, 55.13it/s]

 75%|███████▍  | 608/814 [00:08<00:03, 51.77it/s]

 75%|███████▌  | 614/814 [00:09<00:04, 45.44it/s]

 76%|███████▌  | 619/814 [00:09<00:04, 45.87it/s]

 77%|███████▋  | 626/814 [00:09<00:03, 48.87it/s]

 78%|███████▊  | 632/814 [00:09<00:04, 43.41it/s]

 79%|███████▊  | 640/814 [00:09<00:03, 51.47it/s]

 79%|███████▉  | 646/814 [00:09<00:03, 48.11it/s]

 80%|████████  | 653/814 [00:10<00:03, 40.54it/s]

 81%|████████  | 659/814 [00:10<00:03, 42.84it/s]

 82%|████████▏ | 664/814 [00:10<00:03, 40.99it/s]

 82%|████████▏ | 669/814 [00:10<00:03, 39.30it/s]

 83%|████████▎ | 674/814 [00:10<00:03, 35.59it/s]

 83%|████████▎ | 678/814 [00:10<00:03, 34.26it/s]

 84%|████████▍ | 682/814 [00:10<00:03, 33.80it/s]

 84%|████████▍ | 687/814 [00:10<00:03, 36.51it/s]

 85%|████████▍ | 691/814 [00:11<00:03, 37.05it/s]

 85%|████████▌ | 695/814 [00:11<00:03, 36.88it/s]

 86%|████████▌ | 699/814 [00:11<00:03, 34.61it/s]

 86%|████████▋ | 704/814 [00:11<00:02, 36.77it/s]

 87%|████████▋ | 708/814 [00:11<00:03, 34.71it/s]

 87%|████████▋ | 712/814 [00:11<00:02, 35.93it/s]

 88%|████████▊ | 716/814 [00:11<00:03, 32.51it/s]

 88%|████████▊ | 720/814 [00:11<00:02, 33.43it/s]

 89%|████████▉ | 724/814 [00:12<00:02, 33.23it/s]

 89%|████████▉ | 728/814 [00:12<00:02, 34.23it/s]

 90%|████████▉ | 732/814 [00:12<00:02, 34.91it/s]

 91%|█████████ | 737/814 [00:12<00:02, 37.24it/s]

 91%|█████████ | 742/814 [00:12<00:01, 39.27it/s]

 92%|█████████▏| 746/814 [00:12<00:01, 37.87it/s]

 92%|█████████▏| 751/814 [00:12<00:01, 38.97it/s]

 93%|█████████▎| 755/814 [00:12<00:01, 37.05it/s]

 93%|█████████▎| 760/814 [00:12<00:01, 38.37it/s]

 94%|█████████▍| 764/814 [00:13<00:01, 34.05it/s]

 94%|█████████▍| 768/814 [00:13<00:01, 35.10it/s]

 95%|█████████▍| 772/814 [00:13<00:01, 36.13it/s]

 95%|█████████▌| 776/814 [00:13<00:01, 34.54it/s]

 96%|█████████▌| 780/814 [00:13<00:01, 33.11it/s]

 96%|█████████▋| 784/814 [00:13<00:01, 29.29it/s]

 97%|█████████▋| 788/814 [00:13<00:00, 31.62it/s]

 98%|█████████▊| 794/814 [00:13<00:00, 35.46it/s]

 99%|█████████▉| 804/814 [00:14<00:00, 49.78it/s]

100%|█████████▉| 810/814 [00:14<00:00, 50.48it/s]

100%|██████████| 814/814 [00:14<00:00, 57.07it/s]




In [9]:
# Write one CSV per disease label: the per-sample frames are joined
# column-wise, so each column is one sample.
# NOTE(review): the inputs are read from under 'inflammabucket_bkp/' while the
# results are written directly under the project root — confirm this is intended.
out_dir = here('03_downstream_analysis/08_gene_importance/new_shap_plots/results/SHAP_AVGsamples')
# Create the results directory on a fresh checkout instead of failing in to_csv.
out_dir.mkdir(parents=True, exist_ok=True)

for d in tqdm(DISEASES):
    pd.concat(diseaseDict[d], axis=1).to_csv(out_dir / f'SHAP_AVGsample_{cell_type}_{d}.csv')

  0%|          | 0/20 [00:00<?, ?it/s]

  5%|▌         | 1/20 [00:00<00:11,  1.65it/s]

 10%|█         | 2/20 [00:01<00:10,  1.79it/s]

 15%|█▌        | 3/20 [00:01<00:09,  1.75it/s]

 20%|██        | 4/20 [00:02<00:10,  1.59it/s]

 25%|██▌       | 5/20 [00:02<00:08,  1.76it/s]

 30%|███       | 6/20 [00:03<00:07,  1.84it/s]

 35%|███▌      | 7/20 [00:03<00:06,  1.95it/s]

 40%|████      | 8/20 [00:04<00:06,  1.84it/s]

 45%|████▌     | 9/20 [00:04<00:05,  1.97it/s]

 50%|█████     | 10/20 [00:05<00:04,  2.03it/s]

 55%|█████▌    | 11/20 [00:05<00:04,  1.87it/s]

 60%|██████    | 12/20 [00:06<00:04,  1.89it/s]

 65%|██████▌   | 13/20 [00:06<00:03,  1.91it/s]

 70%|███████   | 14/20 [00:07<00:03,  1.71it/s]

 75%|███████▌  | 15/20 [00:08<00:02,  1.75it/s]

 80%|████████  | 16/20 [00:08<00:02,  1.79it/s]

 85%|████████▌ | 17/20 [00:09<00:01,  1.89it/s]

 90%|█████████ | 18/20 [00:09<00:01,  1.74it/s]

 95%|█████████▌| 19/20 [00:10<00:00,  1.73it/s]

100%|██████████| 20/20 [00:11<00:00,  1.75it/s]

100%|██████████| 20/20 [00:11<00:00,  1.81it/s]


