In [8]:
import pandas as pd
import random
import os
from typing import List, Dict
import json

### Before executing this code:
*    Create the main project folder in the desired location (`GTEx` in my case);
*    Put the `GTEx Portal.csv` and `gen_sample_list.json` files, which contain the description of the histo images, in this project folder;
*    Make sure `gen_sample_list.json` is already in the project folder (it should have been created with `prep_genedata.ipynb`);
*    Set `project_path` below to the path of this project folder (with a trailing `/`).

In [24]:
# Root folder of the project. File names are appended to it by plain string
# concatenation, so the trailing '/' is required.
project_path = 'c:/Projects/GTEx/'
# Several cells refer to the same folder as `project_dir`; define the alias here
# so the notebook does not fail with a NameError on a fresh kernel.
project_dir = project_path

### Import Data

In [10]:
# Load the GTEx Portal sample table from the project folder
csv_file = project_path + 'GTEx Portal.csv'
df = pd.read_csv(csv_file)
print(df.shape)             # (rows, columns) sanity check
df.head()                   # preview the first rows

(25713, 8)


Unnamed: 0,Tissue Sample ID,Tissue,Subject ID,Sex,Age Bracket,Hardy Scale,Pathology Categories,Pathology Notes
0,GTEX-1117F-0126,Skin - Sun Exposed (Lower leg),GTEX-1117F,female,60-69,Slow death,,"6 pieces, minimal fat, squamous epithelium is ..."
1,GTEX-1117F-0226,Adipose - Subcutaneous,GTEX-1117F,female,60-69,Slow death,,"2 pieces, ~15% vessel stroma, rep delineated"
2,GTEX-1117F-0326,Nerve - Tibial,GTEX-1117F,female,60-69,Slow death,clean_specimens,"2 pieces, clean specimens"
3,GTEX-1117F-0426,Muscle - Skeletal,GTEX-1117F,female,60-69,Slow death,,"2 pieces, !5% fibrous connective tissue, delin..."
4,GTEX-1117F-0526,Artery - Tibial,GTEX-1117F,female,60-69,Slow death,"monckeberg, sclerotic","2 pieces, clean, Monckebeg medial sclerosis, r..."


### Some dataset transformations for more convenient further use

In [12]:
# Replace the portal's column headers with short snake_case names
# (positional: the list must follow the column order of the CSV)
df.columns = [
    'sample_id',
    'tissue',
    'patient_id',
    'sex',
    'age',
    'hardy_scale',
    'path_categories',
    'path_notes',
]
# Alphabetically sorted index of all tissues present in the table
df['tissue'].value_counts().sort_index().index

Index(['Adipose - Subcutaneous', 'Adipose - Visceral (Omentum)',
       'Adrenal Gland', 'Artery - Aorta', 'Artery - Coronary',
       'Artery - Tibial', 'Bladder', 'Brain - Cerebellum', 'Brain - Cortex',
       'Breast - Mammary Tissue', 'Cervix - Ectocervix', 'Cervix - Endocervix',
       'Colon - Sigmoid', 'Colon - Transverse',
       'Esophagus - Gastroesophageal Junction', 'Esophagus - Mucosa',
       'Esophagus - Muscularis', 'Fallopian Tube', 'Heart - Atrial Appendage',
       'Heart - Left Ventricle', 'Kidney - Cortex', 'Kidney - Medulla',
       'Liver', 'Lung', 'Minor Salivary Gland', 'Muscle - Skeletal',
       'Nerve - Tibial', 'Ovary', 'Pancreas', 'Pituitary', 'Prostate',
       'Skin - Not Sun Exposed (Suprapubic)', 'Skin - Sun Exposed (Lower leg)',
       'Small Intestine - Terminal Ileum', 'Spleen', 'Stomach', 'Testis',
       'Thyroid', 'Uterus', 'Vagina'],
      dtype='object', name='tissue')

In [13]:
# Create two index-aligned lists of tissues - full and short names - for more
# convenient further use, especially for the multiclass classification task later.
# `tissue_full_list` is sorted alphabetically; `tissue_short_list` must follow the same order.
tissue_full_list = list(df.tissue.value_counts().sort_index().index)
# NOTE: 'artery_coronary' previously started its second word with a Cyrillic letter
# that looks identical to Latin 'c', so the key could not be matched when typed in ASCII.
# NOTE(review): 'heart_ventrical' is a misspelling of "ventricle"; it is kept unchanged
# because it is a label that other files may already depend on.
tissue_short_list = [
    'adipose_sub', 'adipose_visc', 'gland_adrenal', 'artery_aorta', 'artery_coronary',
    'artery_tibial', 'bladder', 'brain_cerebellum', 'brain_cortex', 'breast_mammary',
    'cervix_ecto', 'cervix_endo', 'colon_sigmoid', 'colon_transverse', 'esophagus_gastro',
    'esophagus_mucosa', 'esophagus_muscularis', 'fallopian_tube', 'heart_atrial', 'heart_ventrical',
    'kidney_cortex', 'kidney_medulla', 'liver', 'lung', 'gland_salivary',
    'muscle_skeletal', 'nerve_tibial', 'ovary', 'pancreas', 'pituitary',
    'prostate', 'skin_notsun', 'skin_sun', 'intestine_small', 'spleen',
    'stomach', 'testis', 'thyroid', 'uterus', 'vagina',
]
# Validation
print(tissue_full_list[:3])
print(tissue_short_list[:3])
print(len(tissue_short_list), len(tissue_full_list))
# Fail early if the two lists drift apart or a look-alike non-ASCII character sneaks in
assert len(tissue_short_list) == len(tissue_full_list), "short/full tissue lists differ in length"
assert all(name.isascii() for name in tissue_short_list), "non-ASCII character in a short tissue name"

['Adipose - Subcutaneous', 'Adipose - Visceral (Omentum)', 'Adrenal Gland']
['adipose_sub', 'adipose_visc', 'gland_adrenal']
40 40


In [14]:
# Lookup table: full tissue name -> short tissue name (the two lists are index-aligned)
mapping_dict = {
    full_name: short_name
    for full_name, short_name in zip(tissue_full_list, tissue_short_list)
}
# Add column 'tis' holding the short name that corresponds to each row's 'tissue'
df['tis'] = df['tissue'].map(mapping_dict)
df.head()

Unnamed: 0,sample_id,tissue,patient_id,sex,age,hardy_scale,path_categories,path_notes,tis
0,GTEX-1117F-0126,Skin - Sun Exposed (Lower leg),GTEX-1117F,female,60-69,Slow death,,"6 pieces, minimal fat, squamous epithelium is ...",skin_sun
1,GTEX-1117F-0226,Adipose - Subcutaneous,GTEX-1117F,female,60-69,Slow death,,"2 pieces, ~15% vessel stroma, rep delineated",adipose_sub
2,GTEX-1117F-0326,Nerve - Tibial,GTEX-1117F,female,60-69,Slow death,clean_specimens,"2 pieces, clean specimens",nerve_tibial
3,GTEX-1117F-0426,Muscle - Skeletal,GTEX-1117F,female,60-69,Slow death,,"2 pieces, !5% fibrous connective tissue, delin...",muscle_skeletal
4,GTEX-1117F-0526,Artery - Tibial,GTEX-1117F,female,60-69,Slow death,"monckeberg, sclerotic","2 pieces, clean, Monckebeg medial sclerosis, r...",artery_tibial


In [36]:
# Sample ids that have a histology image (every row of the portal table)
img_sample_list = df.sample_id.to_list()    # 25713
# Sample ids that have gene data; the file lives in the project folder.
# `project_path` is the folder variable defined at the top of the notebook, and the
# context manager closes the file handle.
with open(project_path + 'gen_sample_list.json', 'rb') as json_file:
    gen_sample_list = json.load(json_file)     # 15585
# Sample ids present in both sources; sorted so the list (and the JSON saved from it)
# is identical from run to run, since plain set ordering is not.
both_img_gen_list = sorted(set(img_sample_list) & set(gen_sample_list)) # 13797

In [29]:
# Persist both sample-id lists as JSON files in the project folder.
# Uses `project_path`, the folder variable defined at the top of the notebook.
with open(project_path + 'img_sample_list.json', "w") as json_file:
    json.dump(img_sample_list, json_file)
with open(project_path + 'both_img_gen_list.json', "w") as json_file:
    json.dump(both_img_gen_list, json_file)

In [31]:
# Keep only the rows whose sample_id is in `both_img_gen_list`
has_img_and_gen = df['sample_id'].isin(both_img_gen_list)
df_both = df[has_img_and_gen]   # (13797, 9)
df_both.head()

Unnamed: 0,sample_id,tissue,patient_id,sex,age,hardy_scale,path_categories,path_notes,tis
1,GTEX-1117F-0226,Adipose - Subcutaneous,GTEX-1117F,female,60-69,Slow death,,"2 pieces, ~15% vessel stroma, rep delineated",adipose_sub
3,GTEX-1117F-0426,Muscle - Skeletal,GTEX-1117F,female,60-69,Slow death,,"2 pieces, !5% fibrous connective tissue, delin...",muscle_skeletal
4,GTEX-1117F-0526,Artery - Tibial,GTEX-1117F,female,60-69,Slow death,"monckeberg, sclerotic","2 pieces, clean, Monckebeg medial sclerosis, r...",artery_tibial
5,GTEX-1117F-0626,Artery - Coronary,GTEX-1117F,female,60-69,Slow death,,"2 pieces, up to 4mm aderent fat/nerve/vessel, ...",artery_сoronary
6,GTEX-1117F-0726,Heart - Atrial Appendage,GTEX-1117F,female,60-69,Slow death,no_abnormalities,"2 pieces, no abnormalities",heart_atrial


### Create a convenient dataset for visualization in PowerBI

In [7]:
# Build the dataset used for visualization in PowerBI and save it as `gtex_tis.csv`.
# The two pathology text columns are not needed for visualization, so they are dropped.
df_tis = df.drop(columns=['path_categories', 'path_notes'])
df_tis.to_csv('gtex_tis.csv')
df_tis.head()

Unnamed: 0,sample_id,tissue,patient_id,sex,age,hardy_scale,tis
0,GTEX-1117F-0126,Skin - Sun Exposed (Lower leg),GTEX-1117F,female,60-69,Slow death,skin_sun
1,GTEX-1117F-0226,Adipose - Subcutaneous,GTEX-1117F,female,60-69,Slow death,adipose_sub
2,GTEX-1117F-0326,Nerve - Tibial,GTEX-1117F,female,60-69,Slow death,nerve_tibial
3,GTEX-1117F-0426,Muscle - Skeletal,GTEX-1117F,female,60-69,Slow death,muscle_skeletal
4,GTEX-1117F-0526,Artery - Tibial,GTEX-1117F,female,60-69,Slow death,artery_tibial


### Create a function for image selection

In [32]:
def select_imgs(n_samples: int,
                tissue_list: List[str],
                seed: int = None) -> Dict[str, List[str]]:
    """
    Randomly pick sample_ids (images) per tissue that will be downloaded from the site.

    Candidates are read from the notebook-level dataframe `df_both`
    (columns `tis` and `sample_id`).

    Args:
        n_samples (int):    number of images to select for EACH tissue
        tissue_list (List[str]): short tissue names (values of `df_both.tis`) to select images for
        seed (int, optional): seed for the global `random` generator, for reproducibility.
            With the default (None) the generator is not re-seeded. Any integer,
            including 0, is honoured.
    Returns:
        img_tis_dict (Dict[str, List[str]]): dictionary of tissues with lists of images to download
    Raises:
        ValueError: if a tissue has fewer than `n_samples` candidate images
    """
    img_tis_dict = {}
    # `is not None` rather than truthiness, so that seed=0 is not silently ignored
    if seed is not None:
        random.seed(seed)

    for tis in tissue_list:
        # all sample_ids of the current tissue
        tis_id = df_both.loc[df_both['tis'] == tis, 'sample_id'].tolist()
        if n_samples > len(tis_id):
            # same exception type random.sample would raise, but naming the offending tissue
            raise ValueError(
                f"Tissue '{tis}' has only {len(tis_id)} images, cannot select {n_samples}"
            )
        img_tis_dict[tis] = random.sample(tis_id, n_samples)
    return img_tis_dict

### Create a dictionary - tissue: [list of images] to download

In [33]:
# Pick 10 images for each of the two tissues, seeded for reproducibility
tissues_list = ['lung', 'brain_cortex']
selected_imgs = select_imgs(n_samples=10, tissue_list=tissues_list, seed=42)
selected_imgs

{'lung': ['GTEX-13VXT-1426',
  'GTEX-11TT1-1626',
  'GTEX-1H23P-1026',
  'GTEX-1F48J-0826',
  'GTEX-1B932-0726',
  'GTEX-14AS3-0926',
  'GTEX-13PL7-1726',
  'GTEX-ZPIC-0626',
  'GTEX-13NZ9-0926',
  'GTEX-QV44-0926'],
 'brain_cortex': ['GTEX-11EMC-3226',
  'GTEX-11EI6-3026',
  'GTEX-12WSA-2926',
  'GTEX-13OW8-2826',
  'GTEX-13S7M-3126',
  'GTEX-1F52S-3126',
  'GTEX-1HBPH-3126',
  'GTEX-11DXY-3226',
  'GTEX-1GZ2Q-3226',
  'GTEX-13O3Q-2926']}

### Saving `selected_imgs` to use in Python script

In [34]:
# Save the selection to the project folder.
# Uses `project_path`, the folder variable defined at the top of the notebook.
with open(project_path + 'lung_brain_10.json', "w") as json_file:
    json.dump(selected_imgs, json_file)

In [38]:
# Read the file back from the project folder, where it was written (not from the
# current working directory), to verify it. The context manager closes the handle.
with open(project_path + 'lung_brain_10.json', 'rb') as json_file:
    check_imgs = json.load(json_file)
check_imgs

{'lung': ['GTEX-13VXT-1426',
  'GTEX-11TT1-1626',
  'GTEX-1H23P-1026',
  'GTEX-1F48J-0826',
  'GTEX-1B932-0726',
  'GTEX-14AS3-0926',
  'GTEX-13PL7-1726',
  'GTEX-ZPIC-0626',
  'GTEX-13NZ9-0926',
  'GTEX-QV44-0926'],
 'brain_cortex': ['GTEX-11EMC-3226',
  'GTEX-11EI6-3026',
  'GTEX-12WSA-2926',
  'GTEX-13OW8-2826',
  'GTEX-13S7M-3126',
  'GTEX-1F52S-3126',
  'GTEX-1HBPH-3126',
  'GTEX-11DXY-3226',
  'GTEX-1GZ2Q-3226',
  'GTEX-13O3Q-2926']}

To create the final `all_50.json`, use `select_imgs(50, tissue_short_list)`.