# Download TARGET Pan Cancer Data for Compression
The notebook downloads gene expression and clinical data from the TARGET project. The data is downloaded from UCSC Xena.
The data is in log2(FPKM) RSEM transformed

In [1]:
import pathlib
from pathlib import Path
import hashlib
from urllib.request import urlretrieve
import random
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
def download_and_checksum(url_base, file_info_dict, file_url_paths):
    """
    Downloads files from the specified base URL and computes the SHA-256 checksum for each file.

    Parameters:
    ----------
    url_base : str
        The base URL from which the files will be downloaded.

    file_info_dict : dict
        A dictionary where the keys are file names and the values are the local file paths where 
        the files will be saved.

    file_url_paths : dict
        A dictionary mapping each file name to the corresponding sub-path (if any) that should 
        be appended to the base URL before downloading the file.
    """
    for name, path in file_info_dict.items():
        # Determine the correct URL with or without sub-path
        file_url = f"{url_base}{file_url_paths[name]}{name}"
        
        # Download the file
        urlretrieve(file_url, path)

        # Compute checksum
        md5_hash = hashlib.md5()
        with open(path, "rb") as f:
            for byte_block in iter(lambda: f.read(4096), b""):
                md5_hash.update(byte_block)

        print(f"{name} checksum: {md5_hash.hexdigest()}")

In [3]:
# Base URL
url_base = 'https://toil-xena-hub.s3.us-east-1.amazonaws.com/download/'
data_dir = pathlib.Path('data').resolve()

In [4]:
# Dictionary for file names and their paths
file_info_dict = {
    'target_RSEM_gene_fpkm.gz': Path('data/target_RSEM_gene_fpkm.gz'),
    'TARGET_phenotype.gz': Path('data/TARGET_phenotype.gz'),
    'gencode.v23.annotation.gene.probemap': Path('data/gencode.v23.annotation.gene.probemap')
}

file_url_paths = {
    "target_RSEM_gene_fpkm.gz": "",
    "gencode.v23.annotation.gene.probemap": "probeMap/",
    "TARGET_phenotype.gz": ""
}


In [5]:
# Create the 'data' directory
Path('data').mkdir(exist_ok=True)

In [6]:
# Call the function
download_and_checksum(url_base, file_info_dict, file_url_paths)

target_RSEM_gene_fpkm.gz checksum: 4584cca6e5befebaead5f1865f23f517321dfc52cf9e752d87219f0bc33a14fc
TARGET_phenotype.gz checksum: b52ec780daaa940dbfdd76f5f764cde2567ba6e6acbbab7e889030ef9474a5cc
gencode.v23.annotation.gene.probemap checksum: 6783ea58791ae876efb697889a042cc7be8e32e40fc01191c622ef25d9416931


# Process TARGET PanCancer Data
Retrive the downloaded expression data, update gene identifiers to entrez, and curate sample IDs. The script will also identify a balanced hold-out test set to compare projection performance into learned latent spaces across algorithms. 

In [7]:
random.seed(0)

# Read Phenotype Information

In [8]:
pheno_file = data_dir / 'TARGET_phenotype.parquet'
if not pheno_file.is_file():
    pheno_df = pd.read_table(data_dir / 'TARGET_phenotype.gz')
    pheno_df.to_parquet(pheno_file, index=False)
else:
    pheno_df = pd.read_parquet(pheno_file)

print(pheno_df.shape)
pheno_df.head(3)

(5958, 7)


Unnamed: 0,sample_id,primary_disease_code,_primary_disease,sample_type_code,_sample_type,_PATIENT,_cohort
0,TARGET-00-NAAEMA-20,Non cancerous tissue,Non cancerous tissue,CELLC,Control Analyte,NAAEMA,TARGET
1,TARGET-00-NAAEMB-20,Non cancerous tissue,Non cancerous tissue,CELLC,Control Analyte,NAAEMB,TARGET
2,TARGET-00-NAAEMC-20,Non cancerous tissue,Non cancerous tissue,CELLC,Control Analyte,NAAEMC,TARGET


# Read Entrez ID Curation Information
Load curated gene names from versioned resource. See https://github.com/cognoma/genes for more details

In [9]:
# Commit from https://github.com/cognoma/genes
genes_commit = 'ad9631bb4e77e2cdc5413b0d77cb8f7e93fc5bee'

In [10]:
url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/genes.tsv'.format(genes_commit)
gene_df = pd.read_table(url)

# Only consider protein-coding genes
gene_df = (
   gene_df.query("gene_type == 'protein-coding'")
)


print(gene_df.shape)
gene_df.head(2)

(20395, 7)


Unnamed: 0,entrez_gene_id,symbol,description,chromosome,gene_type,synonyms,aliases
0,1,A1BG,alpha-1-B glycoprotein,19,protein-coding,A1B|ABG|GAB|HYST2477,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...
1,2,A2M,alpha-2-macroglobulin,12,protein-coding,A2MD|CPAMD5|FWP007|S863-7,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...


In [11]:
# Load gene updater - old to new Entrez gene identifiers
url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/updater.tsv'.format(genes_commit)
updater_df = pd.read_table(url)
old_to_new_entrez = dict(zip(updater_df.old_entrez_gene_id,
                            updater_df.new_entrez_gene_id))

# Read Probe Mapping Info

In [12]:
probe_map_file = data_dir / 'gencode.v23.annotation.gene.probemap.parquet'
if not probe_map_file.is_file():
    probe_map_df = pd.read_table(data_dir / 'gencode.v23.annotation.gene.probemap')
    probe_map_df.to_parquet(probe_map_file, index=False)
else:
    probe_map_df = pd.read_parquet(probe_map_file)

# Inner merge gene df to get ensembl to entrez mapping
probe_map_df = probe_map_df.merge(gene_df, how='inner', left_on='gene', right_on='symbol')
ensembl_to_entrez = dict(zip(probe_map_df.id, probe_map_df.entrez_gene_id))

print(probe_map_df.shape)
probe_map_df.head(3)


(18855, 13)


Unnamed: 0,id,gene,chrom,chromStart,chromEnd,strand,entrez_gene_id,symbol,description,chromosome,gene_type,synonyms,aliases
0,ENSG00000186092.4,OR4F5,chr1,69091,70008,+,79501,OR4F5,olfactory receptor family 4 subfamily F member 5,1,protein-coding,,olfactory receptor 4F5
1,ENSG00000278566.1,OR4F29,chr1,450740,451678,-,729759,OR4F29,olfactory receptor family 4 subfamily F member 29,1,protein-coding,OR7-21,olfactory receptor 4F3/4F16/4F29|olfactory rec...
2,ENSG00000273547.1,OR4F16,chr1,685716,686654,-,81399,OR4F16,olfactory receptor family 4 subfamily F member 16,1,protein-coding,OR1-1|OR7-21,olfactory receptor 4F3/4F16/4F29|olfactory rec...


# Read Gene Expression Data

In [13]:
expr_file = data_dir / 'target_RSEM_gene_fpkm.parquet'
if not expr_file.is_file():
    expr_df = pd.read_table(data_dir / 'target_RSEM_gene_fpkm.gz', index_col=0)
    expr_df.to_parquet(expr_file)
else:
    expr_df = pd.read_parquet(expr_file)

print(expr_df.shape)
expr_df.head(2)

(60498, 734)


Unnamed: 0_level_0,TARGET-30-PASWYR-01,TARGET-20-PARUBT-09,TARGET-30-PASNZU-01,TARGET-52-PASDLA-11,TARGET-50-PAKNRX-01,TARGET-30-PASWFB-01,TARGET-30-PALUYS-01,TARGET-30-PAUDDK-01,TARGET-10-PAPZNK-09,TARGET-50-PAJLNJ-01,...,TARGET-20-PANLIZ-04,TARGET-21-PASSLT-41,TARGET-20-PASTTW-09,TARGET-50-PAKYLT-01,TARGET-20-PATJHJ-09,TARGET-21-PATKKJ-41,TARGET-10-PAPEJN-04,TARGET-20-PABLDZ-09,TARGET-10-PANSBR-09,TARGET-10-PARFLV-04
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000242268.2,-3.0469,-9.9658,-3.0469,-4.6082,-5.5735,-9.9658,-9.9658,-9.9658,-9.9658,-2.2447,...,-9.9658,-9.9658,-4.035,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-4.2934,-9.9658
ENSG00000259041.1,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,...,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658


# Process gene expression matrix 

In [14]:
expr_df = (expr_df
   .dropna(axis='rows')
   .reindex(probe_map_df.id)
   .rename(index=ensembl_to_entrez)
   .rename(index=old_to_new_entrez)
   .groupby(level=0).mean()
   .transpose()
   .sort_index(axis='rows')
   .sort_index(axis='columns')
)


expr_df.index.rename('sample_id', inplace=True)


print(expr_df.shape)
expr_df.head(2)

(734, 18753)


id,1,2,9,10,12,13,14,15,16,18,...,102724231,102724398,102724473,102724536,102724631,102724862,102724928,105375355,105378803,105378952
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TARGET-10-PAKSWW-03,5.3754,-1.1488,-1.4305,-9.9658,-7.76965,-9.9658,4.3786,-1.3183,2.0289,0.7321,...,-2.6349,-9.9658,-4.6082,0.1648,-9.9658,-5.0116,-9.9658,-0.8863,-9.9658,-9.9658
TARGET-10-PAMXHJ-09,4.9388,-1.2828,0.2881,-9.9658,-9.9658,-9.9658,3.866,1.4011,3.0791,2.6232,...,-2.2447,-4.2934,-4.6082,0.4016,-9.9658,-4.2934,-9.9658,-2.0529,-9.9658,-9.9658


# Stratify Balanced Training and Testing Sets in TARGET Gene Expression
Output training and testing gene expression datasets 

In [15]:
strat = pheno_df.set_index('sample_id').reindex(expr_df.index).primary_disease_code

In [16]:
cancertype_count_df = pd.DataFrame(strat.value_counts()).reset_index().rename({'index': 'cancertype', 'primary_disease_code': 'n ='}, axis='columns')
sample_counts_file = data_dir / 'target_sample_counts.parquet'
cancertype_count_df.to_parquet(sample_counts_file)

cancertype_count_df

Unnamed: 0,n =,count
0,AML,196
1,ALL,194
2,NBL,162
3,WT,132
4,AML-IF,32
5,CCSK,13
6,RT,5


In [17]:
train_df, test_df = train_test_split(expr_df,
                                    test_size=0.2,
                                    random_state=0,
                                    stratify=strat)

In [18]:
print(train_df.shape)
test_df.shape

(587, 18753)


(147, 18753)

In [19]:
train_file = data_dir / 'train_target_expression_matrix_processed.parquet'
train_df.to_parquet(train_file)

In [20]:
test_file = data_dir / 'test_target_expression_matrix_processed.parquet'
test_df.to_parquet(test_file)

# Sort genes based on median absolute deviation and output to file

In [None]:
mad_genes_df = pd.DataFrame(train_df.mad(axis=0).sort_values(ascending=False)).reset_index()
mad_genes_df.columns = ['gene_id', 'median_absolute_deviation']
mad_genes_file = data_dir / 'target_mad_genes.parquet'
mad_genes_df.to_parquet(mad_genes_file)