# Data download

Description: Download all external data required to execute the code for Chapters 2-4. 

In [1]:
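# No installation is needed for the downloads below: they use `urllib.request`
# from the Python standard library (urllib3 is a separate, third-party package).
# !pip install urllib3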

In [2]:
import os 
import shutil, gzip
import zipfile, tarfile
from urllib import request

In [3]:
def get_local_data_path(folders, fname=None):
    """Return a normalised path below ``external_data/``, creating the folders.

    Args:
        folders: Sequence of folder names; they are joined in order beneath
            ``external_data/``.
        fname: Optional file name appended to the folder path.

    Returns:
        The ``os.path.normpath``-normalised path of the file when ``fname``
        is given, otherwise of the folder itself.

    Side effects:
        The folder hierarchy is created (relative to the current working
        directory) if it does not exist yet. The file itself is never created.
    """
    folder_structure = 'external_data/' + '/'.join(folders)
    os.makedirs(os.path.normpath(folder_structure), exist_ok=True)
    # ``None`` is the "no file name" sentinel, so test identity, not equality.
    if fname is not None:
        return os.path.normpath(folder_structure + '/' + fname)
    return os.path.normpath(folder_structure + '/')

def extract_file(fname, destination_fname):
    """Unpack ``fname`` into ``destination_fname`` based on its name suffix.

    The suffix checks run in this order, and the first match wins:

    * ``tar.gz`` - gzip-compressed tar, extracted into the directory
      ``destination_fname``.
    * ``tar``    - uncompressed tar, extracted into the directory
      ``destination_fname``.
    * ``.gz``    - single gzip-compressed file, decompressed to the file
      ``destination_fname``.
    * ``zip``    - zip archive, extracted into the directory
      ``destination_fname``.

    Any other suffix is silently ignored (nothing is extracted).

    Args:
        fname: Path of the archive to unpack.
        destination_fname: Target directory (tar/zip) or target file (.gz).
    """
    # NOTE(review): ``extractall`` trusts the member paths stored in the
    # archive; only use this on archives from trusted sources.
    if fname.endswith("tar.gz"):
        # Context manager closes the archive even if extraction fails.
        with tarfile.open(fname, "r:gz") as tar:
            tar.extractall(destination_fname)
    elif fname.endswith("tar"):
        with tarfile.open(fname, "r:") as tar:
            tar.extractall(destination_fname)
    elif fname.endswith(".gz"):
        # Stream the decompressed bytes so the whole file is never held in memory.
        with gzip.open(fname, 'rb') as f_in, open(destination_fname, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    elif fname.endswith("zip"):
        with zipfile.ZipFile(fname) as zf:
            zf.extractall(destination_fname)

## Chapter 2: 

#### Experimental reproducibility limits the correlation between mRNA and protein abundances in tumour profiles

In [4]:
## Chapter 2 downloads: every file is stored below external_data/ (see
## get_local_data_path) and archives are unpacked next to where they are saved.

## 1. Colorectal Cancer (2014)
##    A. Transcriptomics
request.urlretrieve("https://cbioportal-datahub.s3.amazonaws.com/coadread_tcga_pub.tar.gz", 
              get_local_data_path(["tumour_studies", "crc"], "coadread_tcga_pub.tar.gz"))
extract_file(get_local_data_path(["tumour_studies", "crc"], "coadread_tcga_pub.tar.gz"), 
             get_local_data_path(["tumour_studies", "crc"]))

##    B. Proteomics 
request.urlretrieve("https://static-content.springer.com/esm/art%3A10.1038%2Fnature13438/MediaObjects/41586_2014_BFnature13438_MOESM5_ESM.xlsx", 
              get_local_data_path(["tumour_studies", "crc", "zhang_2014"],
                                        "Supplemental_Table.xlsx"))


## 2. Breast Cancer (2016)
##    A. Transcriptomics
request.urlretrieve("https://cbioportal-datahub.s3.amazonaws.com/brca_tcga_pub2015.tar.gz", 
              get_local_data_path(["tumour_studies", "brca"], "brca_tcga_pub2015.tar.gz"))
extract_file(get_local_data_path(["tumour_studies", "brca"], "brca_tcga_pub2015.tar.gz"), 
             get_local_data_path(["tumour_studies", "brca"]))

##    B. Proteomics 
request.urlretrieve("https://static-content.springer.com/esm/art%3A10.1038%2Fnature18003/MediaObjects/41586_2016_BFnature18003_MOESM111_ESM.zip", 
              get_local_data_path(["tumour_studies", "brca", "mertins_2016"], "BrCa_2016_Supplemental_Tables.zip"))

extract_file(get_local_data_path(["tumour_studies", "brca", "mertins_2016"], "BrCa_2016_Supplemental_Tables.zip"), 
             get_local_data_path(["tumour_studies", "brca", "mertins_2016"]))

##       Lift the supplementary table out of the unpacked "nature18003-s2" folder.
shutil.move(get_local_data_path(["tumour_studies", "brca", "mertins_2016", "nature18003-s2"], 
                                "CPTAC_BC_SupplementaryTable03.xlsx"), 
            get_local_data_path(["tumour_studies", "brca", "mertins_2016"], 
                                "CPTAC_BC_SupplementaryTable03.xlsx"))

## 3. Ovarian Cancer (2016)
##    A. Transcriptomics (Firebrowse median mRNA expression archive)
ovca_archive = get_local_data_path(["tumour_studies", "ovca"], "ovca_tcga_pub2015.tar.gz")
request.urlretrieve("http://gdac.broadinstitute.org/runs/stddata__2016_01_28/data/OV/20160128/gdac.broadinstitute.org_OV.mRNA_Preprocess_Median.Level_3.2016012800.0.0.tar.gz", 
              ovca_archive)
extract_file(ovca_archive, 
             get_local_data_path(["tumour_studies", "ovca"]))

##    B. Firebrowse copy of the same mRNA archive
##       This step used to be labelled "Proteomics", but it fetched the very same
##       mRNA_Preprocess_Median archive URL as step A. The archive is therefore copied
##       locally instead of being downloaded a second time; the files on disk are the same.
##       NOTE(review): no ovarian proteomics file is downloaded in this cell - confirm
##       whether a different URL was intended here.
firebrowse_archive = get_local_data_path(["tumour_studies", "ovca", "firebrowse"], 
                                         "OV.mRNA_Preprocess_Median.Level_3.2016012800.0.0.tar.gz")
shutil.copyfile(ovca_archive, firebrowse_archive)

extract_file(firebrowse_archive, 
             get_local_data_path(["tumour_studies", "ovca", "firebrowse"], "OV.mRNA"))
##       Keep only the expression matrix next to the archive.
shutil.move(get_local_data_path(["tumour_studies", "ovca", "firebrowse", "OV.mRNA", 
                                "gdac.broadinstitute.org_OV.mRNA_Preprocess_Median.Level_3.2016012800.0.0"], 
                                "OV.medianexp.txt"), 
            get_local_data_path(["tumour_studies", "ovca", "firebrowse"], "OV.medianexp.txt"))

## 4. Colon Adenocarcinoma (2019)
##    A. Transcriptomics
##       NOTE(review): the remote file is "*.cct.gz" but it is saved as "RNAseq.cct" without
##       being decompressed - confirm that the code reading it expects gzip content.
request.urlretrieve("http://linkedomics.org/cptac-colon/Human__CPTAC_COAD__UNC__RNAseq__HiSeq_RNA__03_01_2017__BCM__Gene__BCM_RSEM_UpperQuartile_log2.cct.gz", 
              get_local_data_path(["tumour_studies", "colon", "vasaikar_2019"], "RNAseq.cct"))

##    B. Proteomics 
##       TMT 10-plex Mass spectrometry data:
request.urlretrieve("http://linkedomics.org/cptac-colon/Human__CPTAC_COAD__PNNL__Proteome__TMT__03_01_2017__BCM__Gene__Tumor_Normal_log2FC.cct", 
              get_local_data_path(["tumour_studies", "colon", "vasaikar_2019"], "Proteome_TMT.cct"))

##       LF Mass spectrometry data
request.urlretrieve("http://linkedomics.org/cptac-colon/Human__CPTAC_COAD__VU__Proteome__QExact__03_01_2017__BCM__Gene__VU_Tumor_LF_UnsharedCounts.cct", 
              get_local_data_path(["tumour_studies", "colon", "vasaikar_2019"], "Proteome_LF.cct"))


## 5. Cancer Cell Lines Encyclopedia (2020)
##    A. Transcriptomics
request.urlretrieve("https://ndownloader.figshare.com/files/25494389", 
              get_local_data_path(["depmap", "20Q4"], "CCLE_expression.csv"))

##    B. Cell Line Mapping File 
request.urlretrieve("https://ndownloader.figshare.com/files/25494443", 
              get_local_data_path(["depmap", "20Q4"], "sample.csv"))

##    C. For processing transcriptomic replicates:
request.urlretrieve("https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-019-1186-3/MediaObjects/41586_2019_1186_MOESM4_ESM.xlsx", 
              get_local_data_path(["ghandi_2019"], "Table_S1.xlsx"))

##    D. Proteomics Data
request.urlretrieve("https://gygi.hms.harvard.edu/data/ccle/Table_S2_Protein_Quant_Normalized.xlsx", 
              get_local_data_path(["nusinow_2020"], "Table_S2.xlsx"))

##    E. Protein expression levels in biological replicates
request.urlretrieve("https://gygi.hms.harvard.edu/data/ccle/Table_S3_Biological_Replicates_Protein_Quant_Normalized.xlsx", 
              get_local_data_path(["nusinow_2020"], "Table_S3.xlsx"))

## The commented-out calls below point to URLs that cannot be accessed programmatically;
## those files have to be downloaded manually and saved to the paths shown.

## 6. NCI60 cancer cell lines (2019)
##    A. Transcriptomics data 

# request.urlretrieve("https://www.cell.com/cms/10.1016/j.isci.2019.10.059/attachment/816cfb48-4f4d-4ace-843b-e4300bd85b7a/mmc7.xlsx", 
#               get_local_data_path(["guo_2019"], "Table_S6.xlsx"))

##    B. Proteomics data 
# request.urlretrieve("https://www.cell.com/cms/10.1016/j.isci.2019.10.059/attachment/30dfcf80-68fd-4f82-aef4-e27ab8bacc47/mmc2.xlsx", 
#               get_local_data_path(["guo_2019"], "Table S1.xlsx"))

# ## 7. GTEx 32 Healthy Tissues (2020)
# ##    A. Transcriptomics Data
# request.urlretrieve("https://www.cell.com/cms/10.1016/j.cell.2020.08.036/attachment/bab7713d-6ee6-4d19-badc-aceb88535518/mmc4.xlsx", 
#               get_local_data_path(["jiang_2020"], "Table_S4.xlsx"))

# ##    B. Proteomics Data
# request.urlretrieve("https://www.cell.com/cms/10.1016/j.cell.2020.08.036/attachment/0d79a576-f9b7-4342-81a8-f80c14df0372/mmc3.xlsx", 
#               get_local_data_path(["jiang_2020"], "Table_S3.xlsx"))

# ##    C. Experiment Info
# request.urlretrieve("https://www.cell.com/cms/10.1016/j.cell.2020.08.036/attachment/e874e52c-b2b9-4771-8ceb-9415fed1750b/mmc5.xlsx", 
#               get_local_data_path(["jiang_2020"], "Table_S5.xlsx"))

## 8. Protein half-lives
request.urlretrieve("https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5930408/bin/supp_RA118.000583_134915_1_supp_65961_p3f7xq.xlsx", 
              get_local_data_path(["protein_properties", "zecha_2018"], "Table_S3.xlsx"))

## 9. Protein Complexes
##    The release zip is unpacked first; the nested "current/allComplexes.txt.zip" it
##    is expected to contain is then unpacked into the same CORUM folder.
request.urlretrieve("https://mips.helmholtz-muenchen.de/corum/download/releases/old/corum_2018_09_03.zip", 
              get_local_data_path(["protein_properties", "CORUM"], "allComplexes.zip"))
extract_file(get_local_data_path(["protein_properties", "CORUM"], "allComplexes.zip"), 
             get_local_data_path(["protein_properties", "CORUM"]))
extract_file(get_local_data_path(["protein_properties", "CORUM", "current"], "allComplexes.txt.zip"), 
             get_local_data_path(["protein_properties", "CORUM"]))

## 10. NCI CPTAC DREAM Challenge best performer's prediction score 
# Received from authors directly.
request.urlretrieve("https://raw.githubusercontent.com/SwathiRUpadhya/PhD_thesis/main/other_data/mi_yang_25Sept2020/guan_sc2_breast_cor.txt", 
              get_local_data_path(["mi_yang_25Sept2020"], "guan_sc2_breast_cor.txt"))
request.urlretrieve("https://raw.githubusercontent.com/SwathiRUpadhya/PhD_thesis/main/other_data/mi_yang_25Sept2020/guan_sc2_ovarian_cor.txt", 
              get_local_data_path(["mi_yang_25Sept2020"], "guan_sc2_ovarian_cor.txt"))

## 11. Transcriptomics data of 675 human cancer cell lines
# request.urlretrieve("https://www.ebi.ac.uk/gxa/experiments-content/E-MTAB-2706/resources/ExperimentDownloadSupplier.RnaSeqBaseline/tpms.tsv", 
#               get_local_data_path(["klijn_2015"], "E-MTAB-2706-query-results.tpms.tsv"))

('external_data\\mi_yang_25Sept2020\\guan_sc2_ovarian_cor.txt',
 <http.client.HTTPMessage at 0x25d99262940>)

## Chapter 3: 
#### Antibody reliability limits the correlation between mRNA and protein abundances in tumours 

In [5]:
# Chapter 3 downloads: antibody validation tables, the TCGA pan-cancer matrices
# and the CCLE files. Everything is stored below external_data/.

# 1. Antibody validation status
#    A. TCPA antibody validation status
request.urlretrieve(
    url="https://static-content.springer.com/esm/art%3A10.1038%2Fnmeth.2650/MediaObjects/41592_2013_BFnmeth2650_MOESM330_ESM.xlsx",
    filename=get_local_data_path(['jun_li_2013'], 'nmeth.2650-S2.xlsx'),
)

#    B. MD Anderson Cancer Center:
request.urlretrieve(
    url="https://www.mdanderson.org/content/dam/mdanderson/documents/core-facilities/Functional%20Proteomics%20RPPA%20Core%20Facility/RPPA_Standard_Ab_List_Updated.xlsx",
    filename=get_local_data_path(['mdanderson_cancer_center'], 'RPPA_Standard_Ab_List_Updated.xlsx'),
)

#    C. MD Anderson Cancer Center - Corrected Gene Names
request.urlretrieve(
    url="https://www.mdanderson.org/content/dam/mdanderson/documents/core-facilities/Functional%20Proteomics%20RPPA%20Core%20Facility/Corrected_Gene_Names.pdf",
    filename=get_local_data_path(['mdanderson_cancer_center'], 'Corrected_Gene_Names.pdf'),
)

# 2. Tumour studies
#    A. TCGA transcriptomics: download the gzipped matrix, then decompress it alongside.
pancan_rnaseq_gz = get_local_data_path(['usc_xena_browser'],
                                       'EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena.gz')
request.urlretrieve(
    url="https://tcga-pancan-atlas-hub.s3.us-east-1.amazonaws.com/download/EB%2B%2BAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena.gz",
    filename=pancan_rnaseq_gz,
)
extract_file(
    pancan_rnaseq_gz,
    get_local_data_path(['usc_xena_browser'], 'EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena'),
)

#    B. TCGA proteomics: same download-then-decompress pattern.
pancan_rppa_gz = get_local_data_path(['usc_xena_browser'], 'TCGA-RPPA-pancan-clean.xena.gz')
request.urlretrieve(
    url="https://tcga-pancan-atlas-hub.s3.us-east-1.amazonaws.com/download/TCGA-RPPA-pancan-clean.xena.gz",
    filename=pancan_rppa_gz,
)
extract_file(
    pancan_rppa_gz,
    get_local_data_path(['usc_xena_browser'], 'TCGA-RPPA-pancan-clean.xena'),
)

#    C. Mapping samples through GDC Reference file
request.urlretrieve(
    url="http://api.gdc.cancer.gov/data/fcbb373e-28d4-4818-92f3-601ede3da5e1",
    filename=get_local_data_path(['gdc_cancer_gov'], 'TCGA-RPPA-pancan-clean.txt'),
)

# 3. Cancer Cell Lines Encyclopedia (2020)
#    A. Transcriptomics
request.urlretrieve(
    url="https://ndownloader.figshare.com/files/25494389",
    filename=get_local_data_path(["depmap", "20Q4"], "CCLE_expression.csv"),
)

#    B. Cell Line Mapping File
request.urlretrieve(
    url="https://ndownloader.figshare.com/files/25494443",
    filename=get_local_data_path(["depmap", "20Q4"], "sample.csv"),
)

#    C. Proteomics Data - Mass spec
request.urlretrieve(
    url="https://gygi.hms.harvard.edu/data/ccle/Table_S2_Protein_Quant_Normalized.xlsx",
    filename=get_local_data_path(["nusinow_2020"], "Table_S2.xlsx"),
)

#    D. Proteomics Data - RPPA data
request.urlretrieve(
    url="https://depmap.org/portal/download/api/download?file_name=ccle%2FCCLE_RPPA_20180123.csv&bucket=depmap-external-downloads",
    filename=get_local_data_path(["depmap", "ProteinArray_RPPA"], "CCLE_RPPA_20180123.csv"),
)

#    E. Antibody validation status file
request.urlretrieve(
    url="https://depmap.org/portal/download/api/download?file_name=ccle%2FCCLE_RPPA_Ab_info_20180123.csv&bucket=depmap-external-downloads",
    filename=get_local_data_path(["depmap", "ProteinArray_RPPA"], "CCLE_RPPA_Ab_info_20180123.csv"),
)

### The jiang_2020 files below are the same ones already obtained for Project 1.
# ## 4. GTEx 32 Healthy Tissues (2020)
# ##    A. Transcriptomics Data
# request.urlretrieve("https://www.cell.com/cms/10.1016/j.cell.2020.08.036/attachment/bab7713d-6ee6-4d19-badc-aceb88535518/mmc4.xlsx", 
#               get_local_data_path(["jiang_2020"], "Table_S4.xlsx"))

# ##    B. Proteomics Data
# request.urlretrieve("https://www.cell.com/cms/10.1016/j.cell.2020.08.036/attachment/0d79a576-f9b7-4342-81a8-f80c14df0372/mmc3.xlsx", 
#               get_local_data_path(["jiang_2020"], "Table_S3.xlsx"))

## This file is difficult to access programmatically, so the call is left commented out.
# 5. Aggregated Protein Reproducibility
# request.urlretrieve("https://www.cell.com/cell-reports-methods/fulltext/S2667-2375(22)00170-9", 
#               get_local_data_path(["upadhya_ryan_2022"], "TableS2.xlsx"))

('external_data\\depmap\\ProteinArray_RPPA\\CCLE_RPPA_Ab_info_20180123.csv',
 <http.client.HTTPMessage at 0x25d99251580>)

## Chapter 4: 
#### Latent space models aid in the prediction of protein abundances in tumours

In [6]:
#### Chapter 4 downloads: LinkedOmics transcriptomics/proteomics matrices, sample
#### information, the Xena pan-cancer expression matrix, survival data, STRING and HGNC.

#### 1. Linkedomics Transcriptomics Data: 

## NOTE(review): the two TCGA files below are "*.cct.gz" remotely but are saved with a
## plain ".cct" name and are not decompressed - confirm the reading code expects gzip content.
request.urlretrieve("https://linkedomics.org/data_download/TCGA-BRCA/Human__TCGA_BRCA__UNC__RNAseq__HiSeq_RNA__01_28_2016__BI__Gene__Firehose_RSEM_log2.cct.gz", 
              get_local_data_path(['transcriptomics_data'], 'TCGA_BRCA_RNAseq_RSEM_log2.cct'))

request.urlretrieve("https://linkedomics.org/data_download/TCGA-OV/Human__TCGA_OV__UNC__RNAseq__HiSeq_RNA__01_28_2016__BI__Gene__Firehose_RSEM_log2.cct.gz", 
              get_local_data_path(['transcriptomics_data'], 'TCGA_OV_RNAseq_RSEM_log2.cct'))

request.urlretrieve("https://linkedomics.org/data_download/CPTAC-PDAC/mRNA_RSEM_UQ_log2_Tumor.cct", 
              get_local_data_path(['transcriptomics_data'], 'CPTAC_PDAC_mRNA_RSEM_UQ_log2_Tumor.cct'))

request.urlretrieve("https://linkedomics.org/data_download/CPTAC-BRCA/HS_CPTAC_BRCA_2018_RNA_GENE.cct", 
              get_local_data_path(['transcriptomics_data'], 'HS_CPTAC_BRCA_2018_RNA_GENE.cct'))

request.urlretrieve("https://linkedomics.org/data_download/CPTAC-OV/HS_CPTAC_OV_rnaseq_fpkm_log2.cct", 
              get_local_data_path(['transcriptomics_data'], 'HS_CPTAC_OV_rnaseq_fpkm_log2.cct'))

request.urlretrieve("https://linkedomics.org/data_download/CPTAC-CCRCC/HS_CPTAC_CCRCC_RNAseq_fpkm_log2_Tumor.cct", 
              get_local_data_path(['transcriptomics_data'], 'HS_CPTAC_CCRCC_RNAseq_fpkm_log2_Tumor.cct'))

request.urlretrieve("https://linkedomics.org/data_download/CPTAC-LUAD/HS_CPTAC_LUAD_rnaseq_uq_rpkm_log2_NArm_TUMOR.cct", 
              get_local_data_path(['transcriptomics_data'], 'HS_CPTAC_LUAD_rnaseq_uq_rpkm_log2_NArm_TUMOR.cct'))

request.urlretrieve("https://linkedomics.org/data_download/CPTAC-LSCC/HS_CPTAC_LSCC_2020_rnaseq_uq_fpkm_log2_NArm_TUMOR.cct", 
              get_local_data_path(['transcriptomics_data'], 'HS_CPTAC_LSCC_2020_rnaseq_uq_fpkm_log2_NArm_TUMOR.cct'))

request.urlretrieve("https://linkedomics.org/data_download/CPTAC-UCEC/HS_CPTAC_UCEC_RNAseq_RSEM_UQ_log2_Tumor.cct", 
              get_local_data_path(['transcriptomics_data'], 'HS_CPTAC_UCEC_RNAseq_RSEM_UQ_log2_Tumor.cct'))

request.urlretrieve("https://linkedomics.org/data_download/CPTAC-GBM/HS_CPTAC_GBM_rnaseq_fpkm_uq_log2.cct", 
              get_local_data_path(['transcriptomics_data'], 'HS_CPTAC_GBM_rnaseq_fpkm_uq_log2.cct'))

request.urlretrieve("https://linkedomics.org/data_download/CPTAC-HNSCC/HS_CPTAC_HNSCC_RNAseq_RSEM_UQ_log2_Tumor.cct", 
              get_local_data_path(['transcriptomics_data'], 'HS_CPTAC_HNSCC_RNAseq_RSEM_UQ_log2_Tumor.cct'))



## 2. Linkedomics Proteomics 

request.urlretrieve("https://linkedomics.org/data_download/CPTAC-BRCA/HS_CPTAC_BRCA_2018_Proteome_Ratio_Norm_gene_Median.cct", 
              get_local_data_path(['proteomics_data'], 'CPTAC_BRCA_2018_Proteome_gene_Median.cct'))


request.urlretrieve("https://linkedomics.org/data_download/CPTAC-OV/HS_CPTAC_OV_proteome_gene_tumor.cct", 
              get_local_data_path(['proteomics_data'], 'CPTAC_OV_proteome_gene_tumor.cct'))

request.urlretrieve("https://linkedomics.org/data_download/CPTAC-CCRCC/HS_CPTAC_CCRCC_proteome_Tumor.cct", 
              get_local_data_path(['proteomics_data'], 'CPTAC_CCRCC_proteome_Tumor.cct'))

request.urlretrieve("https://linkedomics.org/data_download/CPTAC-LUAD/HS_CPTAC_LUAD_proteome_ratio_NArm_TUMOR.cct", 
              get_local_data_path(['proteomics_data'], 'CPTAC_LUAD_proteome_ratio_NArm_TUMOR.cct'))

request.urlretrieve("https://linkedomics.org/data_download/CPTAC-LSCC/HS_CPTAC_LSCC_2020_proteome_ratio_NArm_TUMOR.cct", 
              get_local_data_path(['proteomics_data'], 'CPTAC_LSCC_2020_proteome_ratio_NArm_TUMOR.cct'))

request.urlretrieve("https://linkedomics.org/data_download/CPTAC-UCEC/HS_CPTAC_UCEC_Proteomics_TMT_gene_level_Tumor.cct", 
              get_local_data_path(['proteomics_data'], 'CPTAC_UCEC_Proteomics_TMT_gene_level_Tumor.cct'))

request.urlretrieve("https://linkedomics.org/data_download/CPTAC-GBM/HS_CPTAC_GBM_proteome_mssm_per_gene.cct", 
              get_local_data_path(['proteomics_data'], 'CPTAC_GBM_proteome_mssm_per_gene.cct'))

request.urlretrieve("https://linkedomics.org/data_download/CPTAC-HNSCC/HS_CPTAC_HNSCC_Proteomics_TMT_Gene_level_Tumor.cct", 
              get_local_data_path(['proteomics_data'], 'CPTAC_HNSCC_Proteomics_TMT_Gene_level_Tumor.cct'))

## Unlike the files above, the PDAC proteome is stored under tumour_studies/pdac.
request.urlretrieve("https://linkedomics.org/data_download/CPTAC-PDAC/proteomics_gene_level_MD_abundance_tumor.cct", 
              get_local_data_path(['tumour_studies','pdac'], 'proteomics_gene_level_MD_abundance_tumor.cct.txt'))


## already downloaded
# request.urlretrieve("https://static-content.springer.com/esm/art%3A10.1038%2Fnature18003/MediaObjects/41586_2016_BFnature18003_MOESM111_ESM.zip", 
#               get_local_data_path(["tumour_studies", "brca", "mertins_2016"], "BrCa_2016_Supplemental_Tables.zip"))

# extract_file(get_local_data_path(["tumour_studies", "brca", "mertins_2016"], "BrCa_2016_Supplemental_Tables.zip"), 
#              get_local_data_path(["tumour_studies", "brca", "mertins_2016"]))

# shutil.move(get_local_data_path(["tumour_studies", "brca", "mertins_2016", "nature18003-s2"], 
#                                 "CPTAC_BC_SupplementaryTable03.xlsx"), 
#             get_local_data_path(["tumour_studies", "brca", "mertins_2016"], 
#                                 "CPTAC_BC_SupplementaryTable03.xlsx"))

## Returns an error when accessed programmatically, so it needs to be downloaded manually
# request.urlretrieve("https://www.cell.com/cms/10.1016/j.cell.2016.05.069/attachment/15e46617-bec0-42cc-82fb-71f842e8aaac/mmc3.xlsx", 
#               get_local_data_path(['tumour_studies','ovca', 'zhang_2016'], 'Table_S2.xlsx'))

## 3. Sample Info
request.urlretrieve("https://raw.githubusercontent.com/SwathiRUpadhya/PhD_thesis/main/other_data/brca_clinical_data/combined_study_clinical_data.tsv", 
              get_local_data_path(['tumour_studies','brca', 'mertins_2016'], 
                                         'combined_study_clinical_data.tsv'))

# request.urlretrieve("https://www.cell.com/cms/10.1016/j.cell.2016.05.069/attachment/04331724-1bc2-486b-819c-c38972d50492/mmc2.xlsx", 
#               get_local_data_path(['tumour_studies','ovca', 'zhang_2016'], 'Table_S1.xlsx'))

## 4. UCSC Xena - batch corrected gene expression
## Dataset landing page (for reference only):
##   https://xenabrowser.net/datapages/?dataset=EB%2B%2BAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena&host=https%3A%2F%2Fpancanatlas.xenahubs.net
## The landing page is a web page, not the data file, so saving it to the ".xena" path
## would store a web page instead of the expression matrix. Download the gzipped matrix
## from the Pan-Cancer Atlas hub and decompress it to the ".xena" path instead.
request.urlretrieve("https://tcga-pancan-atlas-hub.s3.us-east-1.amazonaws.com/download/EB%2B%2BAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena.gz", 
                     get_local_data_path(['usc_xena_browser'], 'EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena.gz'))
extract_file(get_local_data_path(['usc_xena_browser'], 'EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena.gz'), 
             get_local_data_path(['usc_xena_browser'], 'EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena'))

## 5. TCGA survival analysis data
request.urlretrieve("https://figshare.com/ndownloader/files/41810850", 
                    get_local_data_path(['usc_xena_browser'], 'Survival_SupplementalTable_S1_20171025_xena_sp'))

## 6. STRING DB: each gzipped table is downloaded and then decompressed next to it.
request.urlretrieve("https://stringdb-downloads.org/download/protein.links.v11.5/9606.protein.links.v11.5.txt.gz", 
                    get_local_data_path(['stringdb'], '9606.protein.links.v11.5.txt.gz'))
extract_file(get_local_data_path(['stringdb'], '9606.protein.links.v11.5.txt.gz'), 
             get_local_data_path(['stringdb'], '9606.protein.links.v11.5.txt'))

request.urlretrieve("https://stringdb-downloads.org/download/protein.physical.links.v11.5/9606.protein.physical.links.v11.5.txt.gz", 
                    get_local_data_path(['stringdb'], '9606.protein.physical.links.v11.5.txt.gz'))
extract_file(get_local_data_path(['stringdb'], '9606.protein.physical.links.v11.5.txt.gz'), 
             get_local_data_path(['stringdb'], '9606.protein.physical.links.v11.5.txt'))

request.urlretrieve("https://stringdb-downloads.org/download/protein.info.v11.5/9606.protein.info.v11.5.txt.gz", 
                    get_local_data_path(['stringdb'], '9606.protein.info.v11.5.txt.gz'))
extract_file(get_local_data_path(['stringdb'], '9606.protein.info.v11.5.txt.gz'), 
             get_local_data_path(['stringdb'], '9606.protein.info.v11.5.txt'))

## 7. Approved protein-encoding genes:
request.urlretrieve("https://raw.githubusercontent.com/SwathiRUpadhya/PhD_thesis/main/other_data/hgnc/approved_protein_coding_genes.txt", 
                    get_local_data_path(['hgnc_22Feb2022'], 'approved_protein_coding_genes.txt'))


('external_data\\hgnc_22Feb2022\\approved_protein_coding_genes.txt',
 <http.client.HTTPMessage at 0x25d992d2130>)