## Firehose Dataset Parser

Official website of Firehose: http://firebrowse.org/
Official website of the Broad Institute, from which all the datasets can be downloaded: http://gdac.broadinstitute.org/#

This tool automates downloading datasets from the Broad Institute's official site. Choose a cancer type from the list of available cancer types, and the tool will download the datasets for that cancer type.

In [72]:
# Import all the necessary libraries
import ipywidgets as widgets
from IPython.display import display
import requests
import os

In [73]:
# Every cancer type offered on the Broad Institute's official site,
# one (full name, short code) pair per entry.
cancers = [
    ('Adrenocortical carcinoma', 'ACC'),
    ('Bladder Urothelial Carcinoma', 'BLCA'),
    ('Breast invasive carcinoma', 'BRCA'),
    ('Cervical squamous cell carcinoma and endocervical adenocarcinoma', 'CESC'),
    ('Cholangiocarcinoma', 'CHOL'),
    ('Colon adenocarcinoma', 'COAD'),
    ('Colorectal adenocarcinoma', 'COADREAD'),
    ('Lymphoid Neoplasm Diffuse Large B-cell Lymphoma', 'DLBC'),
    ('Esophageal carcinoma', 'ESCA'),
    ('FFPE Pilot Phase II', 'FPPP'),
    ('Glioblastoma multiforme', 'GBM'),
    ('Glioma', 'GBMLGG'),
    ('Head and Neck squamous cell carcinoma', 'HNSC'),
    ('Kidney Chromophobe', 'KICH'),
    ('Pan-kidney cohort (KICH+KIRC+KIRP)', 'KIPAN'),
    ('Kidney renal clear cell carcinoma', 'KIRC'),
    ('Kidney renal papillary cell carcinoma', 'KIRP'),
    ('Acute Myeloid Leukemia', 'LAML'),
    ('Brain Lower Grade Glioma', 'LGG'),
    ('Liver hepatocellular carcinoma', 'LIHC'),
    ('Lung adenocarcinoma', 'LUAD'),
    ('Lung squamous cell carcinoma', 'LUSC'),
    ('Mesothelioma', 'MESO'),
    ('Ovarian serous cystadenocarcinoma', 'OV'),
    ('Pancreatic adenocarcinoma', 'PAAD'),
    ('Pheochromocytoma and Paraganglioma', 'PCPG'),
    ('Prostate adenocarcinoma', 'PRAD'),
    ('Rectum adenocarcinoma', 'READ'),
    ('Sarcoma', 'SARC'),
    ('Skin Cutaneous Melanoma', 'SKCM'),
    ('Stomach adenocarcinoma', 'STAD'),
    ('Stomach and Esophageal carcinoma', 'STES'),
    ('Testicular Germ Cell Tumors', 'TGCT'),
    ('Thyroid carcinoma', 'THCA'),
    ('Thymoma', 'THYM'),
    ('Uterine Corpus Endometrial Carcinoma', 'UCEC'),
    ('Uterine Carcinosarcoma', 'UCS'),
    ('Uveal Melanoma', 'UVM'),
]

In [93]:
# Function for safe directory creation
def create_dir(path):
    """Create the directory ``path`` without raising on filesystem errors.

    Missing parent directories are created as well. When the directory is
    already present a notice is printed. Any other filesystem failure
    (permission denied, a regular file occupying the path, ...) is printed
    with its actual cause rather than being reported as "already exist".

    Args:
        path: Directory path to create (relative or absolute).
    """
    try:
        os.makedirs(path)
    except FileExistsError:
        # The leaf exists: tell a real directory apart from a file that
        # merely blocks the name.
        if os.path.isdir(path):
            print('Directory already exist!')
        else:
            print('Cannot create directory %s: a file with that name exists' % path)
    except OSError as ex:
        print('Cannot create directory %s: %s' % (path, ex))

In [95]:
# Prepare directories here
# Create the data/ directory (relative to the working directory) if it does not exist.
# All datasets will be placed in this directory.
create_dir('data')

Directory already exist!


In [105]:
# Helper function to download a file and save it into the data/ directory
def download_data(url, timeout=60):
    """Download ``url`` and save its content below the ``data/`` directory.

    The last three '/'-separated segments of the URL decide the location:
    a URL ending in ``<a>/<b>/<c>`` is written to ``data/<a>/<a>__<b>__<c>``.
    The target directory and the target file path are printed as progress
    output. Failures are printed, not raised.

    Args:
        url: Address of the file to download.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block forever.
    """
    sp_url = url.split('/')  # Url Array
    if len(sp_url) < 3:
        # Not enough segments to derive the directory and file name from.
        print("Unexpected URL format: %s" % url)
        return

    # Create directory for particular cancer
    cancer_path = 'data/' + sp_url[-3]
    print(cancer_path)
    create_dir(cancer_path)

    try:
        # Request instance for downloading file
        req = requests.get(url, allow_redirects=True, timeout=timeout)
        # Without this check the body of a 404/500 error page would be
        # saved to disk as if it were the requested dataset.
        req.raise_for_status()
    except requests.RequestException as ex:
        print("File not found! %s" % ex)
        return

    # Format file name
    file_name = '__'.join(sp_url[-3:])

    # Setup file path
    data_path = cancer_path + '/' + file_name
    print(data_path)
    try:
        # The with-block closes the file handle even if the write fails.
        with open(data_path, 'wb') as out_file:
            out_file.write(req.content)
    except OSError as ex:
        print("Could not save %s: %s" % (data_path, ex))

In [110]:
def on_change(change):
    """Observer callback: download the CopyNumber_Gistic2 report files for the selected cancer.

    Only notifications with ``type == 'change'`` and ``name == 'value'`` are
    acted upon; anything else is ignored. For an accepted notification the
    new value is inserted into the report URLs of the 2016_01_28 analyses
    run and three text files are passed to ``download_data``:
    ``del_genes.conf_99.txt``, ``amp_genes.conf_99.txt`` and
    ``broad_significance_results.txt``.

    Args:
        change: Change dictionary delivered by the widget; the keys
            ``'type'``, ``'name'`` and ``'new'`` are read.
    """
    # Guard clause: skip notifications that are not a change of the value.
    if change['type'] != 'change' or change['name'] != 'value':
        return

    selected_cancer = change['new']

    # All three report files live in the same folder, so build it once.
    reports_url = (
        "http://gdac.broadinstitute.org/runs/analyses__2016_01_28/reports/cancer/"
        + selected_cancer
        + "-TP/CopyNumber_Gistic2/"
    )
    del_url = reports_url + "del_genes.conf_99.txt"
    amp_url = reports_url + "amp_genes.conf_99.txt"
    arm_url = reports_url + "broad_significance_results.txt"

    print("changed to %s" % selected_cancer)
    print(del_url)
    print(amp_url)

    for report_url in (del_url, amp_url, arm_url):
        download_data(report_url)
        

In [111]:
# Dropdown offering every entry of `cancers`; 'GBM' is selected initially.
w = widgets.Dropdown(
    options=cancers,
    value='GBM',
    description='Cancers DropDown List',
)

# Subscribe to the `value` trait only, so the callback is not invoked for
# changes of other traits of the widget.
w.observe(on_change, names='value')

display(w)

Dropdown(description='Cancers DropDown List', index=10, options=(('Adrenocortical carcinoma', 'ACC'), ('Bladde…

changed to BRCA
http://gdac.broadinstitute.org/runs/analyses__2016_01_28/reports/cancer/BRCA-TP/CopyNumber_Gistic2/del_genes.conf_99.txt
http://gdac.broadinstitute.org/runs/analyses__2016_01_28/reports/cancer/BRCA-TP/CopyNumber_Gistic2/amp_genes.conf_99.txt
data/BRCA-TP
Directory already exist!
data/BRCA-TP/BRCA-TP__CopyNumber_Gistic2__del_genes.conf_99.txt
data/BRCA-TP
Directory already exist!
data/BRCA-TP/BRCA-TP__CopyNumber_Gistic2__amp_genes.conf_99.txt
data/BRCA-TP
Directory already exist!
data/BRCA-TP/BRCA-TP__CopyNumber_Gistic2__broad_significance_results.txt
