## Firehose Dataset Parser

Official website of Firehose http://firebrowse.org/
Official website of Broad Institute from where we can download all the datasets http://gdac.broadinstitute.org/#

This tool automates downloading datasets from the Broad Institute's official site. Choose a cancer type from the list of available types, and the tool downloads the datasets for that cancer type.

In [None]:
# Import all the necessary libraries
import ipywidgets as widgets
from IPython.display import display
import requests
import os

In [None]:
# Cancer cohorts listed on the Broad Institute's official site, stored as
# (full cancer name, cohort code) tuples.
cancers = [
    ('Adrenocortical carcinoma', 'ACC'),
    ('Bladder Urothelial Carcinoma', 'BLCA'),
    ('Breast invasive carcinoma', 'BRCA'),
    ('Cervical squamous cell carcinoma and endocervical adenocarcinoma', 'CESC'),
    ('Cholangiocarcinoma', 'CHOL'),
    ('Colon adenocarcinoma', 'COAD'),
    ('Colorectal adenocarcinoma', 'COADREAD'),
    ('Lymphoid Neoplasm Diffuse Large B-cell Lymphoma', 'DLBC'),
    ('Esophageal carcinoma', 'ESCA'),
    ('FFPE Pilot Phase II', 'FPPP'),
    ('Glioblastoma multiforme', 'GBM'),
    ('Glioma', 'GBMLGG'),
    ('Head and Neck squamous cell carcinoma', 'HNSC'),
    ('Kidney Chromophobe', 'KICH'),
    ('Pan-kidney cohort (KICH+KIRC+KIRP)', 'KIPAN'),
    ('Kidney renal clear cell carcinoma', 'KIRC'),
    ('Kidney renal papillary cell carcinoma', 'KIRP'),
    ('Acute Myeloid Leukemia', 'LAML'),
    ('Brain Lower Grade Glioma', 'LGG'),
    ('Liver hepatocellular carcinoma', 'LIHC'),
    ('Lung adenocarcinoma', 'LUAD'),
    ('Lung squamous cell carcinoma', 'LUSC'),
    ('Mesothelioma', 'MESO'),
    ('Ovarian serous cystadenocarcinoma', 'OV'),
    ('Pancreatic adenocarcinoma', 'PAAD'),
    ('Pheochromocytoma and Paraganglioma', 'PCPG'),
    ('Prostate adenocarcinoma', 'PRAD'),
    ('Rectum adenocarcinoma', 'READ'),
    ('Sarcoma', 'SARC'),
    ('Skin Cutaneous Melanoma', 'SKCM'),
    ('Stomach adenocarcinoma', 'STAD'),
    ('Stomach and Esophageal carcinoma', 'STES'),
    ('Testicular Germ Cell Tumors', 'TGCT'),
    ('Thyroid carcinoma', 'THCA'),
    ('Thymoma', 'THYM'),
    ('Uterine Corpus Endometrial Carcinoma', 'UCEC'),
    ('Uterine Carcinosarcoma', 'UCS'),
    ('Uveal Melanoma', 'UVM'),
]

In [None]:
# Function for safe directory creation
def create_dir(path):
    """Create the directory ``path`` unless it already exists.

    Only the last path component is created (``os.mkdir``), so the parent
    directory must already be present. Filesystem failures are reported on
    stdout instead of being raised, so a notebook run is not interrupted.

    Args:
        path: Directory to create.
    """
    try:
        os.mkdir(path)
    except FileExistsError:
        if os.path.isdir(path):
            print('Directory already exist!')
        else:
            # Something that is not a directory already occupies the path.
            print('Could not create directory %s: path exists and is not a directory' % path)
    except OSError as ex:
        # Missing parent, permission denied, etc. These used to be reported
        # as "already exist", which hid the real cause.
        print('Could not create directory %s: %s' % (path, ex))

In [None]:
# Prepare the download directory.
# Create the top-level 'data' directory if it does not exist yet; all
# downloaded datasets are stored beneath it.
create_dir('data')

### The download_data(url) function downloads a file using Python's requests library
All data files are saved in the /data directory

In [None]:
# Helper function to download file and save into /data directory
def download_data(url):
    """Download ``url`` and store it under ``data/<third-from-last URL segment>/``.

    The last three ``/``-separated segments of the URL are joined with ``__``
    to form the local file name, and the third-from-last segment is also used
    as the name of the sub-directory inside ``data``. Failures are reported on
    stdout and never raised.

    Args:
        url: Full download URL; it needs at least three ``/``-separated parts.
    """
    sp_url = url.split('/')  # Url Array
    if len(sp_url) < 3:
        # Not enough segments to derive the directory and file name.
        print("File not found!")
        return

    # Create directory for particular cancer
    cancer_path = 'data/' + sp_url[-3]
    print(cancer_path)
    create_dir(cancer_path)

    try:
        # A timeout keeps an unresponsive server from hanging the notebook.
        req = requests.get(url, allow_redirects=True, timeout=60)
        # Without this check a 404/500 error page would be saved as data.
        req.raise_for_status()
    except requests.exceptions.RequestException:
        print("File not found!")
        return

    # Format file name
    file_name = sp_url[-3] + '__' + sp_url[-2] + '__' + sp_url[-1]

    # Setup file path
    data_path = cancer_path + '/' + file_name
    print(data_path)
    try:
        # The context manager guarantees the handle is flushed and closed.
        with open(data_path, 'wb') as out_file:
            out_file.write(req.content)
    except OSError as ex:
        print("Could not write %s: %s" % (data_path, ex))

### The download URLs follow a common pattern
Using that common pattern, we can construct the download URLs for the different datasets

In [None]:
def download_copy_number(selected_cancer):
    """Print and download the CopyNumber_Gistic2 files for one cancer cohort.

    Builds the Level_4 archive URL and four report URLs for
    ``<selected_cancer>-TP`` in the 2016_01_28 analyses run, prints all five,
    then passes each to ``download_data`` (reports first, archive last).

    Args:
        selected_cancer: Cohort code as a string, e.g. ``'GBM'``.
    """
    cohort = selected_cancer + "-TP"
    run_root = "http://gdac.broadinstitute.org/runs/analyses__2016_01_28"
    report_root = "{0}/reports/cancer/{1}/CopyNumber_Gistic2/".format(run_root, cohort)

    archive_url = (
        "{0}/data/{1}/20160128/gdac.broadinstitute.org_{1}"
        ".CopyNumber_Gistic2.Level_4.2016012800.0.0.tar.gz"
    ).format(run_root, cohort)

    report_urls = [
        report_root + report_name
        for report_name in (
            "del_genes.conf_99.txt",
            "amp_genes.conf_99.txt",
            "broad_significance_results.txt",
            "broad_values_by_arm.txt",
        )
    ]

    # The archive is announced first but fetched last.
    for link in [archive_url] + report_urls:
        print(link)

    for link in report_urls + [archive_url]:
        download_data(link)
    

### The Mutation Analysis datasets follow a similar URL pattern

In [None]:
def download_mutsig(selected_cancer):
    """Print and download the MutSigNozzleReport2CV files for one cancer cohort.

    Builds the Level_4 archive URL and the ``sig_genes.txt`` report URL for
    ``<selected_cancer>-TP`` in the 2016_01_28 analyses run, prints both, then
    passes each to ``download_data`` (report first, archive second).

    Args:
        selected_cancer: Cohort code as a string, e.g. ``'GBM'``.
    """
    cohort = selected_cancer + "-TP"
    run_root = "http://gdac.broadinstitute.org/runs/analyses__2016_01_28"

    archive_url = (
        "{0}/data/{1}/20160128/gdac.broadinstitute.org_{1}"
        ".MutSigNozzleReport2CV.Level_4.2016012800.0.0.tar.gz"
    ).format(run_root, cohort)
    sig_genes_url = "{0}/reports/cancer/{1}/MutSigNozzleReport2CV/sig_genes.txt".format(
        run_root, cohort
    )

    # The archive is announced first but fetched last.
    for link in (archive_url, sig_genes_url):
        print(link)

    for link in (sig_genes_url, archive_url):
        download_data(link)
    

### on_change(change) is an event handler function
This function is triggered when the user changes the cancer type in the dropdown menu. As soon as the user changes the selection, it downloads the corresponding datasets.

In [None]:
def on_change(change):
    """React to a widget notification by downloading the newly selected cohort.

    Notifications whose ``'type'`` is not ``'change'`` or whose ``'name'`` is
    not ``'value'`` are ignored. Otherwise the entry under ``'new'`` is
    printed and handed to ``download_copy_number`` and ``download_mutsig``.

    Args:
        change: Mapping describing the notification; the keys read here are
            ``'type'``, ``'name'`` and ``'new'``.
    """
    # Guard clause: bail out early for anything that is not a value change.
    if change['type'] != 'change' or change['name'] != 'value':
        return

    selected_cancer = change['new']
    print("changed to %s" % selected_cancer)

    download_copy_number(selected_cancer)
    download_mutsig(selected_cancer)

Now let's create a dropdown menu for all the types of cancers available in broad firehose website

In [None]:
# Dropdown built from the (cancer name, cohort code) tuples in `cancers`;
# the widget is created with the code 'GBM' as its value.
w = widgets.Dropdown(
    options = cancers,
    value='GBM',
    description='Cancers DropDown List',
)

# Register on_change as the observer. on_change itself ignores every
# notification except a 'change' of the 'value' attribute.
w.observe(on_change)

# Show the dropdown in the notebook output.
display(w)

## Using pandas and NumPy

In [None]:
import pandas as pd
import numpy as np

In [32]:
# Load the TGCT normalized RNA-seq table with every cell kept as text,
# splitting columns on runs of whitespace and treating no row as a header.
# The separator is a raw string so the regex reaches pandas unchanged; in a
# plain string literal the backslash-s pair is an invalid escape sequence.
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in
# 2.0 - on newer pandas replace it with on_bad_lines='skip'.
df = pd.read_csv(
    'data/TGCT.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt',
    sep=r'\s+',
    header=None,
    error_bad_lines=False,
    index_col=False,
    dtype='unicode',
)
df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,147,148,149,150,151,152,153,154,155,156
0,HybridizationREF,TCGA-2G-AAEW-01A-11R-A430-07,TCGA-2G-AAEX-01A-11R-A430-07,TCGA-2G-AAF1-01A-11R-A430-07,TCGA-2G-AAF4-01A-11R-A430-07,TCGA-2G-AAF6-01A-11R-A430-07,TCGA-2G-AAF8-01A-11R-A430-07,TCGA-2G-AAFE-01A-11R-A430-07,TCGA-2G-AAFG-05A-11R-A430-07,TCGA-2G-AAFG-01A-11R-A430-07,...,TCGA-YU-AA4L-01A-11R-A431-07,TCGA-YU-AA61-01A-11R-A431-07,TCGA-ZM-AA05-01A-12R-A431-07,TCGA-ZM-AA06-01A-12R-A431-07,TCGA-ZM-AA0B-01A-11R-A431-07,TCGA-ZM-AA0D-01A-11R-A431-07,TCGA-ZM-AA0E-01A-12R-A431-07,TCGA-ZM-AA0F-01A-21R-A431-07,TCGA-ZM-AA0H-01A-11R-A431-07,TCGA-ZM-AA0N-01A-21R-A431-07
1,gene_id,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,...,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count
2,?|100130426,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.4331,0.0000,0.0000,...,0.0000,0.0000,2.5940,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
3,?|100133144,37.5747,18.1565,32.7207,53.4791,26.0882,23.5959,28.1624,17.6078,27.4481,...,4.9700,10.7963,5.0324,9.5604,4.0650,8.2486,4.0404,0.0000,4.5530,10.7214
4,?|100134869,25.9327,6.7824,23.4881,37.5214,13.7144,8.6863,16.7499,13.4929,12.2404,...,15.7610,6.5147,2.7497,10.5861,1.3550,4.4586,0.0000,8.7549,2.2652,7.0366


In [None]:
# Peek at the first three raw lines of the expression table. The context
# manager closes the file handle, which a bare open() call leaves open.
with open("data/TGCT.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt", "r") as f1:
    print(f1.readline())
    print(f1.readline())
    print(f1.readline())