In [1]:
import requests
import json
import re
import gzip
import pandas as pd
import tarfile
import os
import sys
import shutil
import numpy as np
import tarfile
import io
from io import StringIO

In [2]:
#Generates a folder to store the data portal gene expression data if none exists.
#The path components must be joined before calling makedirs: its second positional
#argument is the permission mode, not another path component.
os.makedirs(os.path.join(os.getcwd(), "data"), exist_ok=True)

In [74]:
class gdc_data:
    '''
    Creates data objects that can query the gdc data portal for gene expression data,
    write compressed data from portal to disk, and uncompress and store the gene expression
    data in a pandas dataframe (gene x sample file).

    The name passed to the constructor is a TCGA project code optionally combined with
    the number of samples wanted, Ex. "LIHC" or "LIHC10". All files are stored under
    <current working directory>/data/<name>.
    '''

    #Endpoints of the GDC REST API
    files_endpt = "https://api.gdc.cancer.gov/files"
    data_endpt = "https://api.gdc.cancer.gov/data"
    #Number of files requested when the name does not contain a sample count
    default_size = 2000

    def __init__(self, name):
        '''
        name: project code with an optional sample count, Ex. "LIHC" or "LIHC10"
        '''
        #Initialize the type of cancer for the database query and the size of query
        self.name = name
        #Initialize gene expression data matrix
        self.data = pd.DataFrame()
        #Initialize empty manifest data matrix
        self.manifest = pd.DataFrame()
        #Initialize the location for the data directory
        self.main_dir = os.path.join(os.getcwd(),"data")
        self.query_dir = os.path.join(self.main_dir,self.name)
        #Initialize empty file name
        self.file_name = ''
        #Initialize an empty http response
        self.response = ''
        #Initialize variable for size of query (replaced by the data shape in data_save)
        self.size = ''

    def data_query(self):
        '''
        Performs a query of the NCI genomic portal given a type of cancer.
        Ex. Type: Hepatocellular Carcinoma - LIHC
        Name followed by no. of samples desired. Ex. LIHC10 requests the first 10 files.
        Without a number, up to default_size files are requested.

        Stores the name of the compressed tar.gz file in self.file_name and the http
        response holding the archive bytes in self.response. Returns nothing.

        Raises requests.HTTPError when the portal answers with an error status and
        ValueError when no files match or the download carries no file name.
        '''
        #Letters of the name are the project code, digits are the number of files wanted
        cancer = ''.join(x for x in self.name if not x.isdigit())
        digits = ''.join(x for x in self.name if x.isdigit())
        #If the size of the query was not specified, acquire data for all samples
        size = int(digits) if digits else self.default_size

        #Filters for the query, receiving all RNA-Seq, HTSeq-Count files for a specific cancer
        filters = {
            "op": "and",
            "content":[
                {
                "op": "in",
                "content":{
                    "field": "cases.project.project_id",
                    "value": ["TCGA-"+cancer]
                    }
                },
                {
                "op": "in",
                "content":{
                    "field": "files.experimental_strategy",
                    "value": ["RNA-Seq"]
                    }
                },
                {
                "op": "in",
                "content":{
                    "field": "files.analysis.workflow_type",
                    "value": ["HTSeq - Counts"]
                    }
                }
            ]
        }

        # Here a GET is used, so the filter parameters should be passed as a JSON string.
        params = {
            "filters": json.dumps(filters),
            "fields": "file_id",
            "format": "JSON",
            "size": size  #Number of files parsed from the name, or default_size
            }

        response = requests.get(self.files_endpt, params = params)
        #Fail here on an http error instead of on a confusing KeyError further down
        response.raise_for_status()

        # This step populates the download list with the file_ids from the previous query
        file_uuid_list = [file_entry["file_id"] for file_entry in response.json()["data"]["hits"]]
        if not file_uuid_list:
            raise ValueError("No HTSeq - Counts files found for project TCGA-" + cancer)

        params = {"ids": file_uuid_list}
        #Acquire the compressed data from the data portal
        response = requests.post(self.data_endpt, data = json.dumps(params), headers = {"Content-Type": "application/json"})
        response.raise_for_status()

        #Acquire the name of the file
        response_head_cd = response.headers.get("Content-Disposition", "")
        matches = re.findall("filename=(.+)", response_head_cd)
        if not matches:
            raise ValueError("The data portal response does not contain a file name")
        #The name comes from the server: drop quotes and any directory part before using it in a path
        file_name = os.path.basename(matches[0].strip().strip('"'))

        self.file_name = file_name
        self.response = response

    def _write_archive(self):
        #Writes the bytes of the stored http response to <query_dir>/<file_name>, returns that path
        os.makedirs(self.query_dir, exist_ok=True)
        targz = os.path.join(self.query_dir,self.file_name)
        with open(targz, "wb") as output_file:
            output_file.write(self.response.content)
        return targz

    def _find_archive(self):
        #Returns the name of the newest tar.gz file in the query directory, or '' if there is none
        if not os.path.isdir(self.query_dir):
            return ''
        archives = [x for x in os.listdir(self.query_dir) if x.endswith('tar.gz')]
        if not archives:
            return ''
        return max(archives, key=lambda x: os.path.getmtime(os.path.join(self.query_dir,x)))

    def data_write(self):
        '''
        Writes the queried tar.gz archive to the query directory, then uncompresses it
        and loads the gene expression data (see data_write_targz).
        '''
        #Both the file name and the response are needed to write the archive,
        #so perform the data query if either one has not been populated yet
        if not self.file_name or not self.response:
            self.data_query()

        self._write_archive()
        self.data_write_targz()

    def data_write_targz(self):
        '''
        Uncompresses a targz file under the query directory, writes the contained
        gene expression files to disk as csv, and loads them with data_save.

        If no archive name is known, a tar.gz file already present in the query
        directory is used; if there is none, the data is queried and downloaded first.
        '''
        #Check if the filename has been populated, if not then check if it exists in the query directory
        if not self.file_name:
            self.file_name = self._find_archive()

        #desired location of compressed data targz file
        targz = os.path.join(self.query_dir,self.file_name) if self.file_name else ''
        if not targz or not os.path.isfile(targz):
            #No archive on disk: download (only if nothing has been downloaded yet) and write it
            if not self.file_name or not self.response:
                self.data_query()
            targz = self._write_archive()

        #Create a path for the uncompressed targz files if it doesnt exist already
        uncomp_targz_dir = os.path.join(self.query_dir,"uncompressed_targz")
        os.makedirs(uncomp_targz_dir, exist_ok=True)
        #Unzips the tar.gz file into desired folder
        with tarfile.open(targz) as tar:
            if hasattr(tarfile, "data_filter"):
                #The archive is downloaded content: refuse members that would escape the folder
                tar.extractall(uncomp_targz_dir, filter="data")
            else:
                #Older interpreters do not support extraction filters
                tar.extractall(uncomp_targz_dir)

        #Create a path for the uncompressed gz files if it doesnt exist already
        uncomp_gz_dir = os.path.join(uncomp_targz_dir,"uncompressed_gz")
        os.makedirs(uncomp_gz_dir, exist_ok=True)

        #Unzips all gz gene expression files (names ending in "s.gz") in the query directory
        for subdir, dirs, files in os.walk(self.query_dir):
            for file in files:
                if file.endswith("s.gz"):
                    df = pd.read_csv(os.path.join(subdir,file),sep="\t",header=None,compression="gzip").set_index(0)
                    #Stored without the ".gz" ending, comma separated and without a header row
                    df.to_csv(os.path.join(uncomp_gz_dir,file[:-3]),header=False,sep=",",index=True)

        #Loads the manifest and the gene expression matrix
        self.data_save()

    def data_save(self):
        '''
        Loads the manifest and the uncompressed gene expression files of the query
        directory into self.manifest and self.data (one column per file, named after
        the file), and stores the dimensions of the data matrix in self.size.
        '''
        uncomp_targz_dir = os.path.join(self.query_dir,"uncompressed_targz")
        uncomp_gz_dir = os.path.join(uncomp_targz_dir,"uncompressed_gz")
        #Stores the manifest of the data
        self.manifest = pd.read_csv(os.path.join(uncomp_targz_dir,"MANIFEST.txt"),sep="\t")

        #Read the files from the folder they were written to, in sorted order
        #so that the column order does not depend on the file system
        frames = []
        if os.path.isdir(uncomp_gz_dir):
            for file in sorted(os.listdir(uncomp_gz_dir)):
                if file.endswith("unts"):
                    df = pd.read_csv(os.path.join(uncomp_gz_dir,file),sep=",",header=None).set_index(0)
                    df.columns = [file]
                    frames.append(df)

        #One concat for all files instead of growing the matrix inside the loop
        self.data = pd.concat(frames,axis=1) if frames else pd.DataFrame()
        self.size = self.data.shape #Store the dimensions of the data matrix

    def data_add(self,data):
        '''
        Replaces the gene expression data matrix with the given data. self.size is not updated.
        '''
        self.data = data

In [83]:
#Data object for the LIHC project; the name carries no digits, so no sample count is specified
test = gdc_data("LIHC")

In [84]:
#Uncompress the archive of the query directory (querying the portal if none is found) and load the data
test.data_write_targz()

In [85]:
#Preview the first 10 rows and first 50 columns of the gene expression matrix
test.data.iloc[:10,:50]

Unnamed: 0_level_0,004c60cf-c08e-49df-b4ce-baca41e11250.htseq.counts,0069f64b-8d8f-4426-968d-23483929ee58.htseq.counts,014b9b85-3128-416b-93d4-7ace3b676d4e.htseq.counts,03011a57-3e95-49d1-a927-cff4111d2d5b.htseq.counts,0415a9b4-a58d-4641-ab1a-927ed7a04824.htseq.counts,047bd029-d63b-4f25-8a73-b95ad72d434f.htseq.counts,04dc4da1-1d1a-46da-a9d8-da9964591aec.htseq.counts,04e7f1a4-3173-4a6f-af60-04e1f2e29868.htseq.counts,05ac7b05-e459-4833-97fc-530185a7a55f.htseq.counts,05efced9-d60f-43ea-9b61-c6efaf902e7d.htseq.counts,...,16bbd77f-b39b-4f19-9d6b-58a21dde3e84.htseq.counts,16fc8611-259d-4cd8-9e93-60d97bebb6bf.htseq.counts,179c5822-3d5a-42b9-adeb-46a2fefd3df8.htseq.counts,18c32c46-00f9-4437-bdbb-05233fdd676e.htseq.counts,18e687c3-7d76-4a4d-a665-95331732ef9c.htseq.counts,198ca92e-342e-4890-8ec3-45b045711531.htseq.counts,19f2bfb6-33ba-4aea-9281-c5009d539562.htseq.counts,1a228668-b2a0-469f-8801-fdc8be449b44.htseq.counts,1b73c4fc-b155-44ef-b91a-ef7dfdb6a5e6.htseq.counts,1b78239d-edb0-4a9c-9036-49509ae0bcb9.htseq.counts
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003.13,2155,3871,5085,6070,3231,6762,2285,3395,6637,3560,...,2422,5578,3704,9069,4838,6708,5459,8909,3053,3616
ENSG00000000005.5,1,4,16,1,0,0,1,1,1,2,...,0,0,3,3,0,1,0,1,0,0
ENSG00000000419.11,405,1133,1326,1047,1130,1079,1105,1302,476,1174,...,380,1681,1101,3776,1919,1494,1220,2590,1392,472
ENSG00000000457.12,170,727,587,327,638,834,585,853,184,461,...,302,958,913,1302,1184,714,533,759,885,621
ENSG00000000460.15,49,167,1528,235,340,204,119,451,91,127,...,84,480,622,251,526,354,165,696,636,82
ENSG00000000938.11,23,77,275,246,306,222,30,315,116,196,...,147,267,99,116,328,458,134,825,81,98
ENSG00000000971.14,18706,91848,120808,148950,23153,32093,57084,93174,33089,58283,...,44404,148451,119274,53085,29285,43721,64706,36911,70139,15986
ENSG00000001036.12,3365,4954,3721,3148,3497,5544,1903,3407,1423,3904,...,1389,5978,444,6335,3545,4212,3015,14202,2870,4374
ENSG00000001084.9,1616,9951,6941,2843,4088,4203,787,5080,2318,5136,...,2449,5884,14344,8113,1733,6372,6544,5479,7985,1949
ENSG00000001167.13,332,1090,655,391,890,1725,246,2139,350,560,...,374,1159,1435,1693,3156,1280,1680,2977,732,893


In [86]:
#Dimensions of the data matrix, as stored by data_save
test.size

(60488, 424)