In [13]:
from pyspark.sql import SparkSession, Row
import __credential__
from os import environ
environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.10:0.4.1 pyspark-shell' 
environ['PYSPARK_PYTHON']='/home/ubuntu/anaconda3/bin/python'
environ['PYSPARK_DRIVER_PYTHON']='/home/ubuntu/anaconda3/bin/jupyter'

In [14]:
spark = SparkSession \
    .builder \
    .master(__credential__.spark_host) \
    .appName("meta_info") \
    .getOrCreate()
    

### Read meta data file

In [15]:
meta = spark.read.json('s3a://gdcdata/refs/files.c+r.json', multiLine=True)
meta.printSchema()
meta.createOrReplaceTempView("meta_view")

root
 |-- access: string (nullable = true)
 |-- annotations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotation_id: string (nullable = true)
 |-- cases: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- case_id: string (nullable = true)
 |    |    |-- project: struct (nullable = true)
 |    |    |    |-- project_id: string (nullable = true)
 |-- data_category: string (nullable = true)
 |-- data_format: string (nullable = true)
 |-- file_name: string (nullable = true)
 |-- file_size: long (nullable = true)



In [16]:
# meta_rdd is PairRDD for later joining
# NOTE: if you see `element` in Json schema, just ignore it.
meta_rdd = spark.sql('''SELECT file_name, data_format, \
                        cases.project.project_id, cases.case_id FROM meta_view''') \
            .rdd.map(lambda x : (x.file_name, x))
print(meta_rdd.count())
meta_rdd.take(5)

4763


[('nationwidechildrens.org_clinical.TCGA-AN-A0FZ.xml',
  Row(file_name='nationwidechildrens.org_clinical.TCGA-AN-A0FZ.xml', data_format='BCR XML', project_id=['TCGA-BRCA'], case_id=['8240c4ae-f878-4882-aa07-7857c7ac52cf'])),
 ('148d950b-4202-4f3f-be15-84735cd08a48.htseq.counts.gz',
  Row(file_name='148d950b-4202-4f3f-be15-84735cd08a48.htseq.counts.gz', data_format='TXT', project_id=['TCGA-BRCA'], case_id=['f3cb557d-23e4-4fd1-81ca-db1a3f56d56e'])),
 ('a2233404-f380-4bd6-8770-0d4ba8998fd9.htseq.counts.gz',
  Row(file_name='a2233404-f380-4bd6-8770-0d4ba8998fd9.htseq.counts.gz', data_format='TXT', project_id=['TCGA-BRCA'], case_id=['ee5744c0-a8dc-43f2-abdd-0b5529a3e9fb'])),
 ('b2a6c9e3-65eb-43bd-849a-5829007379ab.FPKM-UQ.txt.gz',
  Row(file_name='b2a6c9e3-65eb-43bd-849a-5829007379ab.FPKM-UQ.txt.gz', data_format='TXT', project_id=['TCGA-BRCA'], case_id=['c0b7b798-3383-4a45-a455-9eca5810739e'])),
 ('99c7b545-c90e-407f-9086-0ef8e108f973.FPKM.txt.gz',
  Row(file_name='99c7b545-c90e-407f-9086-0

### Read manifest data file

In [17]:
manifest = spark.read.format("csv")\
    .option("delimiter","\t").option("quote","")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .load('s3a://gdcdata/refs/gdc_manifest.c+r.txt')
manifest.printSchema()

root
 |-- id: string (nullable = true)
 |-- filename: string (nullable = true)
 |-- md5: string (nullable = true)
 |-- size: integer (nullable = true)
 |-- state: string (nullable = true)



In [18]:
# PairRDD for later joining
manifest_rdd = manifest.rdd \
            .map(lambda x:(x.filename, Row(uuid=x.id, md5=x.md5)))
print(manifest_rdd.count())
manifest_rdd.take(5)

4763


[('nationwidechildrens.org_clinical.TCGA-AN-A0FZ.xml',
  Row(md5='573039b77776cf1f403fd8f5c941e01e', uuid='9669a175-8199-4eff-b589-feac4b09da60')),
 ('148d950b-4202-4f3f-be15-84735cd08a48.htseq.counts.gz',
  Row(md5='55199ed8fa96002d97f24085b5986b4c', uuid='60aec852-6075-4464-a0f3-eee105cdb24f')),
 ('a2233404-f380-4bd6-8770-0d4ba8998fd9.htseq.counts.gz',
  Row(md5='e15567e9a0eafeb5ae9c86c1300f4e17', uuid='8dc57eac-45aa-4afd-8c6d-b2825b287f3d')),
 ('b2a6c9e3-65eb-43bd-849a-5829007379ab.FPKM-UQ.txt.gz',
  Row(md5='6e9416ae62175c9036c1d4e3d76397b1', uuid='a9795a96-c066-467a-9568-bf7d5a4fea95')),
 ('99c7b545-c90e-407f-9086-0ef8e108f973.FPKM.txt.gz',
  Row(md5='2e2a2206664b64d002911cc86d335d2e', uuid='7a02938c-644c-487c-b508-ffc3a6afabe0'))]

### Join meta data and manifest data to obatin full path to files

In [19]:
# Inner join on meta_rdd and manifest_rdd
index_rdd = meta_rdd.join(manifest_rdd)
index_rdd = index_rdd.map(lambda x : Row( \
                            file_name = x[1][1].uuid + '/' + x[1][0].file_name, \
                            data_format = x[1][0].data_format, \
                            project_id = x[1][0].project_id, \
                            case_id = x[1][0].case_id))
print(index_rdd.count())
index_rdd.take(5)

4763


[Row(case_id=['f3cb557d-23e4-4fd1-81ca-db1a3f56d56e'], data_format='TXT', file_name='60aec852-6075-4464-a0f3-eee105cdb24f/148d950b-4202-4f3f-be15-84735cd08a48.htseq.counts.gz', project_id=['TCGA-BRCA']),
 Row(case_id=['ee5744c0-a8dc-43f2-abdd-0b5529a3e9fb'], data_format='TXT', file_name='8dc57eac-45aa-4afd-8c6d-b2825b287f3d/a2233404-f380-4bd6-8770-0d4ba8998fd9.htseq.counts.gz', project_id=['TCGA-BRCA']),
 Row(case_id=['d8492ebd-3d94-4a4b-8de7-386526691cc2'], data_format='TXT', file_name='7a02938c-644c-487c-b508-ffc3a6afabe0/99c7b545-c90e-407f-9086-0ef8e108f973.FPKM.txt.gz', project_id=['TCGA-BRCA']),
 Row(case_id=['ca76c6fc-ff30-4833-884d-3a94fe53ff17'], data_format='TXT', file_name='abe158c1-ba3c-470d-8eeb-ab9867aed874/1c860457-48e6-4a74-a455-e61bc10c6b32.FPKM-UQ.txt.gz', project_id=['TCGA-BRCA']),
 Row(case_id=['a6cb373a-1604-4e25-bd81-da6c3e4f06ee'], data_format='TXT', file_name='bf21b5f9-b475-4c99-977a-9a5e8720f715/e512f861-c59f-4272-b528-7424a882c795.htseq.counts.gz', project_id=[

### Group by data format

In [20]:
index_rdd_groupby = index_rdd.groupBy(lambda x: x.data_format)
index_rdd_groupby.take(5)

[('TXT', <pyspark.resultiterable.ResultIterable at 0x7fdbe6c3a128>),
 ('BCR XML', <pyspark.resultiterable.ResultIterable at 0x7fdbe68f9d68>)]

### Partition and read files

In [25]:
def xml_reader(entry):
    # Test
    # Check module_xml_reader.ipynb for full function
    
    xml_schema = spark.read.format('com.databricks.spark.xml') \
    .options(rowTag='brca:patient').load('s3a://gdcdata/datasets/' + entry.file_name)
    """
    xml_schema_rdd = xml_schema.rdd.map(lambda x:Row( \
                    stage = x['shared_stage:stage_event']['shared_stage:pathologic_stage']._VALUE, \
                    primary_site = x['clin_shared:tumor_tissue_site']._VALUE, \
                    gender = x['shared:gender']._VALUE))

    # If patient ID not in Redshift: create table and dump: patient_id, project_id, primary_site, stage, gender
    # else: update project_id, primary_site, stage, gender
    """
    return 'xml: ' + str(entry.file_name)

def txt_reader(entry):
    # Check module_txt_reader.ipynb for full function
    # Parse txt file - no header, specifying schema
    # Transform to rdd
    # Create Redshift table with name: case_id, key: gene, val: gene expression
    if "FPKM-UQ" not in entry.file_name:
        return
    
    return 'txt: ' + str(entry.file_name)

In [26]:
FormatMapper = {'BCR XML': xml_reader, 'TXT': txt_reader}

def format_partitioner(format_group):
    # Map different group with file format to different file reader
    try:
        map(lambda \
                x:FormatMapper[format_group[0]](x), format_group[1])
        
        # Return for test output
        return list(map(lambda \
                x:FormatMapper[format_group[0]](x), format_group[1]))
    
    except KeyError:
        print("File format not known.")
        return "FORMAT_ERROR"
    
index_rdd_groupby.map(format_partitioner).collect()

Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/insight36k/lib/python3.6/site-packages/pyspark/cloudpickle.py", line 148, in dump
    return Pickler.dump(self, obj)
  File "/home/ubuntu/anaconda3/envs/insight36k/lib/python3.6/pickle.py", line 409, in dump
    self.save(obj)
  File "/home/ubuntu/anaconda3/envs/insight36k/lib/python3.6/pickle.py", line 476, in save
    f(self, obj) # Call unbound method with explicit self
  File "/home/ubuntu/anaconda3/envs/insight36k/lib/python3.6/pickle.py", line 751, in save_tuple
    save(element)
  File "/home/ubuntu/anaconda3/envs/insight36k/lib/python3.6/pickle.py", line 476, in save
    f(self, obj) # Call unbound method with explicit self
  File "/home/ubuntu/anaconda3/envs/insight36k/lib/python3.6/site-packages/pyspark/cloudpickle.py", line 255, in save_function
    self.save_function_tuple(obj)
  File "/home/ubuntu/anaconda3/envs/insight36k/lib/python3.6/site-packages/pyspark/cloudpickle.py", line 292, in save_function_t

PicklingError: Could not serialize object: Exception: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.

### Random Test

In [None]:
a = Row(txt=5, xml=10)

In [None]:
tuple(a)

In [None]:
def f(x):
    s = ""
    for item in x[1]:
        s = s + str(item.case_id) + " "
        return s