# GDC Playground

## GDC API Usage

In [1]:
from gdc_api import GDCSchema

# Build a GDCSchema helper for the query string "secondhand_smoke" and display
# the candidates it returns. In the captured output each candidate is keyed like
# 'exposure::secondhand_smoke_as_child' and carries a 'score' and 'description'.
gdc = GDCSchema("secondhand_smoke")
gdc.get_gdc_candidates()

{'exposure::secondhand_smoke_as_child': {'score': 0.88,
  'description': 'The text term used to indicate whether the patient was exposed to secondhand smoke as a child.'},
 'molecular_test::second_exon': {'score': 0.7565235690235691,
  'description': 'The second exon number involved in molecular variation. If a specific genetic variant is being reported, this property can be used to capture the second exon where that variant is located. This property is typically used for a translocation where two different locations are involved in the variation.'},
 'diagnosis::secondary_gleason_grade': {'score': 0.7294685990338164,
  'description': "The text term used to describe the secondary Gleason score, which describes the pattern of cells making up the second largest area of the tumor. The primary and secondary Gleason pattern grades are combined to determine the patient's Gleason grade group, which is used to determine the aggresiveness of prostate cancer. Note that this grade describes the e

In [2]:
# Look up one candidate key taken from the previous cell's output, then print the
# column type and column values the helper reports.
# NOTE(review): presumably the two getters below reflect the candidate selected
# by get_properties_by_gdc_candidate — confirm against gdc_api.
gdc.get_properties_by_gdc_candidate('exposure::secondhand_smoke_as_child')
print(f"GDC Column type: {gdc.get_gdc_col_type()}")
print(f"GDC Column values: {gdc.get_gdc_col_values()}")

GDC Column type: enum
GDC Column values: ['Yes', 'No', 'Unknown', 'Not Reported']


## Web Requests

In [None]:
import requests
import json

# URL of the submission template for `case`, requested as TSV via the query string.
cases_endpt = 'https://api.gdc.cancer.gov/v0/submission/template/case?format=tsv'

# Reference only: example query parameters. They are NOT sent by the request
# below, which relies solely on the query string embedded in the URL above.
# The 'fields' parameter is passed as a comma-separated string of single names.
# fields = [
#     "submitter_id",
#     "case_id",
#     "primary_site",
#     "disease_type",
#     "diagnoses.vital_status"
#     ]

# fields = ','.join(fields)

# params = {
#     "fields": fields,
#     "format": "TSV",
#     "size": "100"
#     }

# Bound the wait so an unreachable server cannot hang the kernel indefinitely.
response = requests.get(cases_endpt, timeout=30)
# Fail loudly on an HTTP error instead of printing an error body as if it were data.
response.raise_for_status()

# Print decoded text so the TSV's tabs and newlines render, rather than a bytes repr.
print(response.text)

## Python (`gdcdictionary` Package)

In [1]:
from gdcdictionary.python import GDCDictionary, gdcdictionary

In [2]:
# Display the top-level keys of the packaged schema (e.g. 'analyte', 'case',
# 'treatment' in the captured output).
gdcdictionary.schema.keys()

dict_keys(['germline_mutation_calling_workflow', 'copy_number_estimate', 'analyte', 'pathology_detail', 'aligned_reads', 'program', 'other_clinical_attribute', 'read_group_qc', 'aliquot', 'genomic_profile_harmonization_workflow', 'copy_number_segment', 'archive', 'simple_somatic_mutation', 'filtered_copy_number_segment', 'methylation_liftover_workflow', 'protein_expression', 'project', 'case', 'pathology_report', 'root', 'somatic_annotation_workflow', 'structural_variation', 'secondary_expression_analysis', 'submitted_unaligned_reads', 'slide_image', 'data_subtype', 'submitted_tangent_copy_number', 'rna_expression_workflow', 'somatic_copy_number_workflow', 'somatic_mutation_index', 'copy_number_variation_workflow', 'clinical_supplement', 'alignment_workflow', 'gene_expression', 'treatment', 'portion', 'structural_variant_calling_workflow', 'submitted_genotyping_array', 'molecular_test', 'copy_number_liftover_workflow', 'tissue_source_site', 'methylation_array_harmonization_workflow', '

In [3]:
# Print the property names of every schema entry, one dict_keys view per entry.
for key, node_schema in gdcdictionary.schema.items():
    property_names = node_schema['properties'].keys()
    print(property_names)

dict_keys(['workflow_type', 'aligned_reads_files', 'submitted_genotyping_arrays', 'workflow_link', 'workflow_version', 'workflow_start_datetime', 'workflow_end_datetime', 'type', 'id', 'submitter_id', 'batch_id', 'state', 'project_id', 'created_datetime', 'updated_datetime'])
dict_keys(['data_category', 'data_type', 'data_format', 'experimental_strategy', 'platform', 'copy_number_variation_workflows', 'genomic_profile_harmonization_workflows', 'somatic_copy_number_workflows', 'file_name', 'file_size', 'md5sum', 'file_state', 'error_type', 'state_comment', 'type', 'id', 'submitter_id', 'batch_id', 'state', 'project_id', 'created_datetime', 'updated_datetime'])
dict_keys(['a260_a280_ratio', 'amount', 'analyte_quantity', 'analyte_type', 'analyte_type_id', 'analyte_volume', 'concentration', 'experimental_protocol_type', 'normal_tumor_genotype_snp_match', 'ribosomal_rna_28s_16s_ratio', 'rna_integrity_number', 'spectrophotometer_method', 'well_number', 'portions', 'samples', 'type', 'id', 's

In [5]:
# Display the definition of the 'project_id' property under the
# 'mirna_expression_workflow' schema entry (a 'type' plus 'common' metadata in
# the captured output).
gdcdictionary.schema['mirna_expression_workflow']['properties']['project_id']

{'type': 'string',
 'common': {'description': 'Unique ID for any specific defined piece of work that is undertaken or attempted to meet a single requirement.',
  'termDef': {'term': None,
   'source': None,
   'cde_id': None,
   'cde_version': None,
   'term_url': None}}}

## Read From YAML (Silly Solution)

In [8]:
import os

from git import Repo  # pip install gitpython

git_url = "https://github.com/NCI-GDC/gdcdictionary.git"
clone_dir = "gdc/"

# Cloning into an existing, non-empty directory raises an error, which would
# break re-running this cell. Reuse the checkout when one is already present.
if os.path.isdir(os.path.join(clone_dir, ".git")):
    repo = Repo(clone_dir)
else:
    repo = Repo.clone_from(git_url, clone_dir)

# Last expression: display the Repo object as the cell output.
repo

<git.repo.base.Repo '/home/yfw215/alpha-automl/examples/gdc/.git'>

In [50]:
import yaml
import os

# Directory inside the cloned repository that holds the schema files.
gdc_dictionary_path = "gdc/src/gdcdictionary/schemas"
files = os.listdir(gdc_dictionary_path)

# Keep only the YAML files from the directory listing.
yamls = list(filter(lambda name: name.endswith(".yaml"), files))

# Parse the demographic schema with the safe loader.
demographic_path = os.path.join(gdc_dictionary_path, 'demographic.yaml')
with open(demographic_path, 'r') as file:
    prime_service = yaml.safe_load(file)

# Last expression: display the parsed schema as the cell output.
prime_service

{'$schema': 'http://json-schema.org/draft-04/schema#',
 'id': 'demographic',
 'title': 'Demographic',
 'type': 'object',
 'namespace': 'https://gdc.cancer.gov',
 'category': 'clinical',
 'project': '*',
 'program': '*',
 'description': 'Data for the characterization of the patient by means of segmenting the population (e.g., characterization by age, sex, or race).',
 'additionalProperties': False,
 'submittable': True,
 'downloadable': False,
 'previous_version_downloadable': False,
 'validators': None,
 'systemProperties': ['id',
  'project_id',
  'created_datetime',
  'updated_datetime',
  'state'],
 'links': [{'name': 'cases',
   'backref': 'demographics',
   'label': 'describes',
   'target_type': 'case',
   'multiplicity': 'one_to_one',
   'required': True}],
 'required': ['submitter_id', 'ethnicity', 'gender', 'race', 'vital_status'],
 'uniqueKeys': [['id'], ['project_id', 'submitter_id']],
 'deprecated': ['premature_at_birth', 'weeks_gestation_at_birth'],
 'properties': {'$ref':

In [36]:
# Display the property names defined in the loaded demographic schema.
prime_service['properties'].keys()

dict_keys(['$ref', 'age_at_index', 'age_is_obfuscated', 'cause_of_death', 'cause_of_death_source', 'country_of_birth', 'country_of_residence_at_enrollment', 'days_to_birth', 'days_to_death', 'education_level', 'ethnicity', 'gender', 'marital_status', 'occupation_duration_years', 'premature_at_birth', 'race', 'vital_status', 'weeks_gestation_at_birth', 'year_of_birth', 'year_of_death', 'cases'])

In [51]:
# Property definitions from the loaded demographic schema; passed to
# process_properties below.
properties = prime_service['properties']

def process_properties(properties):
    """Reduce schema property definitions to their type/enum information.

    For each entry in ``properties``:

    * if the definition has a ``'type'`` key, keep a copy of the definition
      with any ``'$ref'`` key removed;
    * otherwise, if it has an ``'enum'`` key, keep
      ``{'type': 'enum', 'enum': <values>}``;
    * otherwise (including values that are not dicts, such as a top-level
      ``'$ref'`` entry), skip it.

    The input is never modified, so the function can be called repeatedly on
    the same data and always returns the same result.

    Args:
        properties: Mapping of property name to its definition.

    Returns:
        dict: Property name -> filtered definition, in input order.
    """
    filtered_properties = {}
    for key, definition in properties.items():
        # Non-dict values carry no type/enum information; a plain `in` test on
        # a string would be a substring check, so rule them out explicitly.
        if not isinstance(definition, dict):
            continue

        if 'type' in definition:
            # Build a new dict rather than deleting from the caller's data, and
            # tolerate definitions that have no '$ref' at all.
            filtered_dict = {
                name: value
                for name, value in definition.items()
                if name != '$ref'
            }
        elif 'enum' in definition:
            filtered_dict = {'type': 'enum', 'enum': definition['enum']}
        else:
            continue
        filtered_properties[key] = filtered_dict

    return filtered_properties

# Display the filtered demographic properties as the cell output.
process_properties(properties)

{'age_is_obfuscated': {'type': 'boolean'},
 'cause_of_death': {'type': 'enum',
  'enum': ['Cancer Related',
   'Cardiovascular Disorder, NOS',
   'End-stage Renal Disease',
   'Infection',
   'Not Cancer Related',
   'Renal Disorder, NOS',
   'Spinal Muscular Atrophy',
   'Surgical Complications',
   'Toxicity',
   'Unknown',
   'Not Reported']},
 'cause_of_death_source': {'type': 'enum',
  'enum': ['Autopsy',
   'Death Certificate',
   'Medical Record',
   'Social Security Death Index',
   'Unknown',
   'Not Reported']},
 'country_of_birth': {'type': 'enum',
  'enum': ['Afghanistan',
   'Albania',
   'Algeria',
   'Andorra',
   'Angola',
   'Anguilla',
   'Antigua and Barbuda',
   'Argentina',
   'Armenia',
   'Aruba',
   'Australia',
   'Austria',
   'Azerbaijan',
   'Bahamas',
   'Bahrain',
   'Bangladesh',
   'Barbados',
   'Belarus',
   'Belgium',
   'Belize',
   'Benin',
   'Bermuda',
   'Bhutan',
   'Bolivia',
   'Bosnia and Herzegovina',
   'Botswana',
   'Brazil',
   'Brunei',