# GDC Playground

## GDC API Usage

### Explore ENUM Sync

In [1]:
from gdc_api import GDCSchema
import pprint

# Schema helper created without a target column name.
gdc = GDCSchema()

# Enum definitions of the selected subschemas, keyed by property name.
enums = gdc.extract_enums(
    subschemas=[
        'demographic',
        'diagnosis',
        'sample',
    ],
)

# Display only the property names.
pprint.pp([*enums.keys()])

['biospecimen_anatomic_site',
 'biospecimen_laterality',
 'composition',
 'diagnosis_pathologically_confirmed',
 'distance_normal_to_tumor',
 'method_of_sample_procurement',
 'preservation_method',
 'sample_type',
 'sample_type_id',
 'specimen_type',
 'tissue_collection_type',
 'tissue_type',
 'tumor_code',
 'tumor_code_id',
 'tumor_descriptor',
 'adrenal_hormone',
 'ajcc_clinical_m',
 'ajcc_clinical_n',
 'ajcc_clinical_stage',
 'ajcc_clinical_t',
 'ajcc_pathologic_m',
 'ajcc_pathologic_n',
 'ajcc_pathologic_stage',
 'ajcc_pathologic_t',
 'ajcc_staging_system_edition',
 'ann_arbor_b_symptoms',
 'ann_arbor_b_symptoms_described',
 'ann_arbor_clinical_stage',
 'ann_arbor_extranodal_involvement',
 'ann_arbor_pathologic_stage',
 'best_overall_response',
 'burkitt_lymphoma_clinical_variant',
 'cancer_detection_method',
 'child_pugh_classification',
 'clark_level',
 'classification_of_tumor',
 'cog_liver_stage',
 'cog_neuroblastoma_risk_group',
 'cog_renal_stage',
 'cog_rhabdomyosarcoma_risk_

In [2]:
import pandas as pd

dou_groundtruth = pd.read_csv("../data/table-matching-ground-truth/ground-truth/Dou.csv")
dou = pd.read_excel("../data/datalake/Dou.xlsx", sheet_name="UCEC_CPTAC3_meta_table_V2.1")
# Keep only the columns listed in the ground-truth mapping.
dou = dou[dou_groundtruth["original_paper_variable_names"].to_list()]

# Distinct values per column, as lower-cased strings with the 'nan' placeholder removed.
# Built column by column instead of `dou.apply(pd.unique).to_dict()`: when every column
# happens to have the same number of distinct values, `apply` returns a DataFrame and
# `to_dict()` then yields {column: {row_label: value}}, so iterating a "value list"
# would walk row labels instead of values.
# Sorting makes the value order reproducible across kernel restarts.
dou_enums = {
    column: sorted({str(value).lower() for value in series.unique()} - {"nan"})
    for column, series in dou.items()
}


In [4]:
import jellyfish
from fuzzywuzzy import process

# choices = enums["ajcc_pathologic_t"]
# for value in dou_enums["Path_Stage_Primary_Tumor-pT"]:
#     print(f"{value}: {process.extract(value, choices, limit=5)}")

def compute_fuzzy_score(values, choices):
    """Average best fuzzy-match score of `values` against `choices`.

    For each value, the best match among `choices` is looked up with
    `process.extractOne` and its score (element 1 of the returned tuple) is
    accumulated; the mean over all values is returned.

    Args:
        values: Sized collection of strings to match.
        choices: Sized collection of candidate strings to match against.

    Returns:
        Mean of the per-value best scores, or 0.0 when `values` or `choices`
        is empty (there is nothing to compare).
    """
    # An all-missing column yields no values; avoid dividing by zero.
    if len(values) == 0 or len(choices) == 0:
        return 0.0
    score = 0
    for value in values:
        best = process.extractOne(value, choices)
        # `extractOne` may return None when no match is available; count it as 0.
        if best is not None:
            score += best[1]
    return score / len(values)

def compute_jaro_score(values, choices):
    """Average best Jaro-Winkler similarity of `values` against `choices`.

    For each value, the highest similarity to any choice is accumulated; the
    mean over all values is returned.

    Args:
        values: Sized collection of strings to match.
        choices: Sized collection of candidate strings to match against.

    Returns:
        Mean of the per-value best similarities, or 0.0 when `values` or
        `choices` is empty.
    """
    # Avoid ZeroDivisionError (no values) and max() on an empty sequence (no choices).
    if len(values) == 0 or len(choices) == 0:
        return 0.0
    # Newer jellyfish releases expose this metric as `jaro_winkler_similarity`
    # (matching the `jaro_similarity` name used elsewhere in this notebook);
    # fall back to the legacy `jaro_winkler` name on older releases.
    similarity = getattr(jellyfish, "jaro_winkler_similarity", None) or jellyfish.jaro_winkler
    score = 0
    for value in values:
        score += max(similarity(value, choice) for choice in choices)
    return score / len(values)

for name, values in dou_enums.items():
    # GDC property name(s) that the ground truth pairs with this source column.
    gdc_name = dou_groundtruth.loc[
        dou_groundtruth["original_paper_variable_names"] == name,
        "GDC_format_variable_names",
    ].values
    # Only columns whose GDC counterpart has extracted enum values are scored.
    if gdc_name[0] not in enums:
        continue
    choices = enums[gdc_name[0]]
    print(f"{name}: {compute_fuzzy_score(values, choices)}")


Country: 90.0
Histologic_Grade_FIGO: 86.0
Histologic_type: 91.25
Path_Stage_Primary_Tumor-pT: 90.0
Path_Stage_Reg_Lymph_Nodes-pN: 90.0
Clin_Stage_Dist_Mets-cM: 73.33333333333333
Path_Stage_Dist_Mets-pM: 58.333333333333336
tumor_Stage-Pathological: 100.0
FIGO_stage: 90.0
Race: 100.0
Ethnicity: 100.0
Gender: 100.0
Tumor_Site: 88.66666666666667
Tumor_Focality: 100.0


In [8]:
import jellyfish

# Source column under inspection and its distinct values.
col_name = "Path_Stage_Primary_Tumor-pT"
values = dou_enums["Path_Stage_Primary_Tumor-pT"]
scores = {}

# Score every GDC enum property by value overlap ("fuzzy") and by name similarity ("name").
for candidate, choices in enums.items():
    name_score = jellyfish.jaro_similarity(col_name, candidate)
    scores[candidate] = dict(
        fuzzy=compute_fuzzy_score(values, choices),
        name=name_score,
    )

# Re-key the dict so candidates appear from highest to lowest "fuzzy" score.
scores = dict(sorted(scores.items(), key=lambda entry: entry[1]["fuzzy"], reverse=True))

pprint.pp(scores)


{'ajcc_clinical_t': {'fuzzy': 90.0, 'name': 0.4851851851851852},
 'ajcc_pathologic_t': {'fuzzy': 90.0, 'name': 0.5195352214960058},
 'ensat_pathologic_t': {'fuzzy': 90.0, 'name': 0.5516273849607183},
 'uicc_clinical_t': {'fuzzy': 90.0, 'name': 0.3728395061728395},
 'uicc_pathologic_t': {'fuzzy': 90.0, 'name': 0.47276688453159044},
 'ajcc_clinical_stage': {'fuzzy': 86.0, 'name': 0.49122807017543857},
 'ajcc_pathologic_stage': {'fuzzy': 86.0, 'name': 0.5608465608465608},
 'figo_stage': {'fuzzy': 86.0, 'name': 0.4962962962962963},
 'irs_group': {'fuzzy': 86.0, 'name': 0.44753086419753085},
 'uicc_clinical_stage': {'fuzzy': 86.0, 'name': 0.4474496426250812},
 'uicc_pathologic_stage': {'fuzzy': 86.0, 'name': 0.4922238255571589},
 'clark_level': {'fuzzy': 75.0, 'name': 0.4205948372615039},
 'iss_stage': {'fuzzy': 75.0, 'name': 0.5135802469135803},
 'primary_diagnosis': {'fuzzy': 66.83333333333333, 'name': 0.5195352214960058},
 'ajcc_clinical_n': {'fuzzy': 63.333333333333336, 'name': 0.439506

### Search by column name

In [1]:
from gdc_api import GDCSchema

# Search the GDC schema using the column name "Disease".
gdc = GDCSchema("Disease")
# Displayed result: candidates keyed as "<subschema>::<property>", each with a
# similarity score and a description (see the output below).
gdc.get_gdc_candidates()

{'project::disease_type': {'score': 0.7857142857142857,
  'description': 'Full name for the project'},
 'case::disease_type': {'score': 0.7857142857142857,
  'description': "The text term used to describe the type of malignant disease, as categorized by the World Health Organization's (WHO) International Classification of Diseases for Oncology (ICD-O)."},
 'follow_up::disease_response': {'score': 0.7440476190476191,
  'description': "Code assigned to describe the patient's response or outcome to the disease."}}

In [2]:
# Presumably selects the candidate whose details the getters below report;
# confirm against gdc_api.
gdc.get_properties_by_gdc_candidate('case::disease_type')
# Print the candidate's column type and its values.
print(f"GDC Column type: {gdc.get_gdc_col_type()}")
print(f"GDC Column values: {gdc.get_gdc_col_values()}")

GDC Column type: enum
GDC Column values: ['Acinar Cell Neoplasms', 'Adenomas and Adenocarcinomas', 'Adnexal and Skin Appendage Neoplasms', 'Basal Cell Neoplasms', 'Blood Vessel Tumors', 'Chronic Myeloproliferative Disorders', 'Complex Epithelial Neoplasms', 'Complex Mixed and Stromal Neoplasms', 'Cystic, Mucinous and Serous Neoplasms', 'Ductal and Lobular Neoplasms', 'Epithelial Neoplasms, NOS', 'Fibroepithelial Neoplasms', 'Fibromatous Neoplasms', 'Germ Cell Neoplasms', 'Giant Cell Tumors', 'Gliomas', 'Granular Cell Tumors and Alveolar Soft Part Sarcomas', 'Hodgkin Lymphoma', 'Immunoproliferative Diseases', 'Leukemias, NOS', 'Lipomatous Neoplasms', 'Lymphatic Vessel Tumors', 'Lymphoid Leukemias', 'Malignant Lymphomas, NOS or Diffuse', 'Mast Cell Tumors', 'Mature B-Cell Lymphomas', 'Mature T- and NK-Cell Lymphomas', 'Meningiomas', 'Mesonephromas', 'Mesothelial Neoplasms', 'Miscellaneous Bone Tumors', 'Miscellaneous Tumors', 'Mucoepidermoid Neoplasms', 'Myelodysplastic Syndromes', 'Myel

### Search by column name and subschema name

In [3]:
from gdc_api import GDCSchema

# Same column-name search, restricted to the "follow_up" subschema.
gdc = GDCSchema("Disease", subschema="follow_up")
# Displayed result: only candidates from the requested subschema (see the output below).
gdc.get_gdc_candidates()

{'follow_up::disease_response': {'score': 0.7440476190476191,
  'description': "Code assigned to describe the patient's response or outcome to the disease."}}

### Parse dataframe

In [4]:
import pandas as pd
from gdc_api import GDCSchema

# Load the extracted Dou UCEC CPTAC3 metadata table.
dataset = pd.read_csv('../data/extracted-tables/Dou_UCEC_CPTAC3_meta_table_V2.1.csv')

gdc = GDCSchema()
# Displayed result maps each dataframe column to a dict holding a GDC candidate, its type
# and its values; some columns map to an empty dict (presumably no match; confirm in gdc_api).
gdc.parse_df(dataset)

{'idx': {'candidate': 'germline_mutation_calling_workflow::id',
  'type': 'string',
  'values': None},
 'Proteomics_Participant_ID': {'candidate': 'pathology_detail::sarcomatoid_percent',
  'type': 'number',
  'values': (0, 100)},
 'Case_excluded': {},
 'Proteomics_TMT_batch': {},
 'Proteomics_TMT_plex': {},
 'Proteomics_TMT_channel': {},
 'Proteomics_Parent_Sample_IDs': {'candidate': 'pathology_detail::sarcomatoid_present',
  'type': 'enum',
  'values': ['Yes', 'No', 'Not Reported']},
 'Proteomics_Aliquot_ID': {},
 'Proteomics_Tumor_Normal': {},
 'Proteomics_OCT': {'candidate': 'data_release::roots',
  'type': None,
  'values': None},
 'Country': {'candidate': 'analyte::amount',
  'type': 'number',
  'values': (0, inf)},
 'Histologic_Grade_FIGO': {'candidate': 'pathology_detail::histologic_progression_type',
  'type': 'enum',
  'values': ['Anaplastic',
   'Poorly differentiated',
   'Unknown',
   'Not Reported']},
 'Myometrial_invasion_Specify': {'candidate': 'pathology_detail::lympha

## Web Requests

In [None]:
import requests
import json

# Submission template for the `case` node, requested in TSV format.
cases_endpt = 'https://api.gdc.cancer.gov/v0/submission/template/case?format=tsv'

# Currently unused: example query parameters, kept for reference.
# The 'fields' parameter is passed as a comma-separated string of single names.
# fields = [
#     "submitter_id",
#     "case_id",
#     "primary_site",
#     "disease_type",
#     "diagnoses.vital_status"
#     ]

# fields = ','.join(fields)

# params = {
#     "fields": fields,
#     "format": "TSV",
#     "size": "100"
#     }

# A timeout keeps the cell from hanging indefinitely if the API is unreachable.
response = requests.get(cases_endpt, timeout=30)
# Fail loudly on an HTTP error instead of printing an error body as if it were the template.
response.raise_for_status()

# TSV is text: print the decoded body rather than the raw bytes repr.
print(response.text)

## Python (`gdcdictionary` package)

In [1]:
from gdcdictionary.python import GDCDictionary, gdcdictionary

In [2]:
# Names of the top-level entries in the dictionary schema.
gdcdictionary.schema.keys()

dict_keys(['germline_mutation_calling_workflow', 'copy_number_estimate', 'analyte', 'pathology_detail', 'aligned_reads', 'program', 'other_clinical_attribute', 'read_group_qc', 'aliquot', 'genomic_profile_harmonization_workflow', 'copy_number_segment', 'archive', 'simple_somatic_mutation', 'filtered_copy_number_segment', 'methylation_liftover_workflow', 'protein_expression', 'project', 'case', 'pathology_report', 'root', 'somatic_annotation_workflow', 'structural_variation', 'secondary_expression_analysis', 'submitted_unaligned_reads', 'slide_image', 'data_subtype', 'submitted_tangent_copy_number', 'rna_expression_workflow', 'somatic_copy_number_workflow', 'somatic_mutation_index', 'copy_number_variation_workflow', 'clinical_supplement', 'alignment_workflow', 'gene_expression', 'treatment', 'portion', 'structural_variant_calling_workflow', 'submitted_genotyping_array', 'molecular_test', 'copy_number_liftover_workflow', 'tissue_source_site', 'methylation_array_harmonization_workflow', '

In [3]:
# Print the property names of every schema entry, one `dict_keys` line per entry.
for key in gdcdictionary.schema.keys():
    print(gdcdictionary.schema[key]['properties'].keys())

dict_keys(['workflow_type', 'aligned_reads_files', 'submitted_genotyping_arrays', 'workflow_link', 'workflow_version', 'workflow_start_datetime', 'workflow_end_datetime', 'type', 'id', 'submitter_id', 'batch_id', 'state', 'project_id', 'created_datetime', 'updated_datetime'])
dict_keys(['data_category', 'data_type', 'data_format', 'experimental_strategy', 'platform', 'copy_number_variation_workflows', 'genomic_profile_harmonization_workflows', 'somatic_copy_number_workflows', 'file_name', 'file_size', 'md5sum', 'file_state', 'error_type', 'state_comment', 'type', 'id', 'submitter_id', 'batch_id', 'state', 'project_id', 'created_datetime', 'updated_datetime'])
dict_keys(['a260_a280_ratio', 'amount', 'analyte_quantity', 'analyte_type', 'analyte_type_id', 'analyte_volume', 'concentration', 'experimental_protocol_type', 'normal_tumor_genotype_snp_match', 'ribosomal_rna_28s_16s_ratio', 'rna_integrity_number', 'spectrophotometer_method', 'well_number', 'portions', 'samples', 'type', 'id', 's

In [5]:
# Inspect one property definition: `project_id` of the `mirna_expression_workflow` entry.
gdcdictionary.schema['mirna_expression_workflow']['properties']['project_id']

{'type': 'string',
 'common': {'description': 'Unique ID for any specific defined piece of work that is undertaken or attempted to meet a single requirement.',
  'termDef': {'term': None,
   'source': None,
   'cde_id': None,
   'cde_version': None,
   'term_url': None}}}

## Read from YAML (Silly Solution)

In [8]:
from git import Repo  # pip install gitpython

import os

git_url = "https://github.com/NCI-GDC/gdcdictionary.git"
gdc_repo_dir = "gdc/"

# `clone_from` fails when the target directory already exists and is not empty,
# so re-running this cell used to raise. Reuse the checkout when one is present.
if os.path.isdir(os.path.join(gdc_repo_dir, ".git")):
    gdc_repo = Repo(gdc_repo_dir)
else:
    gdc_repo = Repo.clone_from(git_url, gdc_repo_dir)

# Last expression: display the repository object as the cell result.
gdc_repo

<git.repo.base.Repo '/home/yfw215/alpha-automl/examples/gdc/.git'>

In [50]:
import yaml
import os

# Directory of schema files inside the cloned repository.
gdc_dictionary_path = "gdc/src/gdcdictionary/schemas"
files = os.listdir(gdc_dictionary_path)

# Names of the YAML files found in that directory.
yamls = list(filter(lambda fi: fi.endswith(".yaml"), files))

# Parse the demographic schema into plain Python containers.
with open(os.path.join(gdc_dictionary_path, 'demographic.yaml'), 'r') as file:
    prime_service = yaml.safe_load(file)

# Last expression: display the parsed schema.
prime_service

{'$schema': 'http://json-schema.org/draft-04/schema#',
 'id': 'demographic',
 'title': 'Demographic',
 'type': 'object',
 'namespace': 'https://gdc.cancer.gov',
 'category': 'clinical',
 'project': '*',
 'program': '*',
 'description': 'Data for the characterization of the patient by means of segmenting the population (e.g., characterization by age, sex, or race).',
 'additionalProperties': False,
 'submittable': True,
 'downloadable': False,
 'previous_version_downloadable': False,
 'validators': None,
 'systemProperties': ['id',
  'project_id',
  'created_datetime',
  'updated_datetime',
  'state'],
 'links': [{'name': 'cases',
   'backref': 'demographics',
   'label': 'describes',
   'target_type': 'case',
   'multiplicity': 'one_to_one',
   'required': True}],
 'required': ['submitter_id', 'ethnicity', 'gender', 'race', 'vital_status'],
 'uniqueKeys': [['id'], ['project_id', 'submitter_id']],
 'deprecated': ['premature_at_birth', 'weeks_gestation_at_birth'],
 'properties': {'$ref':

In [36]:
# Property names defined by the demographic schema (includes the top-level '$ref' entry).
prime_service['properties'].keys()

dict_keys(['$ref', 'age_at_index', 'age_is_obfuscated', 'cause_of_death', 'cause_of_death_source', 'country_of_birth', 'country_of_residence_at_enrollment', 'days_to_birth', 'days_to_death', 'education_level', 'ethnicity', 'gender', 'marital_status', 'occupation_duration_years', 'premature_at_birth', 'race', 'vital_status', 'weeks_gestation_at_birth', 'year_of_birth', 'year_of_death', 'cases'])

In [51]:
# Property definitions of the parsed demographic schema; input to `process_properties`.
properties = prime_service['properties']

def process_properties(properties):
    """Reduce a schema `properties` mapping to per-property type/enum summaries.

    Args:
        properties: Mapping of property name to its definition. Definitions
            that are not dicts (such as the top-level '$ref' string) are
            skipped.

    Returns:
        A new dict keyed by property name. A definition with a 'type' key is
        copied without its '$ref' entry; otherwise a definition with an
        'enum' key becomes {'type': 'enum', 'enum': <values>}. Definitions
        with neither key are omitted. The input mapping is not modified.
    """
    filtered_properties = {}
    for key, spec in properties.items():
        # Skip non-dict entries explicitly instead of relying on substring
        # tests against a string value.
        if not isinstance(spec, dict):
            continue
        if 'type' in spec:
            # Build a copy without '$ref'. Deleting it from the caller's dict
            # made a second call raise KeyError, as did a typed property that
            # never had a '$ref'.
            filtered_dict = {name: value for name, value in spec.items() if name != '$ref'}
        elif 'enum' in spec:
            filtered_dict = {'type': 'enum', 'enum': spec['enum']}
        else:
            continue
        filtered_properties[key] = filtered_dict

    return filtered_properties

# Summarize the demographic properties; the returned dict is the cell's displayed result.
process_properties(properties)        

{'age_is_obfuscated': {'type': 'boolean'},
 'cause_of_death': {'type': 'enum',
  'enum': ['Cancer Related',
   'Cardiovascular Disorder, NOS',
   'End-stage Renal Disease',
   'Infection',
   'Not Cancer Related',
   'Renal Disorder, NOS',
   'Spinal Muscular Atrophy',
   'Surgical Complications',
   'Toxicity',
   'Unknown',
   'Not Reported']},
 'cause_of_death_source': {'type': 'enum',
  'enum': ['Autopsy',
   'Death Certificate',
   'Medical Record',
   'Social Security Death Index',
   'Unknown',
   'Not Reported']},
 'country_of_birth': {'type': 'enum',
  'enum': ['Afghanistan',
   'Albania',
   'Algeria',
   'Andorra',
   'Angola',
   'Anguilla',
   'Antigua and Barbuda',
   'Argentina',
   'Armenia',
   'Aruba',
   'Australia',
   'Austria',
   'Azerbaijan',
   'Bahamas',
   'Bahrain',
   'Bangladesh',
   'Barbados',
   'Belarus',
   'Belgium',
   'Belize',
   'Benin',
   'Bermuda',
   'Bhutan',
   'Bolivia',
   'Bosnia and Herzegovina',
   'Botswana',
   'Brazil',
   'Brunei',