# GDC Playground

## GDC API Usages

### Explore ENUM Sync

In [2]:
from gdc_candidate_matcher import GDCCandidateMatcher
from gdc_scoring_interface import JaroScore
import pprint

# Create a candidate matcher
matcher = GDCCandidateMatcher(subschemas=['demographic', 'diagnosis', 'sample'])

# Add a scorer
matcher.add_scorer(JaroScore)

enums = matcher.extract_enums()

pprint.pp(list(enums.keys()))

['sample::biospecimen_anatomic_site',
 'sample::biospecimen_laterality',
 'sample::composition',
 'sample::diagnosis_pathologically_confirmed',
 'sample::distance_normal_to_tumor',
 'sample::method_of_sample_procurement',
 'sample::preservation_method',
 'sample::sample_type',
 'sample::sample_type_id',
 'sample::specimen_type',
 'sample::tissue_collection_type',
 'sample::tissue_type',
 'sample::tumor_code',
 'sample::tumor_code_id',
 'sample::tumor_descriptor',
 'diagnosis::adrenal_hormone',
 'diagnosis::ajcc_clinical_m',
 'diagnosis::ajcc_clinical_n',
 'diagnosis::ajcc_clinical_stage',
 'diagnosis::ajcc_clinical_t',
 'diagnosis::ajcc_pathologic_m',
 'diagnosis::ajcc_pathologic_n',
 'diagnosis::ajcc_pathologic_stage',
 'diagnosis::ajcc_pathologic_t',
 'diagnosis::ajcc_staging_system_edition',
 'diagnosis::ann_arbor_b_symptoms',
 'diagnosis::ann_arbor_b_symptoms_described',
 'diagnosis::ann_arbor_clinical_stage',
 'diagnosis::ann_arbor_extranodal_involvement',
 'diagnosis::ann_arbor_p

In [18]:
gdc = GDCSchema()

# print(len(gdc.schema.keys()))
# print(len(set(gdc.schema.keys())))

cols = 0
for s in gdc.schema.keys():
    v = gdc.schema[s]
    cols += len(v.keys())

print(cols)

1587


In [3]:
import pandas as pd

dou_groundtruth = pd.read_csv("../data/table-matching-ground-truth/ground-truth/Dou.csv")
dou = pd.read_excel("../data/datalake/Dou.xlsx", sheet_name="UCEC_CPTAC3_meta_table_V2.1")
dou = dou[dou_groundtruth["original_paper_variable_names"].to_list()]

dou_enums = matcher.extract_df_enums(dou)
dou_enums


{'Proteomics_Participant_ID': ['c3l-00356',
  'c3n-01825',
  'c3l-00947',
  'c3l-00949',
  'c3l-00008',
  'c3n-00388',
  'c3n-00389',
  'c3l-01282',
  'c3l-01284',
  'c3n-00151',
  'c3l-00963',
  'c3l-00938',
  'nx14',
  'c3l-00006',
  'c3l-01304',
  'c3n-00377',
  'c3l-00098',
  'c3l-00767',
  'c3n-01510',
  'c3l-01275',
  'c3n-00321',
  'c3n-00323',
  'c3n-00383',
  'c3l-00137',
  'c3n-00334',
  'c3n-01349',
  'nx18',
  'c3l-00586',
  'nx7',
  'c3n-01212',
  'nx8',
  'nx10',
  'c3n-00847',
  'c3l-00449',
  'c3l-00156',
  'c3n-00322',
  'c3l-00145',
  'nx5',
  'c3n-00333',
  'nx12',
  'c3l-01249',
  'c3n-01825_replication',
  'c3l-00942',
  'c3l-00780',
  'c3l-00946',
  'c3n-00850',
  'c3n-00337',
  'c3l-00781',
  'nx3',
  'nx16',
  'c3l-00932',
  'c3n-01001',
  'c3n-00379',
  'nx6',
  'c3l-00921',
  'c3l-00770',
  'c3n-00340',
  'c3l-00090',
  'c3l-01744',
  'c3n-00324',
  'c3n-01537',
  'c3l-00961',
  'c3l-01252',
  'c3l-01247',
  'c3n-00743',
  'c3l-00918',
  'c3l-01246',
  'c3n-01

### Compute the Embedded Score (Jaro on both unique values and column names)

In [4]:
embedded_scores = matcher.compute_embedded_col_values_name_score(dou_enums)
pprint.pp(embedded_scores["Path_Stage_Primary_Tumor-pT"])

{'diagnosis::ajcc_pathologic_t': {'jaro-values': 0.7291157916157917,
                                  'jaro-name': 0.47583774250440913},
 'diagnosis::uicc_pathologic_t': {'jaro-values': 0.7291157916157917,
                                  'jaro-name': 0.44250440917107586},
 'diagnosis::ajcc_clinical_t': {'jaro-values': 0.7291157916157917,
                                'jaro-name': 0.41425858092524753},
 'diagnosis::uicc_clinical_t': {'jaro-values': 0.7291157916157917,
                                'jaro-name': 0.36663953330619997},
 'diagnosis::ensat_pathologic_t': {'jaro-values': 0.7142348392348392,
                                   'jaro-name': 0.508301404853129},
 'diagnosis::tumor_regression_grade': {'jaro-values': 0.690450752950753,
                                       'jaro-name': 0.5047298380631714},
 'diagnosis::weiss_assessment_score': {'jaro-values': 0.690450752950753,
                                       'jaro-name': 0.42446689113355784},
 'diagnosis::irs_stage': 

In [5]:
matcher.parse_df(dou)

{'Proteomics_Participant_ID': [{'candidate': 'diagnosis::child_pugh_classification',
   'type': 'enum',
   'values': ['A',
    'A5',
    'A6',
    'B',
    'B7',
    'B8',
    'B9',
    'C',
    'C10',
    'C11',
    'C12',
    'Unknown',
    'Not Reported'],
   'jaro-values': 0.6812061771411374,
   'jaro-name': 0.5067724867724868}],
 'Country': [{'candidate': 'demographic::country_of_birth',
   'type': 'enum',
   'values': ['Afghanistan',
    'Albania',
    'Algeria',
    'Andorra',
    'Angola',
    'Anguilla',
    'Antigua and Barbuda',
    'Argentina',
    'Armenia',
    'Aruba',
    'Australia',
    'Austria',
    'Azerbaijan',
    'Bahamas',
    'Bahrain',
    'Bangladesh',
    'Barbados',
    'Belarus',
    'Belgium',
    'Belize',
    'Benin',
    'Bermuda',
    'Bhutan',
    'Bolivia',
    'Bosnia and Herzegovina',
    'Botswana',
    'Brazil',
    'Brunei',
    'Bulgaria',
    'Burkina Faso',
    'Burundi',
    'Cambodia',
    'Cameroon',
    'Canada',
    'Cape Verde',
    '

In [6]:
for key, scores in embedded_scores.items():
    print(f"{key}: {list(scores.keys())[0]}")

Proteomics_Participant_ID: diagnosis::child_pugh_classification
Country: demographic::country_of_birth
Histologic_Grade_FIGO: diagnosis::tumor_grade
Histologic_type: diagnosis::primary_diagnosis
Path_Stage_Primary_Tumor-pT: diagnosis::ajcc_pathologic_t
Path_Stage_Reg_Lymph_Nodes-pN: diagnosis::primary_diagnosis
Clin_Stage_Dist_Mets-cM: diagnosis::child_pugh_classification
Path_Stage_Dist_Mets-pM: diagnosis::primary_diagnosis
tumor_Stage-Pathological: diagnosis::ensat_pathologic_stage
FIGO_stage: diagnosis::clark_level
BMI: diagnosis::weiss_assessment_score
Age: diagnosis::weiss_assessment_score
Race: demographic::race
Ethnicity: demographic::ethnicity
Gender: demographic::gender
Tumor_Site: diagnosis::tissue_or_organ_of_origin
Tumor_Focality: diagnosis::tumor_focality
Tumor_Size_cm: diagnosis::weiss_assessment_score


### FuzzyWuzzy score (alternative)

In [7]:
from fuzzywuzzy import process

# choices = enums["ajcc_pathologic_t"]
# for value in dou_enums["Path_Stage_Primary_Tumor-pT"]:
#     print(f"{value}: {process.extract(value, choices, limit=5)}")

def compute_fuzzy_score(values, choices):
    score = 0
    for value in values:
        score += process.extractOne(value, choices)[1]
    return score / len(values)

for name, values in dou_enums.items():
    gdc_name = dou_groundtruth[dou_groundtruth["original_paper_variable_names"] == name]["GDC_format_variable_names"].values
    if gdc_name[0] in enums:
        choices = enums[gdc_name[0]]
    else:
        continue
    print(f"{name}: {compute_fuzzy_score(values, choices)}")


### Search by column name

In [8]:
from gdc_api import GDCSchema

gdc = GDCSchema("Disease")
gdc.get_gdc_candidates()

{'project::disease_type': {'score': 0.7857142857142857,
  'description': 'Full name for the project'},
 'case::disease_type': {'score': 0.7857142857142857,
  'description': "The text term used to describe the type of malignant disease, as categorized by the World Health Organization's (WHO) International Classification of Diseases for Oncology (ICD-O)."},
 'follow_up::disease_response': {'score': 0.7440476190476191,
  'description': "Code assigned to describe the patient's response or outcome to the disease."}}

In [9]:
gdc.get_properties_by_gdc_candidate('case::disease_type')
print(f"GDC Column type: {gdc.get_gdc_col_type()}")
print(f"GDC Column values: {gdc.get_gdc_col_values()}")

GDC Column type: enum
GDC Column values: ['Acinar Cell Neoplasms', 'Adenomas and Adenocarcinomas', 'Adnexal and Skin Appendage Neoplasms', 'Basal Cell Neoplasms', 'Blood Vessel Tumors', 'Chronic Myeloproliferative Disorders', 'Complex Epithelial Neoplasms', 'Complex Mixed and Stromal Neoplasms', 'Cystic, Mucinous and Serous Neoplasms', 'Ductal and Lobular Neoplasms', 'Epithelial Neoplasms, NOS', 'Fibroepithelial Neoplasms', 'Fibromatous Neoplasms', 'Germ Cell Neoplasms', 'Giant Cell Tumors', 'Gliomas', 'Granular Cell Tumors and Alveolar Soft Part Sarcomas', 'Hodgkin Lymphoma', 'Immunoproliferative Diseases', 'Leukemias, NOS', 'Lipomatous Neoplasms', 'Lymphatic Vessel Tumors', 'Lymphoid Leukemias', 'Malignant Lymphomas, NOS or Diffuse', 'Mast Cell Tumors', 'Mature B-Cell Lymphomas', 'Mature T- and NK-Cell Lymphomas', 'Meningiomas', 'Mesonephromas', 'Mesothelial Neoplasms', 'Miscellaneous Bone Tumors', 'Miscellaneous Tumors', 'Mucoepidermoid Neoplasms', 'Myelodysplastic Syndromes', 'Myel

### Search by column name and subschema name

In [10]:
from gdc_api import GDCSchema

gdc = GDCSchema("Disease", subschema="follow_up")
gdc.get_gdc_candidates()

{'follow_up::disease_response': {'score': 0.7440476190476191,
  'description': "Code assigned to describe the patient's response or outcome to the disease."}}

### Parse dataframe

In [11]:
import pandas as pd
from gdc_api import GDCSchema

dataset = pd.read_csv('../data/extracted-tables/Dou_UCEC_CPTAC3_meta_table_V2.1.csv')

gdc = GDCSchema()
gdc.parse_df(dataset)

AttributeError: 'GDCSchema' object has no attribute 'parse_df'

## Web Requests

In [None]:
import requests
import json

cases_endpt = 'https://api.gdc.cancer.gov/v0/submission/template/case?format=tsv'

# The 'fields' parameter is passed as a comma-separated string of single names.
# fields = [
#     "submitter_id",
#     "case_id",
#     "primary_site",
#     "disease_type",
#     "diagnoses.vital_status"
#     ]

# fields = ','.join(fields)

# params = {
#     "fields": fields,
#     "format": "TSV",
#     "size": "100"
#     }

response = requests.get(cases_endpt)

print(response.content)

## Python

In [None]:
from gdcdictionary.python import GDCDictionary, gdcdictionary

In [None]:
gdcdictionary.schema.keys()

dict_keys(['germline_mutation_calling_workflow', 'copy_number_estimate', 'analyte', 'pathology_detail', 'aligned_reads', 'program', 'other_clinical_attribute', 'read_group_qc', 'aliquot', 'genomic_profile_harmonization_workflow', 'copy_number_segment', 'archive', 'simple_somatic_mutation', 'filtered_copy_number_segment', 'methylation_liftover_workflow', 'protein_expression', 'project', 'case', 'pathology_report', 'root', 'somatic_annotation_workflow', 'structural_variation', 'secondary_expression_analysis', 'submitted_unaligned_reads', 'slide_image', 'data_subtype', 'submitted_tangent_copy_number', 'rna_expression_workflow', 'somatic_copy_number_workflow', 'somatic_mutation_index', 'copy_number_variation_workflow', 'clinical_supplement', 'alignment_workflow', 'gene_expression', 'treatment', 'portion', 'structural_variant_calling_workflow', 'submitted_genotyping_array', 'molecular_test', 'copy_number_liftover_workflow', 'tissue_source_site', 'methylation_array_harmonization_workflow', '

In [None]:
for key in gdcdictionary.schema.keys():
    print(gdcdictionary.schema[key]['properties'].keys())

dict_keys(['workflow_type', 'aligned_reads_files', 'submitted_genotyping_arrays', 'workflow_link', 'workflow_version', 'workflow_start_datetime', 'workflow_end_datetime', 'type', 'id', 'submitter_id', 'batch_id', 'state', 'project_id', 'created_datetime', 'updated_datetime'])
dict_keys(['data_category', 'data_type', 'data_format', 'experimental_strategy', 'platform', 'copy_number_variation_workflows', 'genomic_profile_harmonization_workflows', 'somatic_copy_number_workflows', 'file_name', 'file_size', 'md5sum', 'file_state', 'error_type', 'state_comment', 'type', 'id', 'submitter_id', 'batch_id', 'state', 'project_id', 'created_datetime', 'updated_datetime'])
dict_keys(['a260_a280_ratio', 'amount', 'analyte_quantity', 'analyte_type', 'analyte_type_id', 'analyte_volume', 'concentration', 'experimental_protocol_type', 'normal_tumor_genotype_snp_match', 'ribosomal_rna_28s_16s_ratio', 'rna_integrity_number', 'spectrophotometer_method', 'well_number', 'portions', 'samples', 'type', 'id', 's

In [None]:
gdcdictionary.schema['mirna_expression_workflow']['properties']['project_id']

{'type': 'string',
 'common': {'description': 'Unique ID for any specific defined piece of work that is undertaken or attempted to meet a single requirement.',
  'termDef': {'term': None,
   'source': None,
   'cde_id': None,
   'cde_version': None,
   'term_url': None}}}

## Read From Yaml (Silly Solution)

In [None]:
from git import Repo  # pip install gitpython

git_url = "https://github.com/NCI-GDC/gdcdictionary.git"

Repo.clone_from(git_url, "gdc/")

<git.repo.base.Repo '/home/yfw215/alpha-automl/examples/gdc/.git'>

In [None]:
import yaml
import os

gdc_dictionary_path = "gdc/src/gdcdictionary/schemas"
files = os.listdir(gdc_dictionary_path)

yamls = [ fi for fi in files if fi.endswith(".yaml") ]

with open(os.path.join(gdc_dictionary_path, 'demographic.yaml'), 'r') as file:
    prime_service = yaml.safe_load(file)
    
prime_service

{'$schema': 'http://json-schema.org/draft-04/schema#',
 'id': 'demographic',
 'title': 'Demographic',
 'type': 'object',
 'namespace': 'https://gdc.cancer.gov',
 'category': 'clinical',
 'project': '*',
 'program': '*',
 'description': 'Data for the characterization of the patient by means of segmenting the population (e.g., characterization by age, sex, or race).',
 'additionalProperties': False,
 'submittable': True,
 'downloadable': False,
 'previous_version_downloadable': False,
 'validators': None,
 'systemProperties': ['id',
  'project_id',
  'created_datetime',
  'updated_datetime',
  'state'],
 'links': [{'name': 'cases',
   'backref': 'demographics',
   'label': 'describes',
   'target_type': 'case',
   'multiplicity': 'one_to_one',
   'required': True}],
 'required': ['submitter_id', 'ethnicity', 'gender', 'race', 'vital_status'],
 'uniqueKeys': [['id'], ['project_id', 'submitter_id']],
 'deprecated': ['premature_at_birth', 'weeks_gestation_at_birth'],
 'properties': {'$ref':

In [None]:
prime_service['properties'].keys()

dict_keys(['$ref', 'age_at_index', 'age_is_obfuscated', 'cause_of_death', 'cause_of_death_source', 'country_of_birth', 'country_of_residence_at_enrollment', 'days_to_birth', 'days_to_death', 'education_level', 'ethnicity', 'gender', 'marital_status', 'occupation_duration_years', 'premature_at_birth', 'race', 'vital_status', 'weeks_gestation_at_birth', 'year_of_birth', 'year_of_death', 'cases'])

In [None]:
properties = prime_service['properties']

def process_properties(properties):
    filtered_properties = {}
    for key in properties:
        if 'type' in properties[key]:
            filtered_dict = properties[key]
            del filtered_dict['$ref']
        elif 'enum' in properties[key]:
            filtered_dict = {}
            filtered_dict['type'] = 'enum'
            filtered_dict['enum'] = properties[key]['enum']
        else:
            continue
        filtered_properties[key] = filtered_dict
    
    return filtered_properties

process_properties(properties)        

{'age_is_obfuscated': {'type': 'boolean'},
 'cause_of_death': {'type': 'enum',
  'enum': ['Cancer Related',
   'Cardiovascular Disorder, NOS',
   'End-stage Renal Disease',
   'Infection',
   'Not Cancer Related',
   'Renal Disorder, NOS',
   'Spinal Muscular Atrophy',
   'Surgical Complications',
   'Toxicity',
   'Unknown',
   'Not Reported']},
 'cause_of_death_source': {'type': 'enum',
  'enum': ['Autopsy',
   'Death Certificate',
   'Medical Record',
   'Social Security Death Index',
   'Unknown',
   'Not Reported']},
 'country_of_birth': {'type': 'enum',
  'enum': ['Afghanistan',
   'Albania',
   'Algeria',
   'Andorra',
   'Angola',
   'Anguilla',
   'Antigua and Barbuda',
   'Argentina',
   'Armenia',
   'Aruba',
   'Australia',
   'Austria',
   'Azerbaijan',
   'Bahamas',
   'Bahrain',
   'Bangladesh',
   'Barbados',
   'Belarus',
   'Belgium',
   'Belize',
   'Benin',
   'Bermuda',
   'Bhutan',
   'Bolivia',
   'Bosnia and Herzegovina',
   'Botswana',
   'Brazil',
   'Brunei',