In [1]:
import sys
sys.path.append('../')

from gdc.gdc_api import GDCSchema

In [2]:
# Instantiate the project's GDCSchema helper imported from gdc.gdc_api.
# NOTE(review): gdc_schema is not referenced by any cell visible in this
# notebook excerpt — confirm it is still needed.
gdc_schema = GDCSchema()

In [3]:
import csv
import json
import os
import random

# Parse ../gdc/gdc_schema.json into `schema_data`.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open('../gdc/gdc_schema.json', 'r', encoding='utf-8') as f:
    schema_data = json.load(f)

### Extract distinct GDC variable names

In [4]:
def extract_properties(schema_data, sample_size=10, seed=None):
    """Summarise every distinct property declared across the schema entries.

    Each top-level entry of ``schema_data`` that is a mapping with a
    ``'properties'`` mapping contributes its properties. For a property name
    that appears in several entries, the declared types and enum values are
    merged.

    Args:
        schema_data: Mapping of schema key -> schema definition. Entries that
            are not mappings, or whose ``'properties'`` is not a mapping, are
            skipped.
        sample_size: Maximum number of enum values reported per property. When
            a property has more, a random sample of this size is reported.
            ``None`` disables sampling and reports every value.
        seed: Optional seed for the sampling. With ``None`` (the default) the
            module-level ``random`` generator is used, as before; otherwise a
            private ``random.Random(seed)`` makes the sample reproducible.

    Returns:
        list[dict]: One dict per property, sorted by property name, with the
        keys ``'column name'``, ``'column type'`` and ``'column values'``.
        Types and values are comma-separated, sorted strings; an empty
        collection is reported as ``'Not specified'``.
    """
    collected = {}
    for schema_value in schema_data.values():
        # A plain `'properties' in schema_value` would also match strings and
        # lists, so require real mappings before indexing into them.
        if not isinstance(schema_value, dict):
            continue
        properties = schema_value.get('properties')
        if not isinstance(properties, dict):
            continue

        for prop, attributes in properties.items():
            entry = collected.setdefault(prop, {'types': set(), 'values': set()})
            if not isinstance(attributes, dict):
                continue

            prop_type = attributes.get('type')
            if prop_type:
                # 'type' may be a single name or a list of names.
                if isinstance(prop_type, list):
                    entry['types'].update(str(name) for name in prop_type)
                else:
                    entry['types'].add(str(prop_type))

            # Enum members may be numbers, booleans or null in JSON; store them
            # as text so they can be sorted and joined without a TypeError.
            enum_values = attributes.get('enum')
            if enum_values:
                entry['values'].update(str(value) for value in enum_values)
            elif 'oneOf' in attributes:
                for option in attributes['oneOf']:
                    if isinstance(option, dict) and 'enum' in option:
                        entry['values'].update(str(value) for value in option['enum'])

    rng = random if seed is None else random.Random(seed)

    properties_info = []
    for prop in sorted(collected):
        # Sort before sampling: set iteration order varies between interpreter
        # runs, which would make even a seeded sample irreproducible.
        values = sorted(collected[prop]['values'])
        if sample_size is not None and len(values) > sample_size:
            values = sorted(rng.sample(values, sample_size))
        values = ', '.join(values)

        types = ', '.join(sorted(collected[prop]['types']))

        properties_info.append({
            'column name': prop,
            'column type': types if types else 'Not specified',
            'column values': values if values else 'Not specified'
        })

    return properties_info

In [5]:
# Run the extraction with the default sample_size of 10. Enum lists longer than
# that are randomly sampled, so the 'column values' shown can differ between runs.
properties_info = extract_properties(schema_data)
properties_info

[{'column name': 'a260_a280_ratio',
  'column type': 'number',
  'column values': 'Not specified'},
 {'column name': 'aa_change',
  'column type': 'string',
  'column values': 'Not specified'},
 {'column name': 'adapter_content',
  'column type': 'Not specified',
  'column values': 'FAIL, Not Reported, PASS, Unknown, WARN'},
 {'column name': 'adapter_name',
  'column type': 'string',
  'column values': 'Not specified'},
 {'column name': 'adapter_sequence',
  'column type': 'string',
  'column values': 'Not specified'},
 {'column name': 'additional_pathology_findings',
  'column type': 'Not specified',
  'column values': 'Bone marrow discordant histology, Hyperkeratosis, Inflammation, Keratinizing dysplasia; mild, Leiomyoma, Other Complex Extravascular Matrix Patterns, PD-L1 CPS (223C LDT) - 20%, Platinum-resistant, Sialadenitis, Tumor has rough spikey edges'},
 {'column name': 'adrenal_hormone',
  'column type': 'Not specified',
  'column values': 'No, Not Reported, Yes'},
 {'column na

In [6]:
# Write the extracted property table to a CSV file in the working directory,
# one row per property, with a header row.
csv_file_path = 'gdc_schema_extracted.csv'
# newline='' is what the csv module expects for files it writes; the encoding
# is pinned to UTF-8 so non-ASCII enum text is written identically on every
# platform instead of depending on the locale's default encoding.
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=['column name', 'column type', 'column values'])
    writer.writeheader()
    writer.writerows(properties_info)

### Ground-truth variable names missing from the extracted GDC schema

In [7]:
# Directory whose *.csv files are scanned below; the path is relative, so it is
# resolved against the notebook's working directory.
ground_truth_dir = '../data/table-matching-ground-truth/ground-truth'

def read_gdc_format_variable_names(csv_path):
    """Return the ``GDC_format_variable_names`` column of one CSV file.

    Args:
        csv_path: Path of a CSV file with a header row.

    Returns:
        list: The ``GDC_format_variable_names`` value of every data row, in
        file order.

    Raises:
        KeyError: If a data row is read from a file whose header has no
            ``GDC_format_variable_names`` column.
    """
    # newline='' lets the csv module do its own line-ending handling (required
    # for quoted fields that contain newlines). 'utf-8-sig' decodes UTF-8 and
    # drops a leading byte-order mark, which would otherwise become part of
    # the first header name and break the column lookup.
    with open(csv_path, mode='r', newline='', encoding='utf-8-sig') as file:
        reader = csv.DictReader(file)
        return [row['GDC_format_variable_names'] for row in reader]

# Property names produced by the schema extraction.
extracted_variable_names = set(info['column name'] for info in properties_info)

# Union of the GDC-format variable names found in every ground-truth CSV.
all_gdc_format_variable_names = set()

for csv_file in os.listdir(ground_truth_dir):
    # Only CSV files carry ground-truth mappings; ignore everything else.
    if not csv_file.endswith('.csv'):
        continue
    csv_path = os.path.join(ground_truth_dir, csv_file)
    gdc_names = read_gdc_format_variable_names(csv_path)
    all_gdc_format_variable_names |= set(gdc_names)

# Ground-truth names that the schema extraction did not produce.
missing_names = all_gdc_format_variable_names.difference(extracted_variable_names)
print('Missing GDC format variable names:', missing_names)


Missing GDC format variable names: {'vascular_invasion_present; lymphatic_invasion_present', 'case_submitter_id', 'Case_ID', 'site_of_resection_or_biopsy;tissue_or_organ_of_origin', 'site_of_resection_or_biopsy; tissue_or_organ_of_origin', 'can be inferred from tobacco_smoking_status'}
