In [170]:
import pandas as pd
import seaborn as sns
sns.set_theme(style="white")

from valentine.algorithms import Coma
from valentine.algorithms import JaccardDistanceMatcher
from valentine.algorithms import SimilarityFlooding
from valentine.algorithms import DistributionBased
from valentine.algorithms import Cupid
from valentine.metrics import F1Score, PrecisionTopNPercent
from valentine import valentine_match
import pprint

import os
import sys
import jellyfish

# Put the parent of the current working directory on the import path so the
# project-local `gdc` package can be imported; this has to run before the import below.
sys.path.append(os.path.join(os.path.abspath(''), '..'))
from gdc.gdc_api import GDCSchema

# Shared pretty-printer (4-space indent, dict keys sorted) used by later cells.
pp = pprint.PrettyPrinter(indent=4, sort_dicts=True)


In [171]:
# Read the target table and discard its 'study' column in one chained step.
df_target = pd.read_csv('../data/target.csv').drop(columns='study')
print(df_target.shape)
df_target.head()

(1068, 16)


Unnamed: 0,case_submitter_id,age_at_diagnosis,race,ethnicity,gender,vital_status,ajcc_pathologic_t,ajcc_pathologic_n,ajcc_pathologic_stage,tumor_grade,tumor_focality,tumor_largest_dimension_diameter,primary_diagnosis,morphology,tissue_or_organ_of_origin,tumor_code
0,01BR001,20089.0,black or african american,not hispanic or latino,female,Alive,T2,N1c,Stage II,GX,Not Reported,Not Reported,Invasive carcinoma of no special type,8500/3,"Breast, NOS",BRCA
1,01BR008,17532.0,black or african american,not hispanic or latino,female,Not Reported,Not Reported,Not Reported,Not Reported,GX,Not Reported,Not Reported,Not Reported,Not Reported,"Breast, NOS",BRCA
2,01BR009,23376.0,black or african american,not hispanic or latino,female,Not Reported,Not Reported,Not Reported,Not Reported,GX,Not Reported,Not Reported,Not Reported,Not Reported,"Breast, NOS",BRCA
3,01BR010,23741.0,black or african american,not hispanic or latino,female,Not Reported,Not Reported,Not Reported,Not Reported,GX,Not Reported,Not Reported,Not Reported,Not Reported,"Breast, NOS",BRCA
4,01BR015,12784.0,white,not hispanic or latino,female,Alive,T2,N1,Stage II,GX,Not Reported,Not Reported,Invasive carcinoma of no special type,8500/3,"Breast, NOS",BRCA


### Check GDC Dictionary for matches

In [172]:
# Match the target columns against the GDC dictionary via GDCSchema.parse_df.
gdc_schema = GDCSchema()
gdc_matches = gdc_schema.parse_df(df_target)

# Reduce every match entry to its 'candidate' field for a compact overview.
gdc_matches_values = {}
for column_name, match_info in gdc_matches.items():
    gdc_matches_values[column_name] = match_info['candidate']
gdc_matches_values

{'case_submitter_id': 'germline_mutation_calling_workflow::submitter_id',
 'age_at_diagnosis': 'clinical::age_at_diagnosis',
 'race': 'clinical::race',
 'ethnicity': 'clinical::ethnicity',
 'gender': 'clinical::gender',
 'vital_status': 'clinical::vital_status',
 'ajcc_pathologic_t': 'diagnosis::ajcc_pathologic_t',
 'ajcc_pathologic_n': 'diagnosis::ajcc_pathologic_n',
 'ajcc_pathologic_stage': 'diagnosis::ajcc_pathologic_stage',
 'tumor_grade': 'diagnosis::tumor_grade',
 'tumor_focality': 'diagnosis::tumor_focality',
 'tumor_largest_dimension_diameter': 'pathology_detail::tumor_largest_dimension_diameter',
 'primary_diagnosis': 'diagnosis::primary_diagnosis',
 'morphology': 'diagnosis::morphology',
 'tissue_or_organ_of_origin': 'diagnosis::tissue_or_organ_of_origin',
 'tumor_code': 'sample::tumor_code'}

In [181]:
# For every matched column, compare the distinct values found in the target
# table with the value list attached to its GDC match (case-insensitively)
# and report how much the two domains overlap.
for col, match in gdc_matches.items():
    if isinstance(match['values'], list):
        # dropna() + str(): missing or non-string entries (NaN, numbers) would
        # otherwise raise AttributeError on .lower().
        col_domain = {str(value).lower() for value in df_target[col].dropna().unique()}
        gdc_domain = {str(value).lower() for value in match['values']}

        print(col, gdc_domain)

        # Compute the overlap once and guard both ratios against empty domains.
        shared_values = col_domain & gdc_domain
        ratio_col2GDC = len(shared_values) / len(col_domain) if col_domain else 0.0
        ratio_GDC2col = len(shared_values) / len(gdc_domain) if gdc_domain else 0.0
        print('Ratio of column values in GDC:', ratio_col2GDC)
        print('Ratio GDC values in column values :', ratio_GDC2col)
        if ratio_col2GDC != 1.0:
            print('Column values that are not in GDC:', col_domain - gdc_domain)

    else:
        # No value list for this match; report its declared type instead.
        print('No domain enumeration for',col,' with type', match['type'])
    print('-----------------')
            
    

No domain enumeration for case_submitter_id  with type string
-----------------
No domain enumeration for age_at_diagnosis  with type number
-----------------
race {'american indian or alaska native', 'black or african american', 'not reported', 'other', 'asian', 'native hawaiian or other pacific islander', 'white'}
Ratio of column values in GDC: 0.8
Ratio GDC values in column values : 0.5714285714285714
Column values that are not in GDC: {'Not Reported'}
-----------------
ethnicity {'not hispanic or latino', 'hispanic or latino'}
Ratio of column values in GDC: 0.6666666666666666
Ratio GDC values in column values : 1.0
Column values that are not in GDC: {'Not Reported'}
-----------------
gender {'unknown', 'unspecified', 'male', 'female'}
Ratio of column values in GDC: 1.0
Ratio GDC values in column values : 0.5
-----------------
vital_status {'alive', 'dead', 'lost to follow-up'}
Ratio of column values in GDC: 0.0
Ratio GDC values in column values : 0.0
Column values that are not in G

In [None]:
# Load the Liu supplementary table and, separately, its data dictionary.
liu_table_path = '../data/liuData_Supplementary_Table_1.csv'
liu_dictionary_path = '../data/liuData_Data_dictionary.csv'

df_liu = pd.read_csv(liu_table_path)
df_liu_dictionary = pd.read_csv(liu_dictionary_path)

print(df_liu.shape)
df_liu.head()

(1185, 83)


Unnamed: 0,Case_ID,tumor_code,tumor_sample_id_protein,normal_sample_id_protein,is_excluded_from_pancancer_studies,reason_for_exclusion,is_gtex,specimen/aliquout_id_protein_normal,specimen/aliquout_id_protein_tumor,specimen/aliquout_id_RNA_Tumor,...,follow-up/additional_treatment_radiation_therapy_for_new_tumor,follow-up/additional_treatment_pharmaceutical_therapy_for_new_tumor,follow-up/additional_treatment_immuno_for_new_tumor,follow-up/number_of_days_from_date_of_initial_pathologic_diagnosis_to_date_of_additional_surgery_for_new_tumor_event_loco-regional,follow-up/number_of_days_from_date_of_initial_pathologic_diagnosis_to_date_of_additional_surgery_for_new_tumor_event_metastasis,"Recurrence-free survival, days","Recurrence status (1, yes; 0, no)","Overall survival, days","Survival status (1, dead; 0, alive)","Time between collection and diagnosis, days"
0,11BR047,BRCA,11BR047-T,,,,0.0,,81116212-b7e6-454b-9579-105cf3,ef52c640-13a9-4855-9ce2-0be77a_D7_1,...,,,,,,,0.0,369.0,0.0,
1,11BR043,BRCA,11BR043-T,,,,0.0,,6d34d499-167e-42aa-9790-316fca_D2,6d34d499-167e-42aa-9790-316fca_D7_1,...,,,,,,,0.0,374.0,0.0,
2,11BR049,BRCA,11BR049-T,,,,0.0,,2e700669-85b0-43fa-a9c7-3eaf5a_D2,2e700669-85b0-43fa-a9c7-3eaf5a_D1,...,,,,,,,0.0,379.0,0.0,
3,11BR023,BRCA,11BR023-T,,,,0.0,,0a80d3c4-0758-447a-958c-ea868c,079b5600-6afc-4785-bb22-48cfab_D7_1,...,,,,,,,0.0,398.0,0.0,
4,18BR010,BRCA,18BR010-T,,,,0.0,,0bb9d596-774e-452b-9c89-a6643c_D2,0bb9d596-774e-452b-9c89-a6643c_D7_1,...,,,,,,,0.0,410.0,0.0,


In [None]:
# Keep only the columns that hold a non-null value in at least 60% of the rows
# (equivalently: drop every column with more than 40% missing entries).
threshold = 0.6 * len(df_liu)
print(threshold)

df_liu = df_liu.dropna(axis='columns', thresh=threshold)
print(df_liu.shape)
df_liu.head(10)


711.0
(1185, 45)


Unnamed: 0,Case_ID,tumor_code,tumor_sample_id_protein,is_gtex,specimen/aliquout_id_protein_tumor,specimen/aliquout_id_RNA_Tumor,specimen/aliquout_id_DNA_Tumor,specimen/aliquout_id_DNA_Blood,Age,Sex,...,follow-up/number_of_days_from_date_of_initial_pathologic_diagnosis_to_date_of_last_contact,follow-up/adjuvant_post-operative_radiation_therapy,follow-up/adjuvant_post-operative_pharmaceutical_therapy,follow-up/adjuvant_post-operative_immunological_therapy,follow-up/measure_of_success_of_outcome_at_the_completion_of_initial_first_course_treatment,follow-up/new_tumor_after_initial_treatment,"Recurrence status (1, yes; 0, no)","Overall survival, days","Survival status (1, dead; 0, alive)","Time between collection and diagnosis, days"
0,11BR047,BRCA,11BR047-T,0.0,81116212-b7e6-454b-9579-105cf3,ef52c640-13a9-4855-9ce2-0be77a_D7_1,ef52c640-13a9-4855-9ce2-0be77a_D6_1,0bfd3ae8-d275-4dab-9360-d1df30_D1_1,59,Female,...,369.0,Not Applicable,Not Applicable,,,Not Applicable,0.0,369.0,0.0,
1,11BR043,BRCA,11BR043-T,0.0,6d34d499-167e-42aa-9790-316fca_D2,6d34d499-167e-42aa-9790-316fca_D7_1,6d34d499-167e-42aa-9790-316fca_D6_1,a1338e3d-598a-45b5-838f-9b6206_D1_1,36,Female,...,374.0,Not Applicable,Not Applicable,,,Not Applicable,0.0,374.0,0.0,
2,11BR049,BRCA,11BR049-T,0.0,2e700669-85b0-43fa-a9c7-3eaf5a_D2,2e700669-85b0-43fa-a9c7-3eaf5a_D1,2e700669-85b0-43fa-a9c7-3eaf5a_D6,ad85fadf-31b7-44f3-b6ea-73c55d_D1,45,Female,...,379.0,Not Applicable,Not Applicable,,,Not Applicable,0.0,379.0,0.0,
3,11BR023,BRCA,11BR023-T,0.0,0a80d3c4-0758-447a-958c-ea868c,079b5600-6afc-4785-bb22-48cfab_D7_1,079b5600-6afc-4785-bb22-48cfab_D6_1,70ee4720-f4dc-4b91-a6aa-c0efd9_D1_1,58,Female,...,398.0,Not Applicable,Not Applicable,,,Not Applicable,0.0,398.0,0.0,
4,18BR010,BRCA,18BR010-T,0.0,0bb9d596-774e-452b-9c89-a6643c_D2,0bb9d596-774e-452b-9c89-a6643c_D7_1,0bb9d596-774e-452b-9c89-a6643c_D6_1,feab8720-9694-4679-917f-f1287c_D1_1,41,Female,...,410.0,Not Applicable,Not Applicable,,,Not Applicable,0.0,410.0,0.0,
5,06BR003,BRCA,06BR003-T,0.0,31b13596-554e-452f-91a4-7e67a8_D2,"['31b13596-554e-452f-91a4-7e67a8_D7', 'not run']","['31b13596-554e-452f-91a4-7e67a8_D6', 'not run']","['31da4db6-8404-43f0-995d-ebc10b_D1', 'not run']",75,Female,...,323.0,Not Applicable,Not Applicable,,,Not Applicable,0.0,323.0,0.0,
6,11BR074,BRCA,11BR074-T,0.0,3367406e-d39c-4641-a3e7-44e1f3,12ce8313-6c54-4bee-a996-1aa7f8_D7,12ce8313-6c54-4bee-a996-1aa7f8_D6_1,93f961e1-fe4c-4dfc-8cbe-1dff31_D1_1,66,Female,...,367.0,Not Applicable,Not Applicable,,,Not Applicable,0.0,367.0,0.0,
7,18BR017,BRCA,18BR017-T,0.0,392b8aa4-9f99-4250-b693-326260_D2,392b8aa4-9f99-4250-b693-326260_D7_1,392b8aa4-9f99-4250-b693-326260_D6_1,f49705bd-129b-4bfe-a944-a7eb51_D1_1,72,Female,...,445.0,Not Applicable,Not Applicable,,,Not Applicable,0.0,445.0,0.0,
8,01BR017,BRCA,01BR017-T,0.0,09659708-7747-4d59-a3b9-e221e0_D2,09659708-7747-4d59-a3b9-e221e0_D7,09659708-7747-4d59-a3b9-e221e0_D6,b76c68fc-2403-4e79-ad46-4ae336_D1,45,Female,...,413.0,Yes,Yes,,,No,0.0,413.0,0.0,
9,06BR006,BRCA,06BR006-T,0.0,3799eeb5-c966-4059-a78f-ace269_D2,3799eeb5-c966-4059-a78f-ace269_D7,3799eeb5-c966-4059-a78f-ace269_D6,7e168c2a-f22a-44d9-ac7d-1a4c42_D1,51,Female,...,137.0,No,Yes,,,Yes,1.0,137.0,0.0,


### Find and analyze candidate key columns of target and Liu

In [None]:
def find_dataframe_uniques(df, min_unique_ratio=0.98):
    """Return the columns of ``df`` whose values are (almost) all distinct.

    A column qualifies when the number of distinct non-null values divided by
    the number of rows is strictly greater than ``min_unique_ratio``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    min_unique_ratio : float, default 0.98
        Exclusive lower bound on ``nunique / row_count``.

    Returns
    -------
    list
        Qualifying column labels, in column order. Empty when ``df`` has no
        rows (the ratio is undefined there, so nothing is reported).
    """
    row_count = len(df)
    if row_count == 0:
        # Avoid 0 / 0: an empty table has no key candidates.
        return []
    return [
        col
        for col in df.columns
        # Series.nunique() ignores NaN by default.
        if df[col].nunique() / row_count > min_unique_ratio
    ]

# Report the key-like columns of both tables: target first, then Liu.
for frame in (df_target, df_liu):
    print(find_dataframe_uniques(frame))



['case_submitter_id']
['Case_ID']


### Find the rows in liu that were not included in GDC_formatted

In [None]:
# Share of target case ids that also appear as a Case_ID in the Liu table.
target_ids = df_target['case_submitter_id']
common_values = target_ids.isin(df_liu['Case_ID']).sum()
total_values = len(target_ids)
percentage = (common_values / total_values) * 100
print(f'Percentage of comumn values between gdc_formatted and liu_et_all {percentage}')

# Liu rows whose Case_ID does not occur among the target case ids.
missing_from_target = ~df_liu['Case_ID'].isin(target_ids)
subset_not_in_target = df_liu[missing_from_target]
not_in_target = subset_not_in_target['Case_ID']
print('Values in liu_et_all not in gdc_formatted')
print(subset_not_in_target)



Percentage of comumn values between gdc_formatted and liu_et_all 100.0
Values in liu_et_all not in gdc_formatted
      Case_ID tumor_code tumor_sample_id_protein  is_gtex  \
75    11BR057       BRCA               11BR057-T      0.0   
97    11BR078       BRCA               11BR078-T      0.0   
100   11BR076       BRCA               11BR076-T      0.0   
123       604       BRCA                     NaN      0.0   
124      1488       BRCA                     NaN      0.0   
...       ...        ...                     ...      ...   
1180     NX10       UCEC                     NaN      0.0   
1181     NX16       UCEC                     NaN      0.0   
1182     NX18       UCEC                     NaN      0.0   
1183     NX11       UCEC                     NaN      0.0   
1184     NX15       UCEC                     NaN      0.0   

     specimen/aliquout_id_protein_tumor specimen/aliquout_id_RNA_Tumor  \
75    8886aaca-adcf-4416-920a-794bf0_D2                            NaN   
97    

### Find rows in liu with many nulls

In [None]:
# A row is reported when its number of null cells exceeds 70% of the column count.
threshold = 0.7 * len(df_liu.columns)
null_counts = df_liu.isna().sum(axis='columns')
rows_with_nulls = df_liu.loc[null_counts > threshold]

print(f'Rows with nulls in more than {threshold} columns')
print(rows_with_nulls)

Rows with nulls in more than 31.499999999999996 columns
         Case_ID tumor_code tumor_sample_id_protein  is_gtex  \
97       11BR078       BRCA               11BR078-T      0.0   
124         1488       BRCA                     NaN      0.0   
125    100002921       BRCA                     NaN      0.0   
126    100003304       BRCA                     NaN      0.0   
127    100004012       BRCA                     NaN      0.0   
128    100004028       BRCA                     NaN      0.0   
132      05BR051       BRCA                     NaN      0.0   
133      05BR052       BRCA                     NaN      0.0   
134      05BR055       BRCA                     NaN      0.0   
135      05BR058       BRCA                     NaN      0.0   
137      18BR008       BRCA                     NaN      0.0   
351      05CO014       COAD                     NaN      0.0   
358      PT-NPJ7        GBM                     NaN      1.0   
359      PT-P44H        GBM                     

## Compute the matching between the GDC_compiled file and Liu_data

In [None]:
# Matcher used for this run. The other imported algorithms can be swapped in here:
# Coma(), JaccardDistanceMatcher(), SimilarityFlooding(), DistributionBased(), Cupid().
matcher = Coma(use_instances=True)

matcher_name = type(matcher).__name__
print(f"Computing the matches using the {matcher_name} algorithm...")

# The Liu table is passed first and the target table second.
matches = valentine_match(df_liu, df_target, matcher)


Computing the matches using the Coma algorithm...


In [None]:
# Every match returned by the matcher.
print("Found the following matches:")
pp.pprint(matches)

# The same result reduced by matches.one_to_one().
print("\nOne-to-one matches:")
one_to_one_matches = matches.one_to_one()
pp.pprint(one_to_one_matches)


Found the following matches:
{   (('table_1', 'Case_ID'), ('table_2', 'case_submitter_id')): 0.5909878,
    (('table_1', 'Ethnicity'), ('table_2', 'ethnicity')): 0.73032486,
    (('table_1', 'Race'), ('table_2', 'race')): 0.721171,
    (('table_1', 'Sex'), ('table_2', 'gender')): 0.3429932,
    (('table_1', 'baseline/margin_status'), ('table_2', 'vital_status')): 0.30746195,
    (('table_1', 'baseline/number_of_lymph_nodes_positive_for_tumor_by_he_staining'), ('table_2', 'tumor_largest_dimension_diameter')): 0.27622205,
    (('table_1', 'baseline/pathologic_staging_primary_tumor_pt'), ('table_2', 'ajcc_pathologic_t')): 0.36558565,
    (('table_1', 'baseline/pathologic_staging_regional_lymph_nodes_pn'), ('table_2', 'ajcc_pathologic_n')): 0.36031228,
    (('table_1', 'baseline/tumor_stage_pathological'), ('table_2', 'ajcc_pathologic_stage')): 0.516451,
    (('table_1', 'tumor_code'), ('table_2', 'tumor_code')): 0.82753456}

One-to-one matches:
{   (('table_1', 'Case_ID'), ('table_2', 'ca

In [None]:
# Compare the lower-cased header names of both tables.
target_cols = df_target.columns.str.lower()
liu_cols = df_liu.columns.str.lower()

for candidate in liu_cols:
    # Exact (case-insensitive) name hit: report it and move on.
    if candidate in target_cols:
        print(f"Column {candidate} is present in both datasets")
        continue

    # Otherwise list every target header whose Jaro similarity exceeds 0.7.
    for target in target_cols:
        score = jellyfish.jaro_similarity(candidate, target)
        if score > 0.7:
            print(f"Column {candidate} matches with {target}")
        

Column case_id matches with case_submitter_id
Column tumor_code is present in both datasets
Column tumor_sample_id_protein matches with tumor_grade
Column tumor_sample_id_protein matches with tumor_focality
Column tumor_sample_id_protein matches with tumor_largest_dimension_diameter
Column age matches with age_at_diagnosis
Column age matches with race
Column race is present in both datasets
Column ethnicity is present in both datasets
Column cptac_path/histologic_grade matches with ajcc_pathologic_stage


### Generate a table from GDC dictionary and check for matching


In [None]:
# For each target column, build a GDCSchema for the name prefixed with
# 'clinical::' and print every candidate it returns.
for col in df_target.columns:
    print(f"Looking for candidates for column {col}...")

    gdc_schema = GDCSchema(f'clinical::{col}')
    candidates = gdc_schema.get_gdc_candidates()
    for cand in candidates:
        print(cand)

    print('\n')
    # TODO: an earlier draft also inspected a candidate via
    # get_properties_by_gdc_candidate() / get_gdc_col_values(); currently disabled.
    
    
    
   

Looking for candidates for column case_submitter_id...
annotation::clinical_supplements
other_clinical_attribute::cases
clinical_supplement::cases
follow_up::cases
annotation::cases
file::cases
sample::cases
family_history::cases
clinical::cases
diagnosis::cases
biospecimen_supplement::cases
exposure::cases
demographic::cases
treatment::clinical_trial_indicator


Looking for candidates for column age_at_diagnosis...
family_history::relationship_age_at_diagnosis
treatment::clinical_trial_indicator
diagnosis::eln_risk_classification
clinical::age_at_diagnosis
diagnosis::age_at_diagnosis


Looking for candidates for column race...
treatment::clinical_trial_indicator


Looking for candidates for column ethnicity...
treatment::clinical_trial_indicator
exposure::alcohol_intensity


Looking for candidates for column gender...
annotation::clinical_supplements


Looking for candidates for column vital_status...
demographic::marital_status
clinical::vital_status
demographic::vital_status
annotat

In [None]:
# List the candidates for the same name with and without the 'clinical::' prefix,
# separated by a blank block. `gdc_schema` ends up bound to the unprefixed lookup.
queries = ('clinical::case_id', 'case_id')
for position, query in enumerate(queries):
    if position > 0:
        print('\n')
    gdc_schema = GDCSchema(query)
    for cand in gdc_schema.get_gdc_candidates():
        print(cand)

treatment::clinical_trial_indicator
annotation::clinical_supplements


other_clinical_attribute::cases
clinical_supplement::cases
follow_up::cases
annotation::cases
file::cases
sample::cases
family_history::cases
clinical::cases
diagnosis::cases
biospecimen_supplement::cases
exposure::cases
demographic::cases
demographic::cause_of_death
tissue_source_site::bcr_id
raw_methylation_array::chip_id
masked_methylation_array::chip_id
demographic::cause_of_death_source
germline_mutation_calling_workflow::batch_id
copy_number_estimate::batch_id
analyte::batch_id
pathology_detail::batch_id
aligned_reads::batch_id
other_clinical_attribute::batch_id
read_group_qc::batch_id
aliquot::batch_id
genomic_profile_harmonization_workflow::batch_id
copy_number_segment::batch_id
archive::batch_id
simple_somatic_mutation::batch_id
filtered_copy_number_segment::batch_id
methylation_liftover_workflow::batch_id
protein_expression::batch_id
case::batch_id
pathology_report::batch_id
somatic_annotation_workflow::ba

In [None]:

# Show the properties stored for one specific candidate of the current gdc_schema.
candidate_key = 'other_clinical_attribute::cases'
gdc_schema.get_properties_by_gdc_candidate(candidate_key)

{'anyOf': [{'type': 'array',
   'items': {'minItems': 1,
    'maxItems': 1,
    'type': 'object',
    'additionalProperties': True,
    'properties': {'id': {'type': 'string',
      'pattern': '^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$',
      'systemAlias': 'node_id',
      'common': {'description': 'A 128-bit identifier. Depending on the mechanism used to generate it, it is either guaranteed to be different from all other UUIDs/GUIDs generated until 3400 AD or extremely likely to be different. Its relatively small size lends itself well to sorting, ordering, and hashing of all sorts, storing in databases, simple allocation, and ease of programming in general.',
       'termDef': {'term': 'Universally Unique Identifier',
        'source': 'NCIt',
        'cde_id': 'C54100',
        'cde_version': None,
        'term_url': 'https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&version=16.02d&ns=NCI_Thesaurus&code=C54100'}}},

In [None]:
# Candidate keys found for an unprefixed column name.
stage_query = 'ajcc_pathologic_stage'
gdc_schema = GDCSchema(stage_query)
gdc_schema.candidates.keys()

dict_keys(['diagnosis::ajcc_pathologic_stage', 'diagnosis::ajcc_pathologic_t', 'diagnosis::uicc_pathologic_stage', 'diagnosis::ajcc_pathologic_m', 'diagnosis::ajcc_pathologic_n', 'diagnosis::uicc_pathologic_t', 'diagnosis::uicc_pathologic_m', 'diagnosis::uicc_pathologic_n', 'diagnosis::ensat_pathologic_stage', 'diagnosis::ajcc_clinical_stage', 'diagnosis::ann_arbor_pathologic_stage'])