In [64]:
import seaborn as sns
import pandas as pd
from valentine.algorithms import Coma
from valentine.algorithms import JaccardDistanceMatcher
from valentine.algorithms import SimilarityFlooding
from valentine.algorithms import DistributionBased
from valentine.algorithms import Cupid
from valentine.metrics import F1Score, PrecisionTopNPercent
from valentine import valentine_match
import pprint
import glob
from column_representation import ColumnRepresentation, gdc_target_columns, gdc_df

from polyfuzz import PolyFuzz
from polyfuzz.models import EditDistance
from jellyfish import jaro_winkler_similarity
from polyfuzz.models import Embeddings
from flair.embeddings import TransformerWordEmbeddings, WordEmbeddings

from gdc.gdc_api import GDCSchema
import utils.file_utils as ut
import sys
import os
sys.path.append(os.path.join(os.path.abspath(''), '..'))


sns.set_theme(style="white")


pp = pprint.PrettyPrinter(indent=4, sort_dicts=True)

ground_truth_path = os.path.join(
    '..', 'data', 'table-matching-ground-truth', 'ground-truth')
csv_data_path = os.path.join('..', 'data', 'extracted-tables')

cat_null_values = set(['n/a', 'na', 'nan', 'null', 'lost to follow-up', 'not available', 'unable to obtain',
                       'unknown', 'not reported', 'not allowed to collect',
                       'unspecified', 'not specified', 'other, specify','none'])

DOMAIN_SIMILARITY_THRESHOLD = 0.5

In [65]:
df_target = pd.read_csv('../data/target.csv')
df_target.drop('study', axis=1, inplace=True)
print(df_target.shape)
df_target.head(5)

(1068, 16)


Unnamed: 0,case_submitter_id,age_at_diagnosis,race,ethnicity,gender,vital_status,ajcc_pathologic_t,ajcc_pathologic_n,ajcc_pathologic_stage,tumor_grade,tumor_focality,tumor_largest_dimension_diameter,primary_diagnosis,morphology,tissue_or_organ_of_origin,tumor_code
0,01BR001,20089.0,black or african american,not hispanic or latino,female,Alive,T2,N1c,Stage II,GX,Not Reported,Not Reported,Invasive carcinoma of no special type,8500/3,"Breast, NOS",BRCA
1,01BR008,17532.0,black or african american,not hispanic or latino,female,Not Reported,Not Reported,Not Reported,Not Reported,GX,Not Reported,Not Reported,Not Reported,Not Reported,"Breast, NOS",BRCA
2,01BR009,23376.0,black or african american,not hispanic or latino,female,Not Reported,Not Reported,Not Reported,Not Reported,GX,Not Reported,Not Reported,Not Reported,Not Reported,"Breast, NOS",BRCA
3,01BR010,23741.0,black or african american,not hispanic or latino,female,Not Reported,Not Reported,Not Reported,Not Reported,GX,Not Reported,Not Reported,Not Reported,Not Reported,"Breast, NOS",BRCA
4,01BR015,12784.0,white,not hispanic or latino,female,Alive,T2,N1,Stage II,GX,Not Reported,Not Reported,Invasive carcinoma of no special type,8500/3,"Breast, NOS",BRCA


In [66]:

interested_cols = set(df_target.columns)
interested_cols.remove('case_submitter_id')
interested_cols

{'age_at_diagnosis',
 'ajcc_pathologic_n',
 'ajcc_pathologic_stage',
 'ajcc_pathologic_t',
 'ethnicity',
 'gender',
 'morphology',
 'primary_diagnosis',
 'race',
 'tissue_or_organ_of_origin',
 'tumor_code',
 'tumor_focality',
 'tumor_grade',
 'tumor_largest_dimension_diameter',
 'vital_status'}

In [67]:
gdc_df
gdc_df.ajcc_pathologic_stage.unique()

array(['stage iva', 'stage ib', 'stage x', 'stage ivb', 'stage ia3',
       'stage ic', 'stage i', 'stage ib2', 'stage iic', 'stage 0a',
       'stage 0', 'stage iiic2', 'stage iii', 'stage ii', 'stage ia1',
       'stage is', 'stage tis', 'stage iib', 'stage iiia', 'stage ia2',
       'stage ivc', 'stage iiic', 'stage ia', 'stage ib1', 'stage 0is',
       'stage iia1', 'stage iiid', 'stage iiia1', 'stage iiib',
       'stage iiia2', 'stage iiic1', 'stage iv', 'stage iia2',
       'stage iia', <NA>], dtype=object)

In [68]:
def calculate_metrics(groundtruth, matching_pairs):
    
    true_positives = len(set(groundtruth) & set(matching_pairs))
    false_positives = len(set(matching_pairs) - set(groundtruth))
    false_negatives = len(set(groundtruth) - set(matching_pairs))
    
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives) 
    f1_score = 2 * (precision * recall) / (precision + recall)
    
    return precision, recall, f1_score

In [69]:
def check_domains_tf(colvalues1, colvalues2):
    model = PolyFuzz("TF-IDF").match(colvalues1, colvalues2)
    value_matches = model.get_matches()
    # model.visualize_precision_recall()
    # print(value_matches)
    value_matches = value_matches.dropna()
    # print(value_matches)
    n = len(colvalues1)
    # get the first elements according to the domain size of the candidate
    value_matches = value_matches.head(n)

    return value_matches['Similarity'].median()


def check_domains_edit(colvalues1, colvalues2):
    jellyfish_matcher = EditDistance(n_jobs=1, scorer=jaro_winkler_similarity)
    model = PolyFuzz(jellyfish_matcher).match(colvalues1, colvalues2)
    value_matches = model.get_matches()
    # print(value_matches)
    value_matches = value_matches.dropna()

    n = len(colvalues1)
    # get the first elements according to the domain size of the candidate
    value_matches = value_matches.head(n)

    return value_matches['Similarity'].median()


def check_domains_emb(colvalues1, colvalues2):

    bert = TransformerWordEmbeddings('bert-base-multilingual-cased')
    bert_matcher = Embeddings(bert, min_similarity=0)

    # fasttext = WordEmbeddings('en-crawl')
    # fasttext_matcher = Embeddings(fasttext, min_similarity=0)

    # matchers = [bert_matcher, fasttext_matcher]

    model = PolyFuzz(bert_matcher).match(colvalues1, colvalues2)

    value_matches = model.get_matches()

    print(value_matches)

    # value_matches = value_matches.dropna()

    # n = len(colvalues1)
    # # get the first elements according to the domain size of the candidate
    # value_matches = value_matches.head(n)

    # return value_matches['Similarity'].median()

In [70]:
scores = {}
for file in os.listdir(ground_truth_path):

    # if 'Dou' not in file:
    #     continue

    study = file.split('.csv')[0]

    groundtruth_path = os.path.join(ground_truth_path, file)
    groundtruth = ut.load_table_matching_groundtruth(groundtruth_path)
    groundtruth = [
        entry for entry in groundtruth if entry[1] in set(gdc_df.columns)]
    groundtruth = [(entry[0].lower(), entry[1].lower()) for entry in groundtruth]
    print(len(groundtruth))
    print(groundtruth)

    files = glob.glob(os.path.join(csv_data_path, f"{study}*.csv"))

    matching_pairs = []

    for i, file in enumerate(files):
        
        

        print(f"\nProcessing {file}")
        candidate_df = pd.read_csv(file)
        candidate_df.columns = candidate_df.columns.str.lower()
        candidate_df = candidate_df.select_dtypes(exclude='number')

        # Exact match based on column names
        common_cols = list(
            set(candidate_df.columns).intersection(set(gdc_df.columns)))
        
        print('\nExact column name matches between candidate table and gdc', common_cols)
        for col in common_cols:
            print('Checking similarity values for column:', col)
            candidate_col_values =  candidate_df[col].dropna().astype(str).unique()
            candidate_col_values = [c.lower() for c in candidate_col_values if c.lower() not in cat_null_values]# remove null values and make it lower case
            target_col_values = gdc_df[col].dropna().astype(str).unique()
            target_col_values = [c.lower() for c in target_col_values] # mae it lower case
            domain_similarity_avg = check_domains_tf(candidate_col_values, target_col_values)
            if domain_similarity_avg > DOMAIN_SIMILARITY_THRESHOLD:
                matching_pairs.append((col, col))

            print(f"Domain similarity for {col} is {domain_similarity_avg}")
            print('-'*60)
        print('\n')

        ## Checking similarity based on column names

        # TF-IDF      
        model = PolyFuzz("TF-IDF").match(list(candidate_df.columns), list(gdc_df.columns))
        name_matches = model.get_matches()
        # Edit Distance model
        # jellyfish_matcher = EditDistance(n_jobs=1, scorer=jaro_winkler_similarity)
        # model = PolyFuzz(jellyfish_matcher).match(list(candidate_df.columns), list(gdc_df.columns))
        # name_matches = model.get_matches()
                                          
        name_matches = name_matches[(name_matches['Similarity'] >= 0.4) & (name_matches['Similarity'] < 1.0)] ## get similar matches
        print('Similar column name matches between candidate table and gdc')
        print(name_matches) 
        print('\n')

        for match in name_matches.itertuples():
            print('Checking similarity values for column:', match[1], match[2])
            candidate_col_values =  candidate_df[match[1]].dropna().astype(str).unique()
            candidate_col_values = [c.lower() for c in candidate_col_values if c.lower() not in cat_null_values]
            
            
            target_col_values = gdc_df[match[2]].dropna().astype(str).unique()
            target_col_values = [c.lower() for c in target_col_values ]
            
            domain_similarity_avg = check_domains_tf(candidate_col_values, target_col_values)
            if domain_similarity_avg > DOMAIN_SIMILARITY_THRESHOLD:
                matching_pairs.append((match[1], match[2]))
            print(f"Domain similarity for {match[1]} and {match[2]} is {domain_similarity_avg}")
            print('-'*60) 


        # checking the remaining of the candidates
        print('\nChecking similarity values for remaining columns')
            
        candidate_cols = set(candidate_df.columns)
        remaining_cols = list(candidate_cols - set([pair[0] for pair in matching_pairs]))

        target_cols = set(gdc_df.columns)
        remaining_target_cols = list(target_cols - set([pair[1] for pair in matching_pairs]))
        
        pairwise_elements = list(zip(remaining_cols, remaining_target_cols))
        print(pairwise_elements)

        for pair in pairwise_elements:
            # print('\nChecking similarity values for column:', pair[0], pair[1])
            candidate_col_values =  candidate_df[pair[0]].dropna().astype(str).unique()
            candidate_col_values = [c.lower() for c in candidate_col_values if c.lower() not in cat_null_values]
            
            target_col_values = gdc_df[pair[1]].dropna().astype(str).unique()
            target_col_values = [c.lower() for c in target_col_values ]
            domain_similarity_avg = check_domains_tf(candidate_col_values, target_col_values)
            # domain_similarity_avg = check_domains_emb(candidate_col_values, target_col_values)
            if domain_similarity_avg > DOMAIN_SIMILARITY_THRESHOLD:
                matching_pairs.append(pair)
            print(f"Domain similarity for {pair[0]} and {pair[1]} is {domain_similarity_avg}")
            print('-'*60)
            break
        

    print(f'\nFor {study}, we have:')
    print('Groundtruth:', groundtruth)
    print('Found pairs', matching_pairs)
    precision, recall, f1_score = calculate_metrics(groundtruth, matching_pairs)
    print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1_score}")
    scores[study] = (precision, recall, f1_score)

        # ## check remaining of ground truth
        # remaining_pairs = list(set(groundtruth) - set(matching_pairs))
        # print('Remaining pairs:', remaining_pairs)

        # for pair in remaining_pairs:
        #     print('\nChecking similarity values for column:', pair[0], pair[1])
        #     candidate_col_values =  candidate_df[pair[0]].dropna().astype(str).unique()
        #     candidate_col_values = [c.lower() for c in candidate_col_values if c.lower() not in cat_null_values]
            
        #     target_col_values = gdc_df[pair[1]].dropna().astype(str).unique()
        #     target_col_values = [c.lower() for c in target_col_values ]
        #     domain_similarity_avg = check_domains_tf(candidate_col_values, target_col_values)
        #     if domain_similarity_avg > DOMAIN_SIMILARITY_THRESHOLD:
        #         matching_pairs.append(pair)
        #     print(f"Domain similarity for {pair[0]} and {pair[1]} is {domain_similarity_avg}")
        #     print('-'*60)

        # print('\nSo far, we have:')
        # print('Groundtruth:', groundtruth)
        # print('Found pairs', matching_pairs)
        # precision, recall, f1_score = calculate_metrics(groundtruth, matching_pairs)
        # print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1_score}")
print('\n')
for k,v in scores.items():
    precision, recall, f1_score = v
    print(f'{k}: Precision: {precision}, Recall: {recall}, F1 Score: {f1_score}')

3
[('tumor.stage', 'ajcc_pathologic_stage'), ('gender', 'gender'), ('ethnicity', 'ethnicity')]

Processing ../data/extracted-tables/Krug_D)_QC_mutation_calls.csv



Exact column name matches between candidate table and gdc []


Similar column name matches between candidate table and gdc
                   From          To  Similarity
3  tumor_sample_barcode  tumor_code       0.474


Checking similarity values for column: tumor_sample_barcode tumor_code
Domain similarity for tumor_sample_barcode and tumor_code is nan
------------------------------------------------------------

Checking similarity values for remaining columns
[('variant_type', 'primary_diagnosis'), ('hugo_symbol', 'tumor_focality'), ('variant_classification', 'morphology'), ('passed_filters', 'tumor_grade'), ('tumor_sample_barcode', 'ethnicity'), ('matched_norm_sample_barcode', 'ajcc_pathologic_stage')]
Domain similarity for variant_type and primary_diagnosis is 0.3665
------------------------------------------------------------

Processing ../data/extracted-tables/Krug_A)_Metadata.csv

Exact column name matches between candidate table and gdc ['ethnicity', 'gender']
Checking simi