In [43]:
import sys
import os
import pandas as pd
import sys
import glob
import statistics
import pprint
import timeit
import datetime
pp = pprint.PrettyPrinter(indent=4, sort_dicts=False)

from valentine.metrics import F1Score, PrecisionTopNPercent
from valentine import valentine_match
from valentine.algorithms import JaccardDistanceMatcher
from valentine.algorithms import Coma
from valentine.algorithms import SimilarityFlooding
from valentine.algorithms import DistributionBased
from valentine.algorithms import Cupid

sys.path.append('..')
import utils.file_utils as ut

ground_truth_path = os.path.join( '..', 'data','table-matching-ground-truth', 'ground-truth')
csv_data_path = os.path.join( '..', 'data','extracted-tables')
target_path = os.path.join( '..', 'data','target.csv')

TARGET_TABLE_NAME = 'GDC_format_variable_names' 
CANDIDATE_TABLE_NAME = 'original_paper_variable_names'

In [44]:
target_df = pd.read_csv(target_path)
# target_df.head(5)
target_df = target_df.drop('study', axis=1)
num_target_cols = len(target_df.columns)
target_df.head(5)

Unnamed: 0,case_submitter_id,age_at_diagnosis,race,ethnicity,gender,vital_status,ajcc_pathologic_t,ajcc_pathologic_n,ajcc_pathologic_stage,tumor_grade,tumor_focality,tumor_largest_dimension_diameter,primary_diagnosis,morphology,tissue_or_organ_of_origin,tumor_code
0,01BR001,20089.0,black or african american,not hispanic or latino,female,Alive,T2,N1c,Stage II,GX,Not Reported,Not Reported,Invasive carcinoma of no special type,8500/3,"Breast, NOS",BRCA
1,01BR008,17532.0,black or african american,not hispanic or latino,female,Not Reported,Not Reported,Not Reported,Not Reported,GX,Not Reported,Not Reported,Not Reported,Not Reported,"Breast, NOS",BRCA
2,01BR009,23376.0,black or african american,not hispanic or latino,female,Not Reported,Not Reported,Not Reported,Not Reported,GX,Not Reported,Not Reported,Not Reported,Not Reported,"Breast, NOS",BRCA
3,01BR010,23741.0,black or african american,not hispanic or latino,female,Not Reported,Not Reported,Not Reported,Not Reported,GX,Not Reported,Not Reported,Not Reported,Not Reported,"Breast, NOS",BRCA
4,01BR015,12784.0,white,not hispanic or latino,female,Alive,T2,N1,Stage II,GX,Not Reported,Not Reported,Invasive carcinoma of no special type,8500/3,"Breast, NOS",BRCA


In [45]:
result_file = os.path.join('..', 'results', 'table_matching_results', 'results.csv')
result_columns = ['Study', 'Method', 'F1', 'Precision', 'Recall', 'RecallAtSizeofGroundTruth', 'Runtime(s)']

if not os.path.exists(result_file):
    df = pd.DataFrame(columns=result_columns)
    df.to_csv(result_file, index=False)

In [46]:
#TODO use config files for algorithms
matcher_config_names = ['JaccardDistanceMatcher', 'Coma','SimilarityFlooding', 'DistributionBased', 'Cupid']
# matcher_config_names = ['Coma']
def get_matcher(method):
    if method == 'JaccardDistanceMatcher':
        return JaccardDistanceMatcher()
    elif method == 'Coma':
        return Coma()
    # elif method == 'ComaInstance':
    #     return Coma(use_instances=True, java_xmx="13000") #TODO they seem to have a bug once we pass the java heap size parameter (required to execute the instance-based version of coma)
    elif method == 'SimilarityFlooding':
        return SimilarityFlooding()
    elif method == 'DistributionBased':
        return DistributionBased()
    elif method == 'Cupid':
        return Cupid()
    else:
        raise ValueError('Unknown method')

In [47]:
def get_matches(study, method):
    """
    Finds the best matching sheet for a given study by comparing it with target table.
    For each study, we compute the match score for each sheet and select the one with the highest median score.

    Args:
        study (str): The name of the study.

    Returns:
        tuple: A tuple containing the best matching sheet  and the match dictionary.

    """
    files = glob.glob(os.path.join(csv_data_path, f"{study}*.csv"))
    candidate_matches_list = []
    candidate_matches_scores = []
    for i, file in enumerate(files):
        print(f"Matching {file} sheet to {target_path} using {method}")

        matcher = get_matcher(method)
        
        candidate_df = pd.read_csv(file)
        

        matches = valentine_match(candidate_df, target_df, matcher, CANDIDATE_TABLE_NAME, TARGET_TABLE_NAME)

        l = len(matches)
        print(f'Found {l} matches')

        candidate_matches_list.append(matches)
        
        score = 0
        if len(matches.values()) > 0:
            score = statistics.median(matches.values())

        candidate_matches_scores.append(score)
    
    max_index = candidate_matches_scores.index(max(candidate_matches_scores))

    best_sheet = files[max_index]
    best_match = candidate_matches_list[max_index]
    return best_sheet, best_match

In [48]:
def process_groundtruth(study, groundtruth, matches):

    print(f'Ground truth for {study}:')
    pp.pprint(groundtruth)
    print(f'Matches for {study}:')
    for k, v in matches.items():
        lhs = k[0][1]
        rhs = k[1][1]
        match = (lhs, rhs)
        print(f"{match} : {v}")
    print("\nAccording to the ground truth:")
    
    
    metrics = matches.get_metrics(groundtruth)
    print("\nThese are the scores of the default metrics for the matcher:")
    pp.pprint(metrics)

    print('---')


In [49]:
for file in os.listdir(ground_truth_path):

    # if 'Cao' not in file:
    #     continue
    
    groundtruth_path = os.path.join(ground_truth_path, file)
    groundtruth = ut.load_table_matching_groundtruth(groundtruth_path)

    study = file.split('.csv')[0]
    

    for method in matcher_config_names:
        start = timeit.default_timer()

        sheet, matches = get_matches(study, method)

        stop = timeit.default_timer()
        runtime = round(stop - start, 4) # 4 decimal places
        
        # print(f"Best sheet for {study} using {method} is {sheet}")
        metrics = matches.get_metrics(groundtruth)
        
        entry = [study, method, metrics['F1Score'], metrics['Precision'], metrics['Recall'], metrics['RecallAtSizeofGroundTruth'], runtime]
        result_df = pd.DataFrame([entry], columns=result_columns)
        result_df.to_csv(result_file, mode='a', header=False, index=False)
        


    # break

   

Matching ../data/extracted-tables/Krug_D)_QC_mutation_calls.csv sheet to ../data/target.csv using JaccardDistanceMatcher
Found 7 matches
Matching ../data/extracted-tables/Krug_A)_Metadata.csv sheet to ../data/target.csv using JaccardDistanceMatcher
Found 23 matches
Matching ../data/extracted-tables/Krug_B)_QC_RNA-seq.csv sheet to ../data/target.csv using JaccardDistanceMatcher
Found 1 matches
Matching ../data/extracted-tables/Krug_C)_QC_WXS.csv sheet to ../data/target.csv using JaccardDistanceMatcher
Found 1 matches
Matching ../data/extracted-tables/Krug_D)_QC_mutation_calls.csv sheet to ../data/target.csv using Coma
Found 5 matches
Matching ../data/extracted-tables/Krug_A)_Metadata.csv sheet to ../data/target.csv using Coma
Found 6 matches
Matching ../data/extracted-tables/Krug_B)_QC_RNA-seq.csv sheet to ../data/target.csv using Coma
Found 3 matches
Matching ../data/extracted-tables/Krug_C)_QC_WXS.csv sheet to ../data/target.csv using Coma
Found 6 matches
Matching ../data/extracted-ta