In [1]:
import bdikit as bdi
import pandas as pd
from IPython.display import display, Markdown
# Lift pandas' row-display limit so displayed frames are not truncated.
pd.options.display.max_rows = None

In [2]:
# Load the three CSV inputs in one pass: the discovery table (source), the
# confirmatory table (target), and the ground-truth source->target mapping.
df_source, df_target, df_gt = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./datasets/Dou-ucec-discovery.csv",
        "./datasets/Dou-ucec-confirmatory.csv",
        "./datasets/Dou-ucec-ground-truth.csv",
    )
)

In [3]:
# Keep only ground-truth rows that actually name a target column.
df_ground_truth = df_gt.loc[df_gt['target'].notna()]
# Lookup table: source column name -> ground-truth target string.
gt_set = dict(zip(df_ground_truth['source'], df_ground_truth['target']))

In [4]:
def accuracy(gt_set, schema_mapping):
    """Return the fraction of evaluated source columns mapped to a correct target.

    Only source columns that appear as keys in ``gt_set`` are evaluated; all
    other rows of ``schema_mapping`` are ignored.

    Args:
        gt_set: dict mapping a source column name to a string holding one or
            more acceptable target column names separated by commas.
        schema_mapping: DataFrame with ``source`` and ``target`` columns. If a
            source appears more than once, the last row wins (dict semantics).

    Returns:
        ``correct / evaluated`` as a float, or ``0.0`` when no mapped source
        column is present in ``gt_set`` (previously a ZeroDivisionError).

    Side effects:
        Prints one ``ER:`` line for every evaluated column whose predicted
        target is not among the acceptable targets.
    """
    schema_dict = schema_mapping.set_index('source')['target'].to_dict()
    correct_count = 0
    total = 0
    for source_column, target_column in schema_dict.items():
        if source_column not in gt_set:
            continue
        total += 1
        # A ground-truth entry may list several acceptable targets.
        correct_target_columns = set(gt_set[source_column].split(","))
        if target_column in correct_target_columns:
            correct_count += 1
        else:
            print(f"ER: {source_column} -> out={target_column} gt={correct_target_columns}")
    if total == 0:
        # Nothing to evaluate: report zero rather than dividing by zero.
        return 0.0
    return correct_count / float(total)

In [5]:
# Remove every space character from the source column names (literal match).
df_source.columns = df_source.columns.str.replace(' ', '', regex=False)

In [6]:
# Match source columns to target columns with the 'ct_learning' method.
schema_mapping = bdi.match_schema(
    df_source,
    df_target,
    method='ct_learning',
)
# Score the mapping against the ground truth; wrong matches are printed as "ER:" lines.
print(f"Recall: {100*accuracy(gt_set, schema_mapping):.3f}")
# Display the resulting source -> target table.
schema_mapping

Extracting features from 179 columns...


  0%|          | 0/179 [00:00<?, ?it/s]

Extracting features from 213 columns...


  0%|          | 0/213 [00:00<?, ?it/s]

ER: Proteomics_Participant_ID -> out=Idx gt={'Case_id'}
ER: Proteomics_TMT_batch -> out=Metformin_treatment gt={'Batch'}
ER: Proteomics_TMT_plex -> out=Number_of_para-aortic_lymph_nodes_positive_for_tumor_by_he gt={'Plex'}
ER: Proteomics_TMT_channel -> out=Number_of_para-aortic_lymph_nodes_positive_for_tumor_by_he gt={'ReporterName'}
ER: Treatment_naive -> out=Follow-up_additional_treatment_radiation_therapy_for_new_tumor gt={'Cancer_history_history_of_any_treatment'}
ER: Tumor_purity -> out=Tumor_necrosis gt={'ABSOLUTE_tumor_purity'}
ER: MSH6 -> out=Ancillary_studies_msh2 gt={'Ancillary_studies_msh6'}
ER: CIBERSORT_Monocytes -> out=Cibersort_Macrophage_M2 gt={'Cibersort_Monocyte'}
ER: CIBERSORT_Eosinophils -> out=xCell_Cancer_associated_fibroblast gt={'Cibersort_Eosinophil'}
ER: CIBERSORT_Neutrophils -> out=CNV_ratio gt={'Cibersort_Neutrophil'}
ER: ESTIMATE_ImmuneScore -> out=Estimate_ESTIMATEScore gt={'Estimate_ImmuneScore'}
Recall: 75.556


Unnamed: 0,source,target
0,idx,xCell_T_cell_CD4+_Th1
1,Proteomics_Participant_ID,Idx
2,Case_excluded,Case_excluded
3,Proteomics_TMT_batch,Metformin_treatment
4,Proteomics_TMT_plex,Number_of_para-aortic_lymph_nodes_positive_for...
5,Proteomics_TMT_channel,Number_of_para-aortic_lymph_nodes_positive_for...
6,Proteomics_Parent_Sample_IDs,Idx
7,Proteomics_Aliquot_ID,Aliquot_ID
8,Proteomics_Tumor_Normal,Group
9,Proteomics_OCT,POLE


In [6]:
# Match source columns to target columns with the 'max_val_sim' method.
schema_mapping = bdi.match_schema(
    df_source,
    df_target,
    method='max_val_sim',
)
# Score the mapping against the ground truth; wrong matches are printed as "ER:" lines.
print(f"Recall: {100*accuracy(gt_set, schema_mapping):.3f}")
# Display the resulting source -> target table.
schema_mapping

Extracting features from 179 columns...


  0%|          | 0/179 [00:00<?, ?it/s]

Extracting features from 213 columns...


  0%|          | 0/213 [00:00<?, ?it/s]

ER: Proteomics_TMT_batch -> out=Metformin_treatment gt={'Batch'}
ER: Proteomics_TMT_plex -> out=Number_of_para-aortic_lymph_nodes_positive_for_tumor_by_he gt={'Plex'}
ER: Treatment_naive -> out=Follow-up_additional_treatment_radiation_therapy_for_new_tumor gt={'Cancer_history_history_of_any_treatment'}
ER: Tumor_purity -> out=Tumor_necrosis gt={'ABSOLUTE_tumor_purity'}
ER: MSH6 -> out=Ancillary_studies_msh2 gt={'Ancillary_studies_msh6'}
ER: CIBERSORT_Monocytes -> out=Cibersort_Macrophage_M2 gt={'Cibersort_Monocyte'}
ER: CIBERSORT_Eosinophils -> out=xCell_Cancer_associated_fibroblast gt={'Cibersort_Eosinophil'}
ER: CIBERSORT_Neutrophils -> out=CNV_ratio gt={'Cibersort_Neutrophil'}
ER: ESTIMATE_ImmuneScore -> out=Estimate_ESTIMATEScore gt={'Estimate_ImmuneScore'}
Recall: 80.000


Unnamed: 0,source,target
0,idx,Number_of_para-aortic_lymph_nodes_examined
1,Proteomics_Participant_ID,Case_id
2,Case_excluded,Case_excluded
3,Proteomics_TMT_batch,Metformin_treatment
4,Proteomics_TMT_plex,Number_of_para-aortic_lymph_nodes_positive_for...
5,Proteomics_TMT_channel,ReporterName
6,Proteomics_Parent_Sample_IDs,Case_id
7,Proteomics_Aliquot_ID,Aliquot_ID
8,Proteomics_Tumor_Normal,Group
9,Proteomics_OCT,POLE


In [6]:
# List the five highest-similarity target candidates for one source column.
bdi.top_matches(
    df_source,
    columns=["Proteomics_Participant_ID"],
    target=df_target,
    top_k=5,
)

Extracting features from 1 columns...


  0%|          | 0/1 [00:00<?, ?it/s]

Extracting features from 213 columns...


  0%|          | 0/213 [00:00<?, ?it/s]

Unnamed: 0,source,target,similarity
0,Proteomics_Participant_ID,Idx,0.173261
1,Proteomics_Participant_ID,Case_id,0.158389
2,Proteomics_Participant_ID,Aliquot_ID,0.123715
3,Proteomics_Participant_ID,xCell_T_cell_CD8+_central_memory,0.103956
4,Proteomics_Participant_ID,Progeny_p53,0.102078


In [25]:
# Show the first five rows of the domain preview for a target column.
(
    bdi.preview_domain(df_target, "Cibersort_T_cell_CD8+")
    .head(5)
)

Unnamed: 0,value_name
0,0.201397
1,0.0596
2,0.0324
3,0.0555
4,0.107774


In [11]:
# Show the first five rows of the domain preview for the source TP53 column.
(
    bdi.preview_domain(df_source, "TP53_TP53")
    .head(5)
)

Unnamed: 0,value_name
0,1.0
1,0.0
2,


In [10]:
# Show the first five rows of the domain preview for the target TP53 column.
(
    bdi.preview_domain(df_target, "TP53")
    .head(5)
)

Unnamed: 0,value_name
0,WT
1,Mutated
2,


In [9]:
# Match values of source column "TP53_TP53" to target column "TP53" using the 'tfidf' method.
# NOTE(review): the recorded output of this cell is an empty table; the previews in this
# notebook show 1.0/0.0 on the source side and WT/Mutated on the target side — confirm
# whether 'tfidf' is a suitable method for this column pair.
bdi.match_values(
    df_source,
    df_target,
    ("TP53_TP53", "TP53"),
    method='tfidf',
)

Unnamed: 0,source,target,similarity
