# Getting Top-K Value Matches

In [1]:
from collections import defaultdict

import bdikit as bdi
import pandas as pd

  from tqdm.autonotebook import tqdm


In this example, we map data from Dou et al. (https://pubmed.ncbi.nlm.nih.gov/37567170/) to the GDC (Genomic Data Commons) format.

In [2]:
# Load the Dou et al. table and preview the three columns used in this example.
dataset = pd.read_csv("./datasets/dou.csv")
columns = ["Race", "Ethnicity", "FIGO_stage"]

dataset[columns].head(10)

Unnamed: 0,Race,Ethnicity,FIGO_stage
0,White,Not-Hispanic or Latino,IA
1,White,Not-Hispanic or Latino,IA
2,White,Not-Hispanic or Latino,IA
3,,,
4,White,Not-Hispanic or Latino,IA
5,White,Not-Hispanic or Latino,IA
6,White,Not-Hispanic or Latino,IA
7,White,Not-Hispanic or Latino,IA
8,White,Not-Hispanic or Latino,IIIA
9,White,Not-Hispanic or Latino,IA


We can pass a single column mapping as a tuple `(source column, target column)` through the `column_mapping` parameter of `top_value_matches()`.

In [3]:
# (source column in the input table, target column in the GDC schema)
column_mapping = ("FIGO_stage", "figo_stage")

# Request at most five candidate target values per source value,
# using the "tfidf" matching method.
value_mappings = bdi.top_value_matches(
    dataset,
    column_mapping=column_mapping,
    target="gdc",
    top_k=5,
    method="tfidf",
)

In [4]:
# Header line: "<source column> => <target column>".
print(f"{column_mapping[0]} => {column_mapping[1]}")

# Render each candidate table in turn, followed by a blank line as a separator.
for candidates in value_mappings:
    display(candidates)
    print("")

FIGO_stage => figo_stage


Unnamed: 0,source,target,similarity
1,IA,Stage IA,0.586
0,IA,Stage IIA,0.563
3,IA,Stage IIIA,0.527
2,IA,Stage IIIAi,0.467
4,IA,Stage IIIA1,0.432





Unnamed: 0,source,target,similarity
1,IB,Stage IB,0.649
3,IB,Stage IIB,0.571
2,IB,Stage IIIB,0.528
0,IB,Stage IB1,0.441
4,IB,Stage IB2,0.441





Unnamed: 0,source,target,similarity
0,II,Stage III,0.687
1,II,Stage IIIAii,0.635
3,II,Stage IIIA,0.598
2,II,Stage IIIC,0.58
4,II,Stage IIIAi,0.566





Unnamed: 0,source,target,similarity
0,IIIA,Stage IIIA,0.822
2,IIIA,Stage IIIAii,0.726
3,IIIA,Stage IIIAi,0.716
1,IIIA,Stage IIIA2,0.674
4,IIIA,Stage IIIA1,0.674





Unnamed: 0,source,target,similarity
0,IIIB,Stage IIIB,0.849
1,IIIB,Stage IIB,0.728
3,IIIB,Stage III,0.545
2,IIIB,Stage IIIA,0.475
4,IIIB,Stage IIIAii,0.471





Unnamed: 0,source,target,similarity
0,IIIC1,Stage IIIC1,0.889
1,IIIC1,Stage IC1,0.651
3,IIIC1,Stage IIIC,0.647
2,IIIC1,Stage IIC,0.538
4,IIIC1,Stage IIIC2,0.536





Unnamed: 0,source,target,similarity
0,IIIC2,Stage IIIC2,0.889
1,IIIC2,Stage IC2,0.651
3,IIIC2,Stage IIIC,0.647
2,IIIC2,Stage IIC,0.538
4,IIIC2,Stage IIIC1,0.536





Unnamed: 0,source,target,similarity
0,IVB,Stage IVB,0.854
1,IVB,Stage IV,0.448
2,IVB,Stage IVA,0.325





Unnamed: 0,source,target,similarity
0,,Unknown,0.35





We can also pass a `DataFrame` of column mappings (for example, the output of `match_schema()`) as the `column_mapping` parameter of `top_value_matches()`:

In [5]:
# Match only the selected source columns against the GDC schema
# using the "coma" method, then show the resulting mapping table.
column_mappings = bdi.match_schema(
    dataset[columns],
    target="gdc",
    method="coma",
)
column_mappings

Unnamed: 0,source,target
0,FIGO_stage,figo_stage
1,Ethnicity,ethnicity
2,Race,race


In [6]:
# Same call as for the single tuple, but driven by the DataFrame of
# column mappings so every mapped column is processed in one pass.
value_mappings = bdi.top_value_matches(
    dataset,
    column_mapping=column_mappings,
    target="gdc",
    top_k=5,
    method="tfidf",
)

In [7]:
# Group the DataFrames by their metadata: each one records the column pair it
# belongs to under the 'source' and 'target' keys of its `attrs`.
# `defaultdict` comes from the notebook's import cell, so all dependencies are
# visible in one place.
grouped_dfs = defaultdict(list)
for value_mapping in value_mappings:
    column_pair = (value_mapping.attrs['source'], value_mapping.attrs['target'])
    grouped_dfs[column_pair].append(value_mapping)

# Display grouped DataFrames: one "<source> => <target>" header per column
# pair, followed by every table collected for that pair.
for (source_col, target_col), mappings in grouped_dfs.items():
    print(f"{source_col} => {target_col}")
    for mapping in mappings:
        display(mapping)
    print("")

FIGO_stage => figo_stage


Unnamed: 0,source,target,similarity
1,IA,Stage IA,0.586
0,IA,Stage IIA,0.563
3,IA,Stage IIIA,0.527
2,IA,Stage IIIAi,0.467
4,IA,Stage IIIA1,0.432


Unnamed: 0,source,target,similarity
1,IB,Stage IB,0.649
3,IB,Stage IIB,0.571
2,IB,Stage IIIB,0.528
0,IB,Stage IB1,0.441
4,IB,Stage IB2,0.441


Unnamed: 0,source,target,similarity
0,II,Stage III,0.687
1,II,Stage IIIAii,0.635
3,II,Stage IIIA,0.598
2,II,Stage IIIC,0.58
4,II,Stage IIIAi,0.566


Unnamed: 0,source,target,similarity
0,IIIA,Stage IIIA,0.822
2,IIIA,Stage IIIAii,0.726
3,IIIA,Stage IIIAi,0.716
1,IIIA,Stage IIIA2,0.674
4,IIIA,Stage IIIA1,0.674


Unnamed: 0,source,target,similarity
0,IIIB,Stage IIIB,0.849
1,IIIB,Stage IIB,0.728
3,IIIB,Stage III,0.545
2,IIIB,Stage IIIA,0.475
4,IIIB,Stage IIIAii,0.471


Unnamed: 0,source,target,similarity
0,IIIC1,Stage IIIC1,0.889
1,IIIC1,Stage IC1,0.651
3,IIIC1,Stage IIIC,0.647
2,IIIC1,Stage IIC,0.538
4,IIIC1,Stage IIIC2,0.536


Unnamed: 0,source,target,similarity
0,IIIC2,Stage IIIC2,0.889
1,IIIC2,Stage IC2,0.651
3,IIIC2,Stage IIIC,0.647
2,IIIC2,Stage IIC,0.538
4,IIIC2,Stage IIIC1,0.536


Unnamed: 0,source,target,similarity
0,IVB,Stage IVB,0.854
1,IVB,Stage IV,0.448
2,IVB,Stage IVA,0.325


Unnamed: 0,source,target,similarity
0,,Unknown,0.35



Ethnicity => ethnicity


Unnamed: 0,source,target,similarity
0,Hispanic or Latino,hispanic or latino,1.0
1,Hispanic or Latino,not hispanic or latino,0.956


Unnamed: 0,source,target,similarity
0,Not reported,,


Unnamed: 0,source,target,similarity
0,Not-Hispanic or Latino,not hispanic or latino,0.935
1,Not-Hispanic or Latino,hispanic or latino,0.894


Unnamed: 0,source,target,similarity
0,,,



Race => race


Unnamed: 0,source,target,similarity
0,White,white,1.0


Unnamed: 0,source,target,similarity
0,Asian,asian,1.0
1,Asian,american indian or alaska native,0.438
2,Asian,native hawaiian or other pacific islander,0.329


Unnamed: 0,source,target,similarity
0,Black or African American,black or african american,1.0
1,Black or African American,american indian or alaska native,0.605
2,Black or African American,native hawaiian or other pacific islander,0.399


Unnamed: 0,source,target,similarity
0,Not Reported,not reported,1.0


Unnamed: 0,source,target,similarity
0,,american indian or alaska native,0.359



