In [None]:
import functools
import itertools
import os
from typing import Dict, List, Optional, Set

import pandas as pd
from sklearn.metrics import auc, precision_recall_curve
import textdistance

from fm_matcher.utils.models import Attribute, Parameters, Relation

In [None]:
# Load the ground-truth correspondences. The "source" and "target" columns hold
# dot-separated identifiers that are split into schema / relation / attribute.
benchmark = pd.read_csv("benchmark/ground_truth.csv")
for side in ("source", "target"):
    split_columns = [f"{side}_{part}" for part in ("schema", "relation", "attribute")]
    # Split first (keeps the original casing in the new columns), then
    # lower-case the full identifier that is later used as the merge key.
    benchmark[split_columns] = benchmark[side].str.split(".", expand=True)
    benchmark[side] = benchmark[side].str.lower()

# Marker column: every row that comes from the ground truth is a true match.
benchmark["benchmark"] = True

# Unique (source relation, target relation) pairs that occur in the ground truth.
relation_pairs = benchmark[["source_relation", "target_relation"]].drop_duplicates()
relation_combinations = relation_pairs.to_numpy()

In [None]:
@functools.cache
def get_description(schema: str, table: str, attribute: Optional[str] = None) -> str:
    """Read the documentation text of a table or of one of its attributes.

    The text is looked up in the ``schema_documentations`` directory. The file
    is ``{schema}_{table}_{attribute}.txt`` when ``attribute`` is given (and
    non-empty) and ``{schema}_table_{table}.txt`` otherwise. File names are
    matched case-insensitively.

    Results are memoized per argument tuple by ``functools.cache``, so changes
    to the files on disk are not picked up after the first call.

    Args:
        schema: Schema part of the file name.
        table: Table part of the file name.
        attribute: Attribute part of the file name; ``None`` (or an empty
            string) selects the table-level documentation.

    Returns:
        The full content of the matching documentation file.

    Raises:
        FileNotFoundError: If no file in ``schema_documentations`` matches.
    """
    doc_dir = "schema_documentations"
    if attribute:
        wanted = f"{schema}_{table}_{attribute}.txt"
    else:
        wanted = f"{schema}_table_{table}.txt"
    wanted_lower = wanted.lower()

    # Case-insensitive lookup; the first matching directory entry wins.
    matched = next(
        (entry for entry in os.listdir(doc_dir) if entry.lower() == wanted_lower),
        None,
    )
    if matched is None:
        # A bare StopIteration from next() would be hard to diagnose.
        raise FileNotFoundError(
            f"No documentation file matching {wanted!r} (case-insensitive) in {doc_dir!r}"
        )

    with open(os.path.join(doc_dir, matched), "r") as desc_file:
        return desc_file.read()


@functools.cache
def get_attributes(schema: str, table: str) -> List[str]:
    """List the attribute names documented for a table.

    Attribute names are derived from the files in ``schema_documentations/``
    named ``{schema}_{table}_{attribute}.txt``. The prefix and the ``.txt``
    extension are matched case-insensitively; the attribute part is returned
    with the casing used in the file name.

    Results are memoized per argument tuple by ``functools.cache``; the same
    list object is returned on every call, so callers should not mutate it.

    Args:
        schema: Schema part of the file name prefix.
        table: Table part of the file name prefix.

    Returns:
        The attribute names in sorted order (empty if nothing matches).
    """
    prefix = f"{schema}_{table}_".lower()
    suffix = ".txt"
    # NOTE(review): matching is by prefix only, so a table whose name is a
    # prefix of another table's name (hypothetically `visit` vs `visit_detail`)
    # would also pick up the longer table's files - verify against the
    # contents of the documentation directory.
    return sorted(
        # Sorted because os.listdir returns entries in arbitrary order.
        doc_file[len(prefix):-len(suffix)]
        for doc_file in os.listdir("schema_documentations/")
        if doc_file.lower().startswith(prefix) and doc_file.lower().endswith(suffix)
    )


@functools.cache
def get_relation(schema: str, table: str) -> Relation:
    """Assemble a ``Relation`` for a table from its documentation files.

    ``schema`` and ``table`` are lower-cased before any lookup. The relation
    and every attribute get their description from ``get_description``; the
    attribute list comes from ``get_attributes``. Relation and attribute names
    are passed through ``str.capitalize()``.

    Results are memoized per (original, not lower-cased) argument tuple by
    ``functools.cache``.

    Args:
        schema: Schema the table belongs to.
        table: Name of the table.

    Returns:
        A ``Relation`` with name, description and one ``Attribute`` per
        documented attribute.
    """
    schema = schema.lower()
    table = table.lower()

    # Table-level description is read first, then one description per attribute.
    relation_description = get_description(schema, table)

    attributes = []
    for attr_name in get_attributes(schema, table):
        attr_description = get_description(schema, table, attr_name)
        attributes.append(
            Attribute(name=attr_name.capitalize(), description=attr_description)
        )

    return Relation(
        name=table.capitalize(),
        description=relation_description,
        attributes=attributes,
    )


# One Parameters object per (source relation, target relation) pair found in
# the ground truth; sources are looked up under "mimic", targets under "omop".
parameters = []
for source_table, target_table in relation_combinations:
    parameters.append(
        Parameters(
            source_relation=get_relation("mimic", source_table),
            target_relation=get_relation("omop", target_table),
        )
    )

In [None]:
def get_ngrams(s: str, n: int = 3) -> Set[str]:
    """Return the set of character n-grams of ``s`` after padding.

    The string is padded with ``n - 1`` leading ``#`` and ``n - 1`` trailing
    ``%`` characters before the n-grams are extracted, following
    Sun et al. (doi:10.12733/jics20105420).

    Args:
        s: The string to decompose.
        n: Length of each n-gram.

    Returns:
        The distinct substrings of length ``n`` of the padded string.
    """
    pad = n - 1
    padded = "#" * pad + s + "%" * pad

    ngrams = set()
    for start in range(len(padded) - n + 1):
        ngrams.add(padded[start:start + n])
    return ngrams


# Score every (source attribute, target attribute) pair of each relation
# combination with four string-similarity baselines computed on the
# lower-cased attribute names.
baseline_data = []
for param in parameters:
    for source, target in itertools.product(param.source_relation.attributes, param.target_relation.attributes):
        source_attr = source.name.lower()
        target_attr = target.name.lower()
        baseline_data.append({
            # Fully qualified, lower-cased identifiers: merge keys against the benchmark.
            "source": f"mimic.{param.source_relation.name}.{source.name}".lower(),
            "source_relation": param.source_relation.name,
            "source_attribute": source_attr,
            "target": f"omop.{param.target_relation.name}.{target.name}".lower(),
            "target_relation": param.target_relation.name,
            "target_attribute": target_attr,
            "jaro-winkler": textdistance.jaro_winkler.normalized_similarity(source_attr, target_attr),
            "levenshtein": textdistance.levenshtein.normalized_similarity(source_attr, target_attr),
            # NOTE(review): monge_elkan receives the raw strings here, not token
            # lists - confirm that a character-level comparison is intended.
            "monge-elkan": textdistance.monge_elkan.normalized_similarity(source_attr, target_attr),
            # Sorensen similarity over the padded character trigram sets.
            "n-gram": textdistance.sorensen.normalized_similarity(get_ngrams(source_attr, 3), get_ngrams(target_attr, 3)),
        })

baseline_df = pd.DataFrame(baseline_data)

# De-duplicate the ground truth first: a pair listed twice would otherwise
# duplicate the candidate row in the left merge and be counted twice.
ground_truth_pairs = benchmark[["source", "target", "benchmark"]].drop_duplicates()
baseline_df = baseline_df.merge(ground_truth_pairs, on=["source", "target"], how="left")

# Pairs absent from the ground truth come back as NaN. Comparing with True
# yields a real boolean column without relying on fillna() silently
# downcasting an object column (deprecated pandas behavior).
baseline_df["benchmark"] = baseline_df["benchmark"].eq(True)

# Make sure the output directory exists so the export works on a fresh checkout.
os.makedirs("results", exist_ok=True)
baseline_df.to_csv("results/baseline_results.csv", index=False)
baseline_df

In [None]:
import plotly.graph_objects as go

# Precision-recall comparison of the string-similarity baselines: one curve
# per metric, with the area under the curve shown in the legend entry.
pr_layout = dict(
    width=1000,
    height=600,
    title="Precision-Recall curve of different string similarity metrics.",
    yaxis=dict(title="precision"),
    xaxis=dict(title="recall"),
)
fig = go.Figure(layout=pr_layout)

for metric in ("jaro-winkler", "levenshtein", "monge-elkan", "n-gram"):
    # The similarity score is the ranking score; rows flagged True in the
    # "benchmark" column are the positives.
    precision, recall, thresholds = precision_recall_curve(
        baseline_df["benchmark"], baseline_df[metric], pos_label=True
    )
    curve_label = f"{metric} (AUC: {auc(recall, precision):.2f})"
    fig.add_trace(go.Scatter(x=recall, y=precision, name=curve_label))

fig.show()