# Benchmark Overview

This notebook generates an overview of the benchmark: the table combinations (a.k.a. the datasets), the number of attributes and attribute pairs, and the number of true semantic matches found in the ground truth.

In [None]:
import functools
import os
from typing import Dict, List, Optional

import pandas as pd

from fm_matcher.utils.models import Attribute, Parameters, Relation

In [None]:
# Load the ground-truth matches. Each "source"/"target" value is a dotted
# identifier that is split into three parts (schema, relation, attribute).
benchmark = pd.read_csv("benchmark/ground_truth.csv")
for side in ("source", "target"):
    part_columns = [f"{side}_{part}" for part in ("schema", "relation", "attribute")]
    # The parts are extracted before lowercasing, so they keep the CSV's casing;
    # only the full dotted identifier is normalised to lowercase.
    benchmark[part_columns] = benchmark[side].str.split(".", expand=True)
    benchmark[side] = benchmark[side].str.lower()
# Flag column marking every row as a ground-truth match.
benchmark["benchmark"] = True

# Distinct (source relation, target relation) pairs, as a 2-column array.
relation_combinations = (
    benchmark[["source_relation", "target_relation"]]
    .drop_duplicates()
    .values
)

In [None]:
@functools.cache
def get_description(schema: str, table: str, attribute: Optional[str] = None) -> str:
    """Return the documentation text of a table or of one of its attributes.

    The text is read from the ``schema_documentations`` directory (relative to
    the current working directory). Attribute descriptions are looked up in
    ``<schema>_<table>_<attribute>.txt`` and table descriptions in
    ``<schema>_table_<table>.txt``. File names are compared case-insensitively.
    Results are memoised per argument combination.

    Args:
        schema: Schema name used as the file-name prefix.
        table: Table name.
        attribute: Attribute name; when omitted (or empty) the table-level
            description is returned instead.

    Returns:
        The full content of the matching documentation file.

    Raises:
        FileNotFoundError: If the directory does not exist or contains no file
            matching the expected name.
    """
    directory = "schema_documentations"
    if attribute:
        expected = f"{schema}_{table}_{attribute}.txt"
    else:
        expected = f"{schema}_table_{table}.txt"
    wanted = expected.lower()

    # Pick the first case-insensitive match; use an explicit default so that a
    # missing file does not leak a bare StopIteration to the caller.
    match = next(
        (name for name in os.listdir(directory) if name.lower() == wanted),
        None,
    )
    if match is None:
        raise FileNotFoundError(
            f"No documentation file matching '{expected}' in '{directory}'"
        )

    with open(os.path.join(directory, match), "r") as desc_file:
        return desc_file.read()


@functools.cache
def get_attributes(schema: str, table: str) -> List[str]:
    """List the attribute names documented for a table.

    Every file in ``schema_documentations/`` whose name matches
    ``<schema>_<table>_<attribute>.txt`` (case-insensitively) contributes one
    attribute name, taken from the file name with its original casing.

    The names are returned sorted, because ``os.listdir`` yields entries in
    arbitrary order and the result should be reproducible across runs.
    The result is memoised; callers should not mutate the returned list.

    NOTE: matching is by file-name prefix only, so a table whose name followed
    by ``_`` is a prefix of another table's name would also pick up that other
    table's attribute files.

    Args:
        schema: Schema name used as the file-name prefix.
        table: Table name.

    Returns:
        Sorted list of attribute names (empty if no file matches).
    """
    # Lowercase the prefix as well, so mixed-case arguments match the
    # lowercased file names instead of silently yielding an empty list.
    prefix = f"{schema}_{table}_".lower()
    suffix = ".txt"
    return sorted(
        file_name[len(prefix):-len(suffix)]
        for file_name in os.listdir("schema_documentations/")
        if file_name.lower().startswith(prefix) and file_name.lower().endswith(suffix)
    )


@functools.cache
def get_relation(schema: str, table: str) -> Relation:
    """Build the ``Relation`` for a table from its documentation files.

    Schema and table names are lowercased before lookup. The relation and each
    of its attributes are named with ``str.capitalize()`` and carry the
    description text returned by ``get_description``. Results are memoised
    per (schema, table) argument pair.
    """
    schema = schema.lower()
    table = table.lower()

    # Fetch the table-level description first, then one Attribute per
    # documented attribute of the table.
    table_description = get_description(schema, table)
    relation_attributes = []
    for attr_name in get_attributes(schema, table):
        attr_description = get_description(schema, table, attr_name)
        relation_attributes.append(
            Attribute(name=attr_name.capitalize(), description=attr_description)
        )

    return Relation(
        name=table.capitalize(),
        description=table_description,
        attributes=relation_attributes,
    )


# One Parameters object per benchmark table combination: the source relation
# is looked up in the "mimic" schema and the target relation in "omop".
parameters = []
for source, target in relation_combinations:
    parameters.append(
        Parameters(
            source_relation=get_relation("mimic", source),
            target_relation=get_relation("omop", target),
        )
    )

In [None]:
def abbreviate(source: str, target: str) -> str:
    """Return the short dataset code for a source/target relation pair.

    The lookup is case-insensitive on both names. A pair that is not one of
    the known benchmark combinations raises ``KeyError``.
    """
    codes = {
        ("patients", "person"): "PaPe",
        ("admissions", "visit_occurrence"): "AdVO",
        ("prescriptions", "drug_exposure"): "PrDE",
        ("admissions", "condition_occurrence"): "AdCO",
        ("diagnoses_icd", "condition_occurrence"): "DiCO",
        ("labevents", "measurement"): "LaMe",
        ("admissions", "visit_detail"): "AdVD",
        ("services", "visit_detail"): "SeVD",
        ("transfers", "visit_detail"): "TrVD",
    }
    pair = (source.lower(), target.lower())
    return codes[pair]


# Selects the ground-truth rows whose source/target relation names equal
# (case-insensitively) those of the loop variable `param` below.
match_query = (
    "(source_relation.str.lower() == @param.source_relation.name.lower()) and "
    "(target_relation.str.lower() == @param.target_relation.name.lower()) and "
    "benchmark"
)

# One summary row per table combination: attribute counts on both sides, the
# size of their cross product, and the number of ground-truth matches.
dataset_table = []
for param in parameters:
    src, tgt = param.source_relation, param.target_relation
    n_src, n_tgt = len(src.attributes), len(tgt.attributes)
    row = {
        "dataset": abbreviate(src.name, tgt.name),
        "source": src.name,
        "source_attributes": n_src,
        "target": tgt.name,
        "target_attributes": n_tgt,
        "attribute_pairs": n_src * n_tgt,
        "matches": len(benchmark.query(match_query)),
    }
    dataset_table.append(row)

dataset_df = pd.DataFrame(dataset_table).set_index("dataset")
# Last expression of the cell: shows the table ordered by dataset code.
dataset_df.sort_index()