## Create map for ground truth data

In [2]:
from tqdm import tqdm
import pandas as pd
import os

In [3]:
gt_path = "../../data/table-matching-ground-truth/ground-truth"
gt_map = pd.DataFrame(columns=['table_name', 'l_column_id', 'r_column_id'])
rows = []

# Collect one record per row of every ground-truth CSV; the file stem is used as the table name.
# sorted() makes the record order reproducible, since os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(gt_path)):
    # Skip anything that is not a CSV (checkpoint folders, OS metadata files, ...):
    # pd.read_csv would fail on those entries.
    if not filename.endswith(".csv"):
        continue
    table_name = os.path.splitext(filename)[0]
    table = pd.read_csv(os.path.join(gt_path, filename))
    # Only two columns are needed, so walk them in parallel instead of
    # materialising a Series per row with iterrows().
    for l_column_id, r_column_id in zip(table['original_paper_variable_names'],
                                        table['GDC_format_variable_names']):
        rows.append({'table_name': table_name, 'l_column_id': l_column_id, 'r_column_id': r_column_id})

In [4]:
# Build the map directly from the collected records instead of concatenating onto the
# existing gt_map: this keeps the cell idempotent (re-running it no longer duplicates
# every row) and avoids concatenating with an empty frame, which newer pandas deprecates.
# Passing `columns` keeps the three expected columns even when `rows` is empty.
gt_map = pd.DataFrame(rows, columns=['table_name', 'l_column_id', 'r_column_id'])
# gt_map.to_csv("../data/test.csv", index=False)

## Create annotated ground-truth tables

In [5]:
input_dir = "../../data/extracted-tables"
output_dir = "../data/annotated-tables"

# (table_name, column_name) pairs that appear in the ground-truth map. Building the set once
# replaces a full boolean scan of gt_map for every column of every file.
annotated_columns = set(zip(gt_map['table_name'], gt_map['l_column_id']))

# table name -> {column name -> Series}. Columns are gathered first and each DataFrame is
# built once at the end: assigning column by column into an initially empty frame aligns
# every later file to the index of the first one, silently dropping rows of longer files.
table_columns = {}

for filename in tqdm(os.listdir(input_dir), desc="Annotating columns"):
    if not filename.endswith(".csv"):
        continue
    file_path = os.path.join(input_dir, filename)
    df = pd.read_csv(file_path)
    # Files are grouped by the part of the file name before the first underscore.
    table_name = filename.split("_")[0]
    kept_columns = table_columns.setdefault(table_name, {})
    for col_name in df.columns:
        if (table_name, col_name) in annotated_columns:
            # A column name seen again in a later file of the same table overwrites the earlier one.
            kept_columns[col_name] = df[col_name]

# create a dictionary of pandas dataframes (one per table; a dict of Series is aligned on
# the union of their indexes, so no rows are lost when files differ in length)
tables = {name: pd.DataFrame(columns) for name, columns in table_columns.items()}

# store each dataframe in tables dictionary as a csv file
# The output directory may not exist yet on a fresh checkout; to_csv does not create it.
os.makedirs(output_dir, exist_ok=True)
for table_name, table in tqdm(tables.items(), desc="Saving annotated tables"):
    table.to_csv(os.path.join(output_dir, table_name + ".csv"), index=False)

Annotating columns: 100%|██████████| 32/32 [00:00<00:00, 85.26it/s]
Saving annotated tables: 100%|██████████| 10/10 [00:00<00:00, 753.46it/s]


## Modify map for synthetic data

In [7]:
# The synthetic training map is read from and written back to the same CSV file.
train_csv_path = "../data/train.csv"

# Constant table identifiers added to every row: the left side is always "gdc_table",
# the right side is always "gdc_table_synthetic".
table_ids = {"l_table_id": "gdc_table", "r_table_id": "gdc_table_synthetic"}

synthetic_table = pd.read_csv(train_csv_path).assign(**table_ids)
synthetic_table.to_csv(train_csv_path, index=False)