## Index tables with PyTerrier

In [1]:
import pyterrier as pt
from LLmsfJiT import connect, config, get_tables_from_qrels, read_jsonl, Table
import itertools
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Start PyTerrier only if it is not already running, so this cell can be re-run
# in a live kernel without calling init() a second time.
if not pt.started():
    pt.init()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [3]:
# Load settings from ../database.ini and open the database connection described by
# its "postgres" entry. `connect` returns a (connection, cursor) pair; both are
# passed to get_tables_from_qrels in a later cell.
conf = config("../database.ini")
conn, cur = connect(conf["postgres"])

Connected to postgres


## Generate a larger subset of wtr files

In [4]:
# Build one Table per record of ../web_tables.json. The keys below are looked up
# on every record and handed to Table positionally, in exactly this order.
tables = [
    Table(
        *[
            record[key]
            for key in (
                "relation",
                "json_loc",
                "pageTitle",
                "title",
                "tableType",
                "hasHeader",
                "textBeforeTable",
                "textAfterTable",
                "entities",
                "headerPosition",
                "headerRowIndex",
                "tableOrientation",
                "url",
                "hasKeyColumn",
                "keyColumnIndex",
            )
        ]
    )
    for record in read_jsonl("../web_tables.json")
]

In [4]:
# Presumably loads the tables referenced by the qrels file through the open DB
# connection; each returned item exposes the table itself as `.table`.
# NOTE(review): this rebinds `tables`, so the list built from ../web_tables.json in
# the previous cell is discarded when both cells are run — confirm which source is
# intended for the indexes below.
rated_tables = get_tables_from_qrels(conn, cur, "../rel_files/rel_table_qrels.txt")
tables = [rt.table for rt in rated_tables]

In [5]:
# One row per table. Later cells read columns such as relation, json_loc,
# header_row_index and key_column_index, so Table presumably exposes its fields
# under those names — verify against the Table definition.
table_df = pd.DataFrame(tables)

In [6]:
def extract_header(row: pd.core.series.Series):
    """Return the cells of the table's header row joined by single spaces.

    ``row.relation`` is transposed with ``zip`` so that the i-th element of every
    inner sequence ends up together in the i-th list (presumably relation is
    stored column by column, which makes each resulting list one table row).
    ``row.header_row_index`` selects which of those lists is the header; the
    value ``-1`` means "no header" and yields an empty string.
    """
    position = row.header_row_index
    # The transpose is always computed, even when there is no header to pick.
    transposed = [list(cells) for cells in zip(*row.relation)]
    if position != -1:
        return " ".join(transposed[position])
    return ""

# Add a "header" column by running extract_header on every row (axis=1).
table_df["header"] = table_df.apply(extract_header, axis=1)

In [7]:
# Leftover debugging variable; kept so any cell that still reads `test` keeps working.
test = None
def extract_key_column(row: pd.core.series.Series) -> str:
    """Return the cells of the table's key column joined by single spaces.

    ``row.key_column_index`` selects the column inside ``row.relation``; the
    value ``-1`` means "no key column" and yields an empty string.

    If the index cannot be used (attribute missing, out of range, not an
    integer, ...) the column at position 1 is used as a best-effort fallback.
    When the table has no such column an empty string is returned instead of
    raising from inside the error handler.
    """
    try:
        key_col_idx = row.key_column_index
        if key_col_idx == -1:
            return ""
        return " ".join(row.relation[key_col_idx])
    except Exception:
        # `except Exception` rather than a bare `except`, so KeyboardInterrupt and
        # SystemExit still propagate while the best-effort fallback is preserved.
        relation = row.relation
        return " ".join(relation[1]) if len(relation) > 1 else ""

# Add a "key_column" column by running extract_key_column on every row (axis=1).
table_df["key_column"] = table_df.apply(extract_key_column, axis=1)

In [8]:
def extract_table_content(row: pd.core.series.Series):
    """Flatten ``row['relation']`` and join all of its cells with single spaces.

    Cells appear in the order of the inner sequences, one inner sequence after
    the other.
    """
    cells = itertools.chain.from_iterable(row["relation"])
    return " ".join(cells)

# Add a "table_content" column by running extract_table_content on every row (axis=1).
table_df["table_content"] = table_df.apply(extract_table_content, axis=1)

In [9]:
def gen_catchall(row: pd.core.series.Series):
    """Build the single "catch-all" text for one table.

    The result is the table content, the text before and after the table, the
    page title and the table title, in that order, separated by single spaces.
    """
    parts = (
        row.table_content,
        row.text_before,
        row.text_after,
        row.page_title,
        row.title,
    )
    return " ".join(parts)

# Two-column frame for the single-field index: json_loc becomes "docno" and the
# per-row catch-all text becomes "catchall".
table_catchall = pd.concat(
    [table_df["json_loc"], table_df.apply(gen_catchall, axis=1)],
    axis=1,
).set_axis(["docno", "catchall"], axis=1)

In [10]:
# Preview the docno/catchall frame that the single-field indexer consumes.
table_catchall

Unnamed: 0,docno,catchall
0,5/1438042988718.8_20150728002308-00068-ip-10-2...,Country name Afghanistan Albania Algeria Ameri...
1,28/1438042990112.92_20150728002310-00241-ip-10...,S.No. 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 1...
2,41/1438042986451.45_20150728002306-00283-ip-10...,S.No. 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 1...
3,2/1438042981856.5_20150728002301-00265-ip-10-2...,Country Japan Singapore Bulgaria Switzerland U...
4,21/1438042987174.71_20150728002307-00309-ip-10...,Country name Afghanistan Albania Algeria Ameri...
...,...,...
6944,25/1438042987155.85_20150728002307-00277-ip-10...,Rank 101 102 103 104 105 106 107 108 109 110 1...
6945,25/1438042987155.85_20150728002307-00015-ip-10...,Region Albania Armenia Azerbaijan Belize Benin...
6946,23/1438042990609.0_20150728002310-00259-ip-10-...,Games 2014 Winter 2014 Winter Age 23 23 City S...
6947,23/1438042989043.35_20150728002309-00291-ip-10...,# 31 43 Name Jaroslav Janus Evan Oberg Sabast...


In [12]:
# Build the single-field index: the "catchall" strings are the document text and
# the "docno" column (taken from json_loc) supplies the document ids.
# NOTE(review): the index directory is hard-coded; presumably the "6949" suffix is
# a table count — confirm it matches the data used in this run.
sf_indexer = pt.DFIndexer("../pyterrier_indexes/single_field_index_6949")
sf_indexer.index(table_catchall["catchall"], table_catchall["docno"])

<org.terrier.querying.IndexRef at 0x7efc52773390 jclass=org/terrier/querying/IndexRef jself=<LocalRef obj=0x557114c5b8c0 at 0x7efc5273a3f0>>

In [12]:
# Columns that feed the multi-field index; .copy() gives mf_table its own data,
# independent of table_df.
mf_table = table_df[["json_loc", "page_title", "text_before", "table_content", "text_after", "header"]].copy()

In [13]:
# Rename json_loc -> docno and table_content -> text; the other names are unchanged.
mf_table.columns = ["docno", "page_title", "text_before", "text", "text_after", "header"]

In [14]:
# Despite the name, this is a list holding one {column: value} dict per table row.
mf_dict = mf_table.to_dict("records")

In [15]:
# Index every textual column as its own field. IterDictIndexer indexes only the
# "text" attribute unless text_attrs says otherwise, so without it page_title,
# text_before, text_after and header would be left out of the index.
# fields=True additionally records per-field term frequencies.
# meta={"docno": 100} stores up to 100 characters of each docno in the meta index.
mf_indexer = pt.IterDictIndexer(
    "../rel_files/multi_field_index_topics",
    meta={"docno": 100},
    text_attrs=["page_title", "text_before", "text", "text_after", "header"],
    fields=True,
)
mf_indexer.index(mf_dict)

<org.terrier.querying.IndexRef at 0x7f62a3d098b0 jclass=org/terrier/querying/IndexRef jself=<LocalRef obj=0x560dca184f88 at 0x7f636b9a40d0>>