In [None]:
import os
# Step one directory up so the relative paths used below ("configs/...",
# "datasets/...", "logs/...") are resolved from the parent directory.
# NOTE(review): the target is relative to the current directory, so running
# this cell a second time moves up one more level.
os.chdir("../")

In [None]:
import json
from src import T5ModelForTableCellHighlighting
from datasets import load_dataset
import pickle
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from utils import process_config

In [None]:
# Load the "wikitablequestions" dataset; predict() below indexes its "test" split.
dataset = load_dataset("wikitablequestions")

In [None]:
# Parse the JSON config file, then pass the parsed object through the
# project's process_config helper; later cells read fields such as
# config.tokenizer.* from the result.
with open("configs/cell_highlighting/t5.json", "rb") as config_file:
    raw_config = json.load(config_file)
config = process_config(raw_config)

In [None]:
# Build the model from the processed config and restore the saved weights,
# deserialising them onto the CPU first.
# NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
checkpoint_path = "logs/table_cell_highlighting_flan_t5_xl_pretrain/checkpoints/epoch=2.pt"
model = T5ModelForTableCellHighlighting(config)
model.load_state_dict(torch.load(checkpoint_path, map_location="cpu"))

In [None]:
# Place the model on "cuda:0", the device predict() sends its input tensors to.
model.to("cuda:0")

In [None]:
# Load the list of reasons; predict() reads reasons_list[idx] for each example.
# NOTE(review): pickle.load can execute arbitrary code, so this file must come
# from a trusted source.
with open("datasets/test_wiki_tq_reason_without_answer.pkl", "rb") as reasons_file:
    reasons_list = pickle.load(reasons_file)

In [None]:
# Tokenizer named by the config; predict() reads this global both to encode
# its input and to decode the generated ids.
tokenizer = AutoTokenizer.from_pretrained(config.tokenizer.tokenizer_path)

In [None]:
# Number of loaded reasons; the prediction loop below runs once per entry.
len(reasons_list)

In [None]:
def predict(idx):
    """Generate the highlighted-cell string for one WikiTableQuestions test example.

    Reads the notebook globals ``reasons_list``, ``dataset``, ``tokenizer``,
    ``config`` and ``model`` at call time, so each of them must still refer to
    the object created for this section when the function is called.

    Args:
        idx: Index into both ``reasons_list`` and ``dataset["test"]``.

    Returns:
        str: The decoded generation with special tokens skipped.
    """
    reason = reasons_list[idx]

    table = dataset["test"][idx]["table"]
    # str() keeps the flattening from failing on a non-string header or cell.
    header = [str(name).lower() for name in table["header"]]

    # Flatten the table to "[HEADER] h1 | h2 [ROW] 0: v1 | v2 [ROW] 1: ...".
    parts = ["[HEADER] " + " | ".join(header)]
    for row_id, row in enumerate(table["rows"]):
        parts.append(f" [ROW] {row_id}: " + " | ".join(str(value).lower() for value in row))
    flattened_table = "".join(parts)

    tokenized_input = tokenizer(reason, flattened_table,
                                add_special_tokens = config.tokenizer.add_special_tokens,
                                padding = config.tokenizer.padding,
                                truncation = config.tokenizer.truncation,
                                max_length = config.tokenizer.input_max_length,
                                return_tensors = config.tokenizer.return_tensors,
                                return_token_type_ids = config.tokenizer.return_token_type_ids,
                                return_attention_mask = config.tokenizer.return_attention_mask)

    # Send the inputs to the device holding the model's weights instead of a
    # hard-coded "cuda:0".
    device = next(model.parameters()).device

    output_ids = model.model.generate(input_ids = tokenized_input["input_ids"].to(device),
                                      attention_mask = tokenized_input["attention_mask"].to(device),
                                      max_new_tokens = config.tokenizer.output_max_length,
                                      num_beams = 3, early_stopping = True)

    # Only one (reason, table) pair is encoded, so take the single generated row.
    return tokenizer.decode(output_ids[0].detach().cpu(), skip_special_tokens=True)
    

In [None]:
# Accumulator for one list of predicted cell strings per example; the
# prediction loop below appends to it.
highlighted_cells_list = []

In [None]:
from tqdm import tqdm

In [None]:
# Run predict() once per reason, split its output on ", ", strip each piece,
# and append the resulting list to highlighted_cells_list.
# NOTE: the loop appends, so re-running the cell without resetting the list
# adds a second copy of every entry.
for example_idx in tqdm(range(len(reasons_list)), position=0, leave = True, total = len(reasons_list)):
    cells = [piece.strip() for piece in predict(example_idx).split(", ")]
    highlighted_cells_list.append(cells)

In [None]:
# Number of prediction lists collected by the loop above.
len(highlighted_cells_list)

In [None]:
# Spot-check the predicted cells stored at index 2059.
highlighted_cells_list[2059]

In [None]:
# Persist the predicted cell lists; a later cell in this notebook reads this
# file back to build the token-level labels.
with open("datasets/wiki_tq_test_highlighted_cell_flant_t5_reasons.pkl", "wb") as cells_file:
    pickle.dump(highlighted_cells_list, cells_file)

## Find the character index

### Test set (the variables below keep the `train_` prefix)

In [None]:
from datasets import load_dataset
import pickle
from transformers import AutoTokenizer

In [None]:
# Despite the `train_` prefix, this is the "test" split, and the pickle read
# below has "test" in its file name.
train_dataset = load_dataset("wikitablequestions")["test"]

# NOTE(review): pickle.load can execute arbitrary code; trusted files only.
with open("datasets/wiki_tq_test_highlighted_cell_flant_t5_reasons.pkl", "rb") as cells_file:
    train_highlighted_cells = pickle.load(cells_file)

In [None]:
# Tokenizer used by the label-building loop below.
# NOTE(review): this rebinds the global `tokenizer`, which predict() also reads
# at call time.
tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq")

In [None]:
# Number of per-example cell lists loaded from the pickle.
len(train_highlighted_cells)

In [None]:
import pandas as pd

In [None]:
# Accumulator for one label tensor per example; the loop below appends to it.
hard_relevance_labels = []

In [None]:
from tqdm import tqdm

In [None]:
# For every example, tokenize (table, question) and build a 0/1 vector over the
# input positions: a position is set to 1 when its token id equals any token id
# produced by tokenizing one of the example's highlighted cells.
for i in tqdm(range(len(train_dataset)), position = 0, leave = True, total = len(train_dataset)):

    question = train_dataset[i]["question"]
    highlighted_cells = train_highlighted_cells[i]

    table = train_dataset[i]["table"]
    table_column_names = table["header"]
    table_content_values = table["rows"]

    # Lower-cased, all-string copy of the table.
    # NOTE(review): columns whose lower-cased headers are equal share one dict
    # key, so only the last such column is kept - confirm this is intended.
    table_df = pd.DataFrame.from_dict({
        str(col).lower(): [str(row[col_idx]).lower() for row in table_content_values]
        for col_idx, col in enumerate(table_column_names)
    })

    tokenized_input = tokenizer(table_df, question, add_special_tokens = config.tokenizer.add_special_tokens,
                            padding = config.tokenizer.padding, truncation = config.tokenizer.truncation,
                            max_length = 960, return_tensors = config.tokenizer.return_tensors,
                            return_token_type_ids = config.tokenizer.return_token_type_ids,
                            return_attention_mask = config.tokenizer.return_attention_mask)

    # Computed once per example rather than once per cell token.
    input_ids = tokenized_input["input_ids"].squeeze()
    hard_relevance_label = torch.zeros((tokenized_input["input_ids"].shape[1]))
    for h_cell in highlighted_cells:
        cell_token_ids = tokenizer(answer = h_cell, add_special_tokens = False,
                            return_tensors = config.tokenizer.return_tensors,
                            return_attention_mask = config.tokenizer.return_attention_mask)["input_ids"].tolist()[0]
        for token_id in cell_token_ids:
            hard_relevance_label[input_ids == token_id] = 1

    hard_relevance_labels.append(hard_relevance_label)

In [None]:
# Number of label tensors built by the loop above.
len(hard_relevance_labels)

In [None]:
# The labels are 0/1, so this is the number of positions marked in example 2.
hard_relevance_labels[2].sum()

In [None]:
import json
import pickle

In [None]:
# Write the list of label tensors to disk.
with open("datasets/wiki_tq_test_highlighted_cell.pkl", "wb") as labels_file:
    pickle.dump(hard_relevance_labels, labels_file)

In [None]:
def find_all_occurrences(s1, s2):
    """Return every index at which ``s2`` starts inside ``s1``.

    Overlapping matches are included, because the search resumes one character
    after each hit.

    Args:
        s1: The string to search in.
        s2: The substring to look for.

    Returns:
        list[int]: Start indices in ascending order; empty when there is no match.
    """
    indices = []
    start = 0
    while start < len(s1):
        index = s1.find(s2, start)
        if index == -1:
            break
        indices.append(index)
        # Advance by one character only, so overlapping matches are found too.
        start = index + 1
    return indices


# This cell used to read `s1` and `s2` directly, but no cell in the notebook
# defines them, so it raised NameError on a fresh kernel. The search now runs
# only when both names exist.
if "s1" in globals() and "s2" in globals():
    indices = find_all_occurrences(s1, s2)

In [None]:
# predict() looks up the global `tokenizer` at call time, and the label-building
# cells above rebound that name to the TAPEX tokenizer. Restore the tokenizer
# named by the config before calling predict().
tokenizer = AutoTokenizer.from_pretrained(config.tokenizer.tokenizer_path)
predict(211)

In [None]:
# Inspect the prediction for example 112.
# NOTE(review): predict() reads the global `tokenizer`; this call only works
# while that name is bound to the tokenizer loaded from
# config.tokenizer.tokenizer_path.
predict(112)

# Cell highlighting on Sequential QA

In [None]:
import json
from src import T5ModelForTableCellHighlighting
from datasets import load_dataset
import pickle
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from utils import process_config

In [None]:
# Load the "msr_sqa" dataset; this section's predict() indexes its "test" split.
dataset = load_dataset("msr_sqa")

In [None]:
# Parse the JSON config file and pass the parsed object through the project's
# process_config helper.
with open("configs/cell_highlighting/t5.json", "rb") as config_file:
    raw_config = json.load(config_file)
config = process_config(raw_config)

In [None]:
# Build the model from the processed config and restore the saved weights,
# deserialising them onto the CPU first.
# NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
checkpoint_path = "logs/table_cell_highlighting_flan_t5_xl_pretrain/checkpoints/epoch=2.pt"
model = T5ModelForTableCellHighlighting(config)
model.load_state_dict(torch.load(checkpoint_path, map_location="cpu"))

In [None]:
# Place the model on "cuda:0", the device predict() sends its input tensors to.
model.to("cuda:0")

In [None]:
# Load the list of reasons; predict() reads reasons_list[idx] for each example.
# NOTE(review): pickle.load can execute arbitrary code; trusted files only.
with open("datasets/test_seq_qa_reason_without_answer_flant5.pkl", "rb") as reasons_file:
    reasons_list = pickle.load(reasons_file)

In [None]:
# Bind `tokenizer` to the tokenizer named by the config; predict() reads this
# global both to encode its input and to decode the generated ids.
tokenizer = AutoTokenizer.from_pretrained(config.tokenizer.tokenizer_path)

In [None]:
# Number of loaded reasons; the prediction loop below runs once per entry.
len(reasons_list)

In [None]:
def predict(idx):
    """Generate the highlighted-cell string for one Sequential QA test example.

    Reads the notebook globals ``reasons_list``, ``dataset``, ``tokenizer``,
    ``config`` and ``model`` at call time, so each of them must still refer to
    the object created for this section when the function is called.

    Args:
        idx: Index into both ``reasons_list`` and ``dataset["test"]``.

    Returns:
        str: The decoded generation with special tokens skipped.
    """
    reason = reasons_list[idx]

    example = dataset["test"][idx]
    # str() keeps the flattening from failing on a non-string header or cell.
    header = [str(name).lower() for name in example["table_header"]]

    # Flatten the table to "[HEADER] h1 | h2 [ROW] 0: v1 | v2 [ROW] 1: ...".
    parts = ["[HEADER] " + " | ".join(header)]
    for row_id, row in enumerate(example["table_data"]):
        parts.append(f" [ROW] {row_id}: " + " | ".join(str(value).lower() for value in row))
    flattened_table = "".join(parts)

    tokenized_input = tokenizer(reason, flattened_table,
                                add_special_tokens = config.tokenizer.add_special_tokens,
                                padding = config.tokenizer.padding,
                                truncation = config.tokenizer.truncation,
                                max_length = config.tokenizer.input_max_length,
                                return_tensors = config.tokenizer.return_tensors,
                                return_token_type_ids = config.tokenizer.return_token_type_ids,
                                return_attention_mask = config.tokenizer.return_attention_mask)

    # Send the inputs to the device holding the model's weights instead of a
    # hard-coded "cuda:0".
    device = next(model.parameters()).device

    output_ids = model.model.generate(input_ids = tokenized_input["input_ids"].to(device),
                                      attention_mask = tokenized_input["attention_mask"].to(device),
                                      max_new_tokens = config.tokenizer.output_max_length,
                                      num_beams = 3, early_stopping = True)

    # Only one (reason, table) pair is encoded, so take the single generated row.
    return tokenizer.decode(output_ids[0].detach().cpu(), skip_special_tokens=True)
    

In [None]:
from tqdm import tqdm
# Fresh accumulator for this section: one list of predicted cell strings per
# example, appended to by the loop below.
highlighted_cells_list = []

In [None]:
# Run predict() once per reason, split its output on ", ", strip each piece,
# and append the resulting list to highlighted_cells_list.
for example_idx in tqdm(range(len(reasons_list)), position=0, leave = True, total = len(reasons_list)):
    cells = [piece.strip() for piece in predict(example_idx).split(", ")]
    highlighted_cells_list.append(cells)

In [None]:
# Persist the predicted cell lists; the next cell reads this file back.
with open("datasets/seq_qa_test_highlighted_cell_flant_t5_reasons.pkl", "wb") as cells_file:
    pickle.dump(highlighted_cells_list, cells_file)

In [None]:
# Read the predicted cell lists back from disk - presumably so the labelling
# cells below can run without repeating the prediction loop.
# NOTE(review): pickle.load can execute arbitrary code; trusted files only.
with open("datasets/seq_qa_test_highlighted_cell_flant_t5_reasons.pkl", "rb") as cells_file:
    highlighted_cells_list = pickle.load(cells_file)

In [None]:
# Number of per-example cell lists available for labelling.
len(highlighted_cells_list)

In [None]:
# Fresh accumulator for one label tensor per example; the loop below appends to it.
hard_relevance_labels = []

In [None]:
from transformers import AutoTokenizer

# Tokenizer used by the label-building loop below.
# NOTE(review): this rebinds the global `tokenizer`, which predict() also reads
# at call time.
tokenizer = AutoTokenizer.from_pretrained("neulab/omnitab-large")

In [None]:
# Despite the `train_` prefix, this is the "test" split of the loaded dataset.
train_dataset = dataset["test"]

In [None]:
import pandas as pd

In [None]:
# For every example, tokenize (table, question + history) and build a 0/1 vector
# over the input positions: a position is set to 1 when its token id equals any
# token id produced by tokenizing one of the example's highlighted cells.
for i in tqdm(range(len(train_dataset)), position = 0, leave = True, total = len(train_dataset)):

    question = " ".join(train_dataset[i]["question_and_history"])
    table_column_names = train_dataset[i]["table_header"]
    table_content_values = train_dataset[i]["table_data"]
    highlighted_cells = highlighted_cells_list[i]

    # Lower-cased, all-string copy of the table.
    # NOTE(review): columns whose lower-cased headers are equal share one dict
    # key, so only the last such column is kept - confirm this is intended.
    table_df = pd.DataFrame.from_dict({
        str(col).lower(): [str(row[col_idx]).lower() for row in table_content_values]
        for col_idx, col in enumerate(table_column_names)
    })

    tokenized_input = tokenizer(table_df, question, add_special_tokens = config.tokenizer.add_special_tokens,
                            padding = config.tokenizer.padding, truncation = config.tokenizer.truncation,
                            max_length = 896, return_tensors = config.tokenizer.return_tensors,
                            return_token_type_ids = config.tokenizer.return_token_type_ids,
                            return_attention_mask = config.tokenizer.return_attention_mask)

    # Computed once per example rather than once per cell token.
    input_ids = tokenized_input["input_ids"].squeeze()
    hard_relevance_label = torch.zeros((tokenized_input["input_ids"].shape[1]))
    for h_cell in highlighted_cells:
        cell_token_ids = tokenizer(answer = h_cell, add_special_tokens = False,
                            return_tensors = config.tokenizer.return_tensors,
                            return_attention_mask = config.tokenizer.return_attention_mask)["input_ids"].tolist()[0]
        for token_id in cell_token_ids:
            hard_relevance_label[input_ids == token_id] = 1

    hard_relevance_labels.append(hard_relevance_label)


In [None]:
# Number of label tensors built by the loop above.
len(hard_relevance_labels)

In [None]:
# The labels are 0/1, so this is the number of positions marked in example 100.
hard_relevance_labels[100].sum()

In [None]:
# Write the list of label tensors to disk.
with open("datasets/seq_qa_test_highlighted_cell.pkl", "wb") as labels_file:
    pickle.dump(hard_relevance_labels, labels_file)

# Seq QA Reason and Cell Quality

In [None]:
from datasets import load_dataset
import pickle
import pandas as pd

In [None]:
# "train" split of "msr_sqa"; single examples are inspected from it below.
train_dataset = load_dataset("msr_sqa")["train"]

In [None]:
# Reasons list; the inspection cell below indexes it with the same idx as
# train_dataset.
# NOTE(review): pickle.load can execute arbitrary code; trusted files only.
with open("datasets/seq_qa_reason_without_answer_flant5.pkl", "rb") as reasons_file:
    train_reasons = pickle.load(reasons_file)

In [None]:
# Highlighted-cell lists; the inspection cell below indexes them with the same
# idx as train_dataset.
# NOTE(review): pickle.load can execute arbitrary code; trusted files only.
with open("datasets/seq_qa_train_highlighted_cell_flant_t5_reasons.pkl", "rb") as cells_file:
    train_highlighted_cells = pickle.load(cells_file)

In [None]:
# Index of the example shown by the next cell.
idx = 12

In [None]:
# Show one example side by side: question (with history), gold answer,
# reason, highlighted cells, and the lower-cased table.
example = train_dataset[idx]
question = " ".join(example["question_and_history"])
table_column_names = example["table_header"]
table_content_values = example["table_data"]
answer_text = ", ".join(example["answer_text"])

# Column name -> list of that column's values, everything lower-cased.
lowered_columns = {}
for col_idx, col in enumerate(table_column_names):
    lowered_columns[str(col).lower()] = [str(row[col_idx]).lower() for row in table_content_values]
table = pd.DataFrame.from_dict(lowered_columns)
reason = train_reasons[idx]
highlighted_cells = train_highlighted_cells[idx]

for label, value in (
    ("Question: ", question),
    ("Answer: ", answer_text),
    ("Reason: ", reason),
    ("Highlight: ", highlighted_cells),
):
    print(label, value, end = "\n\n")
display(table)

# Cell highlighting on FeTaQA using Flan-T5-XL reasons

In [None]:
import json
from src import T5ModelForTableCellHighlighting
from datasets import load_dataset
import pickle
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from utils import process_config

In [None]:
# Load the "DongfuTingle/FeTaQA" dataset; this section's predict() indexes its
# "test" split.
dataset = load_dataset("DongfuTingle/FeTaQA")

In [None]:
# Parse the JSON config file and pass the parsed object through the project's
# process_config helper.
with open("configs/cell_highlighting/t5.json", "rb") as config_file:
    raw_config = json.load(config_file)
config = process_config(raw_config)

In [None]:
# Build the model from the processed config; the weights are restored in the
# next cell.
model = T5ModelForTableCellHighlighting(config)

In [None]:
# Restore the saved weights, deserialising them onto the CPU first.
# NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
model.load_state_dict(torch.load("logs/table_cell_highlighting_flan_t5_xl_pretrain/checkpoints/epoch=2.pt", map_location="cpu"))

In [None]:
# Place the model on "cuda:0", the device predict() sends its input tensors to.
model.to("cuda:0")

In [None]:
# Load the list of reasons; predict() reads reasons_list[idx] for each example.
# NOTE(review): pickle.load can execute arbitrary code; trusted files only.
with open("datasets/test_feta_qa_reason_without_answer_flant5.pkl", "rb") as reasons_file:
    reasons_list = pickle.load(reasons_file)

In [None]:
# Bind `tokenizer` to the tokenizer named by the config; predict() reads this
# global both to encode its input and to decode the generated ids.
tokenizer = AutoTokenizer.from_pretrained(config.tokenizer.tokenizer_path)

In [None]:
def predict(idx):
    """Generate the highlighted-cell string for one FeTaQA test example.

    Reads the notebook globals ``reasons_list``, ``dataset``, ``tokenizer``,
    ``config`` and ``model`` at call time, so each of them must still refer to
    the object created for this section when the function is called.

    Args:
        idx: Index into both ``reasons_list`` and ``dataset["test"]``.

    Returns:
        str: The decoded generation with special tokens skipped.
    """
    reason = reasons_list[idx]

    # The first entry of "table_array" is used as the header, the rest as rows.
    table_array = dataset["test"][idx]["table_array"]
    # str() keeps the flattening from failing on a non-string header or cell.
    header = [str(name).lower() for name in table_array[0]]

    # Flatten the table to "[HEADER] h1 | h2 [ROW] 0: v1 | v2 [ROW] 1: ...".
    parts = ["[HEADER] " + " | ".join(header)]
    for row_id, row in enumerate(table_array[1:]):
        parts.append(f" [ROW] {row_id}: " + " | ".join(str(value).lower() for value in row))
    flattened_table = "".join(parts)

    tokenized_input = tokenizer(reason, flattened_table,
                                add_special_tokens = config.tokenizer.add_special_tokens,
                                padding = config.tokenizer.padding,
                                truncation = config.tokenizer.truncation,
                                max_length = config.tokenizer.input_max_length,
                                return_tensors = config.tokenizer.return_tensors,
                                return_token_type_ids = config.tokenizer.return_token_type_ids,
                                return_attention_mask = config.tokenizer.return_attention_mask)

    # Send the inputs to the device holding the model's weights instead of a
    # hard-coded "cuda:0".
    device = next(model.parameters()).device

    output_ids = model.model.generate(input_ids = tokenized_input["input_ids"].to(device),
                                      attention_mask = tokenized_input["attention_mask"].to(device),
                                      max_new_tokens = config.tokenizer.output_max_length,
                                      num_beams = 3, early_stopping = True)

    # Only one (reason, table) pair is encoded, so take the single generated row.
    return tokenizer.decode(output_ids[0].detach().cpu(), skip_special_tokens=True)
    

In [None]:
# Fresh accumulator for this section: one list of predicted cell strings per
# example, appended to by the loop below.
highlighted_cells_list = []

In [None]:
from tqdm import tqdm

In [None]:
# Run predict() once per reason, split its output on ", ", strip each piece,
# and append the resulting list to highlighted_cells_list.
for example_idx in tqdm(range(len(reasons_list)), position=0, leave = True, total = len(reasons_list)):
    cells = [piece.strip() for piece in predict(example_idx).split(", ")]
    highlighted_cells_list.append(cells)

In [None]:
# Persist the predicted cell lists to disk.
with open("datasets/feta_qa_test_highlighted_cell_flant_t5_reasons.pkl", "wb") as cells_file:
    pickle.dump(highlighted_cells_list, cells_file)

In [None]:
# Fresh accumulator for one label tensor per example; the loop below appends to it.
hard_relevance_labels = []

In [None]:
from transformers import AutoTokenizer

# Tokenizer used by the label-building loop below.
# NOTE(review): this rebinds the global `tokenizer`, which predict() also reads
# at call time.
tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large")

In [None]:
# Despite the `train_` prefix, this is the "test" split of the loaded dataset.
train_dataset = dataset["test"]

In [None]:
from tqdm import tqdm
import pandas as pd

In [None]:
# For every example, tokenize (table, question) and build a 0/1 vector over the
# input positions: a position is set to 1 when its token id equals any token id
# produced by tokenizing one of the example's highlighted cells.
for i in tqdm(range(len(train_dataset)), position = 0, leave = True, total = len(train_dataset)):

    # " ".join on a plain string would put a space between every character,
    # so only join when the field is a sequence of strings.
    raw_question = train_dataset[i]["question"]
    question = raw_question if isinstance(raw_question, str) else " ".join(raw_question)
    table_column_names = train_dataset[i]["table_array"][0]
    table_content_values = train_dataset[i]["table_array"][1:]
    highlighted_cells = highlighted_cells_list[i]

    # Lower-cased, all-string copy of the table.
    # NOTE(review): columns whose lower-cased headers are equal share one dict
    # key, so only the last such column is kept - confirm this is intended.
    table_df = pd.DataFrame.from_dict({
        str(col).lower(): [str(row[col_idx]).lower() for row in table_content_values]
        for col_idx, col in enumerate(table_column_names)
    })

    tokenized_input = tokenizer(table_df, question, add_special_tokens = config.tokenizer.add_special_tokens,
                            padding = config.tokenizer.padding, truncation = config.tokenizer.truncation,
                            max_length = 896, return_tensors = config.tokenizer.return_tensors,
                            return_token_type_ids = config.tokenizer.return_token_type_ids,
                            return_attention_mask = config.tokenizer.return_attention_mask)

    # Computed once per example rather than once per cell token.
    input_ids = tokenized_input["input_ids"].squeeze()
    hard_relevance_label = torch.zeros((tokenized_input["input_ids"].shape[1]))
    for h_cell in highlighted_cells:
        cell_token_ids = tokenizer(answer = h_cell, add_special_tokens = False,
                            return_tensors = config.tokenizer.return_tensors,
                            return_attention_mask = config.tokenizer.return_attention_mask)["input_ids"].tolist()[0]
        for token_id in cell_token_ids:
            hard_relevance_label[input_ids == token_id] = 1

    hard_relevance_labels.append(hard_relevance_label)


In [None]:
# Write the list of label tensors to disk.
with open("datasets/feta_qa_test_highlighted_cell.pkl", "wb") as labels_file:
    pickle.dump(hard_relevance_labels, labels_file)