In [None]:
import os
os.chdir("../")

## MLM Inference

In [None]:

import os
import numpy as np
import torch
import torch.nn as nn
import argparse
import json
from utils import process_config

from datasets import load_dataset

from utils import set_seed
import pickle


from src import TapexModelForConditionalGeneration, TapexModelForMaskedLanguageModelling
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger

from utils import prepare_dataloaders
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import warnings
warnings.filterwarnings("ignore")

from data import SciGenDataset

In [None]:
with open("configs/tapex_baseline_mlm.json", "r") as f:
    config = json.load(f)

In [None]:
config = process_config(config, args = None)

In [None]:
set_seed(config.seed)

In [None]:
dataset = load_dataset(config.data.data_path)

In [None]:
# Build the three SciGen splits used by the MLM inference cells below.
if config.data.name == "scigen":
    train_dataset = SciGenDataset(dataset, config, "train")
    validation_dataset = SciGenDataset(dataset, config, "validation")
    test_dataset = SciGenDataset(dataset, config, "test")
else:
    # Fail loudly: without this, later cells would either hit a NameError or
    # silently reuse datasets left in the kernel by another section.
    raise ValueError(f"This section expects the 'scigen' dataset, got {config.data.name!r}")

In [None]:
tokenizer = train_dataset.tokenizer

In [None]:
# Pick the Lightning module class that matches the configured training type,
# then restore it from the checkpoint named in the config.
model = (
    TapexModelForMaskedLanguageModelling
    if config.training.training_type == "masked_language_modelling"
    else TapexModelForConditionalGeneration
).load_from_checkpoint(config.model.checkpoint)

In [None]:
def predict(index, data_type = "test"):
    """Run the masked-LM model on one example and print prediction vs. reference.

    Args:
        index: Position of the example inside the selected dataset split.
        data_type: Which split to read from: "train", "val" or "test".

    Raises:
        ValueError: If `data_type` is not one of the supported split names.
    """
    if data_type == "train":
        data = train_dataset
    elif data_type == "val":
        data = validation_dataset
    elif data_type == "test":
        data = test_dataset
    else:
        raise ValueError(f"data_type must be 'train', 'val' or 'test', got {data_type!r}")

    input_ids, attention_mask, token_type_ids, output_ids = data[index]
    actual_text = data.text_input[index]
    actual_table = data.table[index]
    output_text = data.text_output[index]

    # Positions whose label is not -100 are the ones the model has to fill in.
    masked_indices = (output_ids != -100)

    # Tokenise the reference output with the same settings as the dataset so that
    # its ids can be patched with the model's predictions further down.
    output_ids_actual = tokenizer(answer = output_text, add_special_tokens = config.tokenizer.add_special_tokens,
                                    padding = config.tokenizer.padding, truncation = config.tokenizer.truncation,
                                    max_length = config.tokenizer.max_length, return_tensors = config.tokenizer.return_tensors)

    actual_output = tokenizer.decode(output_ids_actual["input_ids"][0])

    # Inference only: eval mode makes layers such as dropout deterministic, and
    # no_grad avoids building an autograd graph that is thrown away anyway.
    model.eval()
    # Follow the model's own device instead of assuming it lives on GPU 0.
    device = next(model.parameters()).device
    with torch.no_grad():
        output = model(input_ids.unsqueeze(0).to(device), attention_mask.unsqueeze(0).to(device),
                       token_type_ids.unsqueeze(0).to(device), output_ids.unsqueeze(0).to(device))
    logits = output['logits'].detach().cpu()

    # Keep the reference ids everywhere except at the masked positions, where the
    # model's arg-max prediction is substituted.
    predicted_ids = logits.argmax(-1)[0]
    output_ids_actual["input_ids"][0][masked_indices] = predicted_ids[masked_indices]

    predicted_output = tokenizer.decode(output_ids_actual["input_ids"][0])

    masked_input = tokenizer.decode(input_ids)

    print(f"Actual table caption: \t {actual_text.split('</s>')[0]}")
    print(f"Actual text: \t\t {actual_text.split('</s>')[1]}")
    print(f"Masked input (no cap): \t {masked_input.split('</s>')[1]}")
    print(f"Actual Output: \t\t {actual_output}")
    print(f"Predicted output: \t {predicted_output}")
    print(f"Input Table:")
    display(actual_table)

In [None]:
predict(150, "test")

In [None]:
predict(5, "test")

In [None]:
predict(30, "test")

In [None]:
predict(200, "test")

In [None]:
predict(123, "test")

In [None]:
from src import compute_metrics

In [None]:
index = 150

In [None]:
input_ids, attention_mask, token_type_ids, output_ids = test_dataset.__getitem__(index)  

In [None]:
output_ids

In [None]:

# Run one test example through the model, print the raw arg-max decoding and
# report the metrics computed by `compute_metrics` for that example.
input_ids, attention_mask, token_type_ids, output_ids = test_dataset[index]
actual_text = test_dataset.text_input[index]
actual_table = test_dataset.table[index]

# Inference only: eval mode for deterministic layers, no_grad to skip autograd,
# and inputs follow the model's device instead of a hard-coded GPU index.
model.eval()
device = next(model.parameters()).device
with torch.no_grad():
    output = model(input_ids.unsqueeze(0).to(device), attention_mask.unsqueeze(0).to(device),
                   token_type_ids.unsqueeze(0).to(device), output_ids.unsqueeze(0).to(device))
logits = output['logits'].detach().cpu()

predicted_ids = logits.argmax(-1)[0]
predicted_output = tokenizer.decode(predicted_ids)

masked_input = tokenizer.decode(input_ids)

print(f"Actual table caption: \t {actual_text.split('</s>')[0]}")
print(f"Actual text: \t\t {actual_text.split('</s>')[1]}")
print(f"Masked input (no cap): \t {masked_input.split('</s>')[1]}")
print(f"Predicted output: \t {predicted_output}")
print(f"Input Table:")
display(actual_table)

print(compute_metrics(logits.squeeze(), output_ids, tokenizer = tokenizer, config = config))

## Generation Inference

In [None]:

import os
import numpy as np
import torch
import torch.nn as nn
import argparse
import json
from utils import process_config

from datasets import load_dataset

from utils import set_seed
import pickle


from src import TapexModelForConditionalGeneration, TapexModelForMaskedLanguageModelling
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger

from utils import prepare_dataloaders
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import warnings
warnings.filterwarnings("ignore")

from data import SciGenDataset

In [None]:
with open("configs/tapex_baseline.json", "r") as f:
    config = json.load(f)

In [None]:
config = process_config(config, args = None)

In [None]:
set_seed(config.seed)

In [None]:
dataset = load_dataset(config.data.data_path)

In [None]:
# Build the three SciGen splits used by the generation inference cells below.
if config.data.name == "scigen":
    train_dataset = SciGenDataset(dataset, config, "train")
    validation_dataset = SciGenDataset(dataset, config, "validation")
    test_dataset = SciGenDataset(dataset, config, "test")
else:
    # Fail loudly: otherwise the datasets created by an earlier section would be
    # reused silently (or a NameError would surface much later).
    raise ValueError(f"This section expects the 'scigen' dataset, got {config.data.name!r}")

In [None]:
tokenizer = train_dataset.tokenizer

In [None]:
# Choose the module class from the configured training type and restore it from
# the checkpoint path given in the config.
model = (
    TapexModelForMaskedLanguageModelling
    if config.training.training_type == "masked_language_modelling"
    else TapexModelForConditionalGeneration
).load_from_checkpoint(config.model.checkpoint)

In [None]:
def predict(index, data_type = "test"):
    """Run the generation model on one example and print its arg-max decoding.

    The model is called with the dataset's output ids as its fourth argument and
    the printed prediction is the arg-max over the returned logits.

    Args:
        index: Position of the example inside the selected dataset split.
        data_type: Which split to read from: "train", "val" or "test".

    Raises:
        ValueError: If `data_type` is not one of the supported split names.
    """
    # Select the split once instead of repeating the same four lookups per branch.
    if data_type == "train":
        data = train_dataset
    elif data_type == "val":
        data = validation_dataset
    elif data_type == "test":
        data = test_dataset
    else:
        raise ValueError(f"data_type must be 'train', 'val' or 'test', got {data_type!r}")

    input_ids, attention_mask, token_type_ids, output_ids = data[index]
    actual_text_input = data.text_input[index]
    actual_table = data.table[index]
    actual_text_output = data.text_output[index]

    # Inference only: eval mode for deterministic layers, no_grad to skip autograd,
    # and inputs follow the model's device instead of a hard-coded GPU index.
    model.eval()
    device = next(model.parameters()).device
    with torch.no_grad():
        output = model(input_ids.unsqueeze(0).to(device), attention_mask.unsqueeze(0).to(device),
                       token_type_ids.unsqueeze(0).to(device), output_ids.unsqueeze(0).to(device))
    logits = output['logits'].detach().cpu()

    predicted_ids = logits.argmax(-1)[0]
    predicted_output = tokenizer.decode(predicted_ids)

    print(f"Actual text input: \t {actual_text_input}")
    print(f"Actual_text_output: \t {actual_text_output}")
    print(f"Predicted output: \t {predicted_output}")
    print(f"Input Table:")
    display(actual_table)

In [None]:
predict(20, "test")

## Fact Verification

In [None]:

import os
import numpy as np
import torch
import torch.nn as nn
import argparse
import json
from utils import process_config

from datasets import load_dataset

from utils import set_seed
import pickle


from src import TapexModelForConditionalGeneration, TapexModelForMaskedLanguageModelling, TapexModelForSequenceClassification
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger

from utils import prepare_dataloaders
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import warnings
warnings.filterwarnings("ignore")

from data import SciGenDataset, TabFactDataset

In [None]:
with open("configs/tapex_baseline_tabfact.json", "r") as f:
    config = json.load(f)

In [None]:
config = process_config(config, args = None)

In [None]:
set_seed(config.seed)

In [None]:
dataset = load_dataset(config.data.data_path, config.data.config_name)

In [None]:
# Build the three TabFact splits used by the fact-verification cells below.
if config.data.name == "tabfact":
    train_dataset = TabFactDataset(dataset, config, "train")
    validation_dataset = TabFactDataset(dataset, config, "validation")
    test_dataset = TabFactDataset(dataset, config, "test")
else:
    # Fail loudly: otherwise the SciGen datasets from earlier sections would be
    # reused silently by the cells that follow.
    raise ValueError(f"This section expects the 'tabfact' dataset, got {config.data.name!r}")

In [None]:
tokenizer = train_dataset.tokenizer

In [None]:
# Restore the fact-verification classifier from a fixed checkpoint file.
# NOTE(review): this path is hard-coded, whereas the other sections read
# `config.model.checkpoint` — confirm whether the config should drive this too.
model = TapexModelForSequenceClassification.load_from_checkpoint("experiment_dir/factver/factver_v1_mlm/checkpoints/epoch=2-step=8652.ckpt")

In [None]:
def predict(index, data_type = "test"):
    """Classify one fact/table example and return it with its label and prediction.

    Args:
        index: Position of the example inside the selected dataset split.
        data_type: Which split to read from: "train", "val" or "test".

    Returns:
        A tuple `(actual_text, label, pred)`: the example's text input, the label
        as returned by the dataset, and the arg-max over the last dimension of
        the model's logits for this single example.

    Raises:
        ValueError: If `data_type` is not one of the supported split names.
    """
    if data_type == "train":
        data = train_dataset
    elif data_type == "val":
        data = validation_dataset
    elif data_type == "test":
        data = test_dataset
    else:
        raise ValueError(f"data_type must be 'train', 'val' or 'test', got {data_type!r}")

    input_ids, attention_mask, token_type_ids, label = data[index]
    actual_text = data.text_input[index]

    # This function is called once per test example by the error-analysis loop,
    # so skipping autograd matters here; eval mode keeps predictions deterministic.
    model.eval()
    device = next(model.parameters()).device
    with torch.no_grad():
        output = model(input_ids.unsqueeze(0).to(device), attention_mask.unsqueeze(0).to(device),
                       token_type_ids.unsqueeze(0).to(device), label.unsqueeze(0).to(device))
    logits = output['logits'].detach().cpu()

    pred = logits.argmax(-1)[0]

    return actual_text, label, pred

In [None]:
predict(110, "test")

In [None]:
predict(50, "test")

In [None]:
predict(87, "test")

In [None]:
predict(200, "test")

In [None]:
test_dataset.__len__()

In [None]:
test_dataset.text_input[2019]

In [None]:
from tqdm import tqdm

In [None]:
# Collect the word count of every test fact, split by whether the classifier's
# prediction matched the label.
error_fact_len, correct_fact_len = [], []
num_test_examples = len(test_dataset)
for index in tqdm(range(num_test_examples), position = 0, leave = True, total = num_test_examples):
    actual_text, label, pred = predict(index, "test")
    # The fact is the part of the text input before the first "</s>" separator.
    fact_len = len(actual_text.split("</s>")[0].split())
    bucket = error_fact_len if label != pred else correct_fact_len
    bucket.append(fact_len)

In [None]:
len(error_fact_len)

In [None]:
np.mean(error_fact_len)

In [None]:
np.max(error_fact_len)

In [None]:
np.min(error_fact_len)

In [None]:
len(correct_fact_len)

In [None]:
np.mean(correct_fact_len)

In [None]:
np.max(correct_fact_len)

In [None]:
np.min(correct_fact_len)

## Column Reasoning

## Generative MLM

In [None]:
import os
import numpy as np
import torch
import torch.nn as nn
import argparse
import json
from utils import process_config

from datasets import load_dataset

from utils import set_seed, create_synthetic_column
import pickle

import sys


from data import SciGenDataset

from utils import prepare_dataloaders
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import warnings
warnings.filterwarnings("ignore")

from utils import Trainer, Logger


from src import BartModelForMaskedLM, BartModelForConditionalGeneration, BartModelForSequenceClassification

In [None]:
with open("configs/tapex_baseline_mlm.json", "r") as f:
    config = json.load(f)

In [None]:
config = process_config(config, args = None)

In [None]:
set_seed(config.seed)

In [None]:
dataset = load_dataset(config.data.data_path)

In [None]:
# Only the test split is built in this section; the cells below never read the
# train/validation splits.
if config.data.name == "scigen":
    test_dataset = SciGenDataset(dataset, config, "test")
else:
    # Fail loudly: otherwise `test_dataset` from an earlier section would be
    # reused silently by the cells that follow.
    raise ValueError(f"This section expects the 'scigen' dataset, got {config.data.name!r}")

In [None]:
# Peek at the first few entry names of the checkpoint instead of echoing every
# tensor into the cell output; `map_location="cpu"` keeps this inspection from
# allocating GPU memory. The checkpoint is fed to `load_state_dict` in the next
# cell, so it is a mapping and iterating it yields its keys.
list(torch.load(config.model.checkpoint, map_location = "cpu"))[:10]

In [None]:
# Reuse the tokenizer instance held by the test dataset.
tokenizer = test_dataset.tokenizer
# Instantiate the model from the config, then restore its weights from the
# checkpoint file named in the config.
model = BartModelForMaskedLM(config)
# NOTE(review): the model is wrapped in DataParallel *before* loading the state
# dict — presumably the checkpoint was saved from a DataParallel-wrapped model
# (keys prefixed with "module."); confirm against the training script.
model = nn.DataParallel(model)
model.load_state_dict(torch.load(config.model.checkpoint))

In [None]:
model.to("cuda:0")

In [None]:
def predict(index, data_type = "test"):
    """Run the generative MLM model on one test example and print its predictions.

    Prints the original text, the masked input, the target tokens (label
    positions different from -100) and the model's arg-max tokens at those same
    positions, then displays the table.

    Args:
        index: Position of the example inside `test_dataset`.
        data_type: Must be "test"; this section only builds the test split.

    Raises:
        ValueError: If `data_type` is anything other than "test". Previously the
            argument was ignored and the test split was used regardless.
    """
    if data_type != "test":
        raise ValueError(f"Only the 'test' split is loaded in this section, got {data_type!r}")
    data = test_dataset

    # The third item (token type ids) is not passed to this model.
    input_ids, attention_mask, _token_type_ids, decoder_input_ids, labels = data[index]

    actual_text = data.text_input[index]
    actual_table = data.table[index]

    # Inference only: eval mode for deterministic layers, no_grad to skip autograd,
    # and inputs follow the model's device instead of a hard-coded one.
    model.eval()
    device = next(model.parameters()).device
    with torch.no_grad():
        logits = model(input_ids = input_ids.unsqueeze(0).to(device),
                       attention_mask = attention_mask.unsqueeze(0).to(device),
                       decoder_input_ids = decoder_input_ids.unsqueeze(0).to(device))
    # Drop only the batch dimension added by `unsqueeze(0)` above.
    logits = logits.squeeze(0).detach().cpu()

    target_positions = labels != -100

    print(f"Actual text: {actual_text}")
    print(f"Masked text: {tokenizer.batch_decode(input_ids.unsqueeze(0))}")
    print(f"Output words: {tokenizer.batch_decode(labels[target_positions].unsqueeze(0))}")
    print(f"Predicted words: {tokenizer.batch_decode(logits.argmax(-1)[target_positions].unsqueeze(0))}")
    print(f"Table: ")
    display(actual_table)

In [None]:
predict(5, "test")

In [None]:
predict(10, "test")

In [None]:
predict(11, "test")

In [None]:
predict(12, "test")

## Dolly Inference (Verbalised and Non-verbalised)

In [None]:
from transformers import GPTNeoXForCausalLM

In [None]:
model = GPTNeoXForCausalLM.from_pretrained("databricks/dolly-v2-3b")

In [None]:

import os
import numpy as np
import torch
import torch.nn as nn
import argparse
import json
from utils import process_config

from datasets import load_dataset

from utils import set_seed
import pickle


from src import TapexModelForConditionalGeneration, TapexModelForMaskedLanguageModelling
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger

from utils import prepare_dataloaders
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import warnings
warnings.filterwarnings("ignore")

from data import SciGenDataset, TabFactDataset

In [None]:
with open("configs/dolly_tab_fact.json", "r") as f:
    config = json.load(f)

In [None]:
config = process_config(config, args = None)

In [None]:
set_seed(config.seed)

In [None]:
# The second positional argument of `load_dataset` is the dataset configuration
# name, not the path. Prefer `config.data.config_name` (as the fact-verification
# section does) and fall back to the previous value when the config lacks it.
dataset_config_name = getattr(config.data, "config_name", None) or config.data.data_path
dataset = load_dataset(config.data.data_path, dataset_config_name)

In [None]:
# Only the test split is built in this section; `inference` below reads nothing else.
if config.data.name == "tabfact":
    test_dataset = TabFactDataset(dataset, config, "test")
else:
    # Fail loudly: otherwise `test_dataset` from an earlier section would be
    # reused silently by the cells that follow.
    raise ValueError(f"This section expects the 'tabfact' dataset, got {config.data.name!r}")

In [None]:
tokenizer = test_dataset.tokenizer

In [None]:
def decompose_table(table):
    """Placeholder for table decomposition; not implemented yet.

    Args:
        table: The table to decompose (currently ignored).

    Returns:
        None, always.
    """
    return None

In [None]:
from copy import deepcopy

In [None]:
model.to("cuda:0")

In [None]:
model = model.cpu().to("cuda:1")

In [None]:
def _header_text(column_names):
    """Return the "[HEADER] ..." prefix built from the table's column names."""
    # str() so that non-string column labels do not break the join.
    return "[HEADER] " + " ".join(str(name) for name in column_names)


def _linearise_table(table):
    """Flatten a DataFrame into "[HEADER] ... [ROW] ... [ROW] ..." text."""
    table_text = _header_text(table.columns)
    for row in table.values.tolist():
        # str() so that numeric cells do not raise a TypeError in the join.
        table_text += " [ROW] " + " ".join(str(cell) for cell in row)
    return table_text


def _verbalise_table(table, eos_token):
    """Flatten a DataFrame into one "Cell (row,col) has value <eos>" phrase per cell."""
    row_texts = []
    # Rows and columns are numbered from 1 in the generated text.
    for row_number, row in enumerate(table.values.tolist(), start = 1):
        cells = " ".join(
            f"Cell ({row_number},{col + 1}) has {value} {eos_token}" for col, value in enumerate(row)
        )
        row_texts.append(f" [ROW] {cells}")
    return _header_text(table.columns) + " ".join(row_texts)


def inference(index, verbalise = False, decompose = False):
    """Prompt the causal LM with one test fact and its table, and print the generation.

    Args:
        index: Position of the example inside `test_dataset`.
        verbalise: If True, describe each cell as "Cell (row,col) has value"
            followed by the tokenizer's EOS token; otherwise join the raw cell
            values of each row.
        decompose: Currently unused.
    """
    text = test_dataset.text_input[index]
    label = test_dataset.label[index]
    table = test_dataset.table[index]

    display(table)

    if verbalise:
        table_text = _verbalise_table(table, tokenizer.special_tokens_map['eos_token'])
    else:
        table_text = _linearise_table(table)

    text_input = f"State whether the following fact is correct using the table with proper reason <s> {text} <s> {table_text}"
    print(f"Text input: {text_input}")
    tokenized_input = tokenizer(text_input, return_tensors="pt")

    # Send the prompt to whichever device the model currently lives on, so the
    # function does not depend on a specific device-placement cell having run.
    device = next(model.parameters()).device
    model.eval()
    with torch.no_grad():
        output = model.generate(tokenized_input["input_ids"].to(device), num_beams = 3, max_new_tokens = 30)

    print(f"Fact: {text}")
    print(f"Label: {label}")
    print(tokenizer.batch_decode(output))


In [None]:
torch.cuda.empty_cache()

In [None]:
inference(0, verbalise = False)

In [None]:
inference(1, verbalise = False)

In [None]:
inference(0, verbalise = True)

In [None]:
inference(7, verbalise = True)

In [None]:
inference(7, verbalise = False)

In [None]:
inference(10, verbalise = False)

In [None]:
inference(10, verbalise = True)