## Installing libraries, loading weights, config and vocab files

Here we mount Google Drive (where the project data lives), install the TAPAS implementation from a development branch of `transformers`, and install its `torch-scatter` dependency.

In [None]:
import os
import sys

import numpy as np
from google.colab import drive
# from transformers import TapasTokenizer, TapasForQuestionAnswering,BertTokenizer,TapasConfig, AdamW

# Mount Google Drive so that the project files stored there are reachable from Colab.
drive.mount('/content/gdrive/')

# Root of the mounted drive; the project folder is resolved relative to it.
prefix = '/content/gdrive/My Drive/'
# modify "customized_path_to_your_homework" here to where you uploaded your homework
customized_path_to_your_homework = 'VT-1/Intro DL/Final Project/My Exp'
sys_path = os.path.join(prefix, customized_path_to_your_homework)
# Make Python modules stored in the project folder importable.
sys.path.append(sys_path)

Mounted at /content/gdrive/


In [None]:
! rm -r transformers
! git clone -b tapas_v4_debugging_backward_pass https://github.com/NielsRogge/transformers.git
! cd transformers
! pip install ./transformers

rm: cannot remove 'transformers': No such file or directory
Cloning into 'transformers'...
remote: Enumerating objects: 71336, done.[K
remote: Counting objects: 100% (1563/1563), done.[K
remote: Compressing objects: 100% (468/468), done.[K
remote: Total 71336 (delta 962), reused 1411 (delta 874), pack-reused 69773[K
Receiving objects: 100% (71336/71336), 53.89 MiB | 23.04 MiB/s, done.
Resolving deltas: 100% (50318/50318), done.
Processing ./transformers
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/fb/36/59e4a62254c5fcb43894c6b0e9403ec6f4238cc2422a003ed2e6279a1784/tokenizers-0.9.4-cp37-cp37m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 5.1MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc

In [None]:
# Install torch-scatter from the prebuilt-wheel index named in the URL (torch 1.8.0 + CUDA 10.1).
# NOTE(review): the URL presumably has to match the torch/CUDA build of the runtime - confirm
# when the Colab image changes.
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html


Looking in links: https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
Collecting torch-scatter
[?25l  Downloading https://pytorch-geometric.com/whl/torch-1.8.0%2Bcu101/torch_scatter-2.0.6-cp37-cp37m-linux_x86_64.whl (2.5MB)
[K     |████████████████████████████████| 2.6MB 2.5MB/s 
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.0.6


## Preparing the data for the model using TapasTokenizer

Let's read in our annotated table-question pairs from `Test_5.xlsx` (keeping only the rows from position 24 onward), on which we will further fine-tune TAPAS.

In [None]:
import pandas as pd

# Load the annotated question/answer sheet from the project folder on Drive.
data = pd.read_excel(sys_path+"/Data/Test_5.xlsx")
# Keep only the rows from position 24 onward. The explicit copy turns `data` into an
# independent frame, so assigning new column values to it later on does not write into a
# slice of the original frame (which triggers pandas' SettingWithCopyWarning).
data = data.iloc[24:].copy()
data

Unnamed: 0,id,annotator,position,question,table_file,answer_coordinates,answer_text,aggregation,float_answer
24,nu-21,0.0,0.0,Number of patients in ICU currently in Virginia?,All_States.csv,"['(6,2)']",['258'],COUNT,258.0


In [None]:
# Disabled one-off preprocessing, kept for reference: it takes the first 3 rows of the
# per-state CSV files (NC, NY, VA; US is additionally commented out), concatenates them and
# writes the result to Data/All_States.csv.
# data_NC = pd.read_csv(sys_path+"/Data/NC.csv")
# data_NC=data_NC[:3]
# data_NY = pd.read_csv(sys_path+"/Data/NY.csv")
# data_NY=data_NY[:3]
# data_VA = pd.read_csv(sys_path+"/Data/VA.csv")
# data_VA=data_VA[:3]
# # data_US = pd.read_csv(sys_path+"/Data/US.csv")
# # data_US=data_US[:3]
# df_all=pd.concat([data_NC,data_NY,data_VA])
# df_all.to_csv(sys_path+'/Data/All_States.csv')

Here we make sure that the `answer_coordinates` and `answer_text` columns are converted into true Python lists of tuples/strings respectively.

In [None]:
import ast

def _parse_answer_coordinates(answer_coordinate_str):
  """Parses the answer_coordinates of a question.
  Args:
    answer_coordinate_str: A string representation of a Python list of tuple
      strings.
      For example: "['(1, 4)','(1, 3)', ...]"
  """

  try:
    answer_coordinates = []
    # make a list of strings
    coords = ast.literal_eval(answer_coordinate_str)
    # parse each string as a tuple
    for row_index, column_index in sorted(
        ast.literal_eval(coord) for coord in coords):
      answer_coordinates.append((row_index, column_index))
  except SyntaxError:
    raise ValueError('Unable to evaluate %s' % answer_coordinate_str)
  
  return answer_coordinates


def _parse_answer_text(answer_text):
  """Populates the answer_texts field of `answer` by parsing `answer_text`.
  Args:
    answer_text: A string representation of a Python list of strings.
      For example: "[u'test', u'hello', ...]"
    answer: an Answer object.
  """
  try:
    answer = []
    for value in ast.literal_eval(answer_text):
      answer.append(value)
  except SyntaxError:
    raise ValueError('Unable to evaluate %s' % answer_text)

  return answer

# Replace the string-encoded columns by real Python lists (of tuples / of strings).
# The parser functions are passed directly; no lambda wrapper is needed.
data['answer_coordinates'] = data['answer_coordinates'].apply(_parse_answer_coordinates)
data['answer_text'] = data['answer_text'].apply(_parse_answer_text)

data.head()

Unnamed: 0,id,annotator,position,question,table_file,answer_coordinates,answer_text,aggregation,float_answer
24,nu-21,0.0,0.0,Number of patients in ICU currently in Virginia?,All_States.csv,"[(6, 2)]",[258],COUNT,258.0


Next, we initialize the tokenizer, which can be used to prepare the data for the model.

In [None]:
from transformers import TapasTokenizer

# Load the tokenizer published with the "google/tapas-base-finetuned-wtq" checkpoint, i.e. the
# same checkpoint name the model is loaded from further down.
tokenizer = TapasTokenizer.from_pretrained("google/tapas-base-finetuned-wtq")

Here we create a PyTorch dataset and corresponding dataloader. We encode each table-question pair independently using the tokenizer.

In [None]:
import torch

# Folder holding the table CSV files named in the `table_file` column. The trailing '/'
# matters: cells further down build file paths by plain string concatenation.
table_csv_path = sys_path+'/Data/'

class TableDataset(torch.utils.data.Dataset):
    """Dataset that encodes one table-question pair per item for TAPAS fine-tuning.

    Args:
        data: DataFrame with (at least) the columns `question`, `table_file`,
            `answer_coordinates`, `answer_text` and `float_answer`.
        tokenizer: Callable that encodes a table-question pair and returns a
            mapping of batched tensors (a `TapasTokenizer` in this notebook).
        table_dir: Folder containing the table CSV files. Defaults to the
            notebook-level `table_csv_path`.
        columns: Table columns handed to the tokenizer, in this order.
        max_rows: Number of leading table rows handed to the tokenizer
            (`None` keeps all rows).
    """

    def __init__(self, data, tokenizer, table_dir=None,
                 columns=('state', 'death', 'inIcuCurrently', 'hospitalizedCurrently'),
                 max_rows=9):
        self.data = data
        self.tokenizer = tokenizer
        self.table_dir = table_csv_path if table_dir is None else table_dir
        self.columns = list(columns)
        self.max_rows = max_rows

    def __getitem__(self, idx):
        """Returns the encoded example at position `idx` as a dict of unbatched tensors."""
        # Read from the frame this dataset was built with, not from whatever the
        # notebook-level `data` variable currently points to.
        item = self.data.iloc[idx]
        # All cell values are converted to strings before tokenization.
        table = pd.read_csv(os.path.join(self.table_dir, item.table_file)).astype(str)
        table = table[self.columns].iloc[:self.max_rows]
        encoding = self.tokenizer(table=table,
                                  queries=item.question,
                                  answer_coordinates=item.answer_coordinates,
                                  answer_text=item.answer_text,
                                  padding="max_length",
                                  truncation=True,
                                  return_tensors="pt"
        )
        # remove the batch dimension which the tokenizer adds
        encoding = {key: val.squeeze(0) for key, val in encoding.items()}
        # add float answer (weak supervision for aggregation)
        encoding["float_answer"] = torch.tensor(item.float_answer)
        return encoding

    def __len__(self):
        """Returns the number of table-question pairs."""
        return len(self.data)

# Wrap the annotated examples in a dataset and feed them to the model one at a time.
train_dataset = TableDataset(data=data, tokenizer=tokenizer)
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=1)

In [None]:
# Show which table file each remaining example refers to.
print(data["table_file"])


24    All_States.csv
Name: table_file, dtype: object


In [None]:
# Inspect the first annotated example together with the table it refers to.
# Positional access via `iloc` keeps working whatever index labels survive the row slicing
# done when `data` was loaded; a hard-coded label such as 24 would raise a KeyError as soon
# as that slice changes.
example = data.iloc[0]
table = pd.read_csv(table_csv_path + example.table_file).astype(str)
print(table.columns)
print(example.table_file)
print(example.question)
cols=['state','death','inIcuCurrently','hospitalizedCurrently']

# Keep only the columns of interest and the first 9 rows.
table=table[cols][:9]
print(table)

Index(['Unnamed: 0', 'date', 'state', 'death', 'deathConfirmed',
       'deathIncrease', 'deathProbable', 'hospitalized',
       'hospitalizedCumulative', 'hospitalizedCurrently',
       'hospitalizedIncrease', 'inIcuCumulative', 'inIcuCurrently', 'negative',
       'negativeIncrease', 'negativeTestsAntibody',
       'negativeTestsPeopleAntibody', 'negativeTestsViral',
       'onVentilatorCumulative', 'onVentilatorCurrently', 'positive',
       'positiveCasesViral', 'positiveIncrease', 'positiveScore',
       'positiveTestsAntibody', 'positiveTestsAntigen',
       'positiveTestsPeopleAntibody', 'positiveTestsPeopleAntigen',
       'positiveTestsViral', 'recovered', 'totalTestEncountersViral',
       'totalTestEncountersViralIncrease', 'totalTestResults',
       'totalTestResultsIncrease', 'totalTestsAntibody', 'totalTestsAntigen',
       'totalTestsPeopleAntibody', 'totalTestsPeopleAntigen',
       'totalTestsPeopleViral', 'totalTestsPeopleViralIncrease',
       'totalTestsViral', 'tot

In [None]:
# ['hospitalizedCurrently','inIcuCurrently','death']

In [None]:
# Decode the first encoded example back to text to check how question and table were flattened.
first_example = train_dataset[0]
tokenizer.decode(first_example["input_ids"])

'[CLS] number of patients in icu currently in virginia? [SEP] state death inicucurrently hospitalizedcurrently nc 11502. 0 309. 0 1179. 0 nc 11502. 0 309. 0 1179. 0 nc 11446. 0 314. 0 1226. 0 ny 39029. 0 999. 0 4789. 0 ny 38970. 0 1012. 0 4954. 0 ny 38891. 0 1030. 0 5034. 0 va 9596. 0 258. 0 1127. 0 va 9519. 0 263. 0 1164. 0 va 9428. 0 254. 0 1222. 0 [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

## Fine-tuning TapasForQuestionAnswering

We can start from the already fine-tuned checkpoint:

In [None]:
from transformers import TapasForQuestionAnswering, AdamW

# Use the first GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Load the WTQ fine-tuned checkpoint; `answer_loss_cutoff=None` is passed along to override
# that configuration value.
model = TapasForQuestionAnswering.from_pretrained(
    "google/tapas-base-finetuned-wtq",
    answer_loss_cutoff=None,
)
model.to(device)

optimizer = AdamW(model.parameters(), lr=1.93581e-5)

In [None]:
# Pull a single batch to check the shape of the tensors the dataloader produces.
batch = next(iter(train_dataloader))
batch["input_ids"].size()

torch.Size([1, 512])

In [None]:
# `from_pretrained` hands the model back in evaluation mode; switch to training mode so that
# dropout is active while fine-tuning.
model.train()

for epoch in range(10):  # loop over the dataset multiple times
    print("-------------")
    print("Epoch:", epoch)
    for idx, batch in enumerate(train_dataloader):
        print('Example:', idx)
        # get the inputs and move them to the device the model lives on
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)
        numeric_values = batch['numeric_values'].to(device)
        numeric_values_scale = batch['numeric_values_scale'].to(device)
        float_answer = batch['float_answer'].to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,
                        labels=labels, numeric_values=numeric_values, numeric_values_scale=numeric_values_scale,
                        float_answer=float_answer)
        loss = outputs.loss
        print(f"Loss: {loss.item()}")
        loss.backward()

        optimizer.step()

# Back to evaluation mode so that later forward passes run without dropout.
# The trailing `;` keeps the notebook from printing the model repr.
model.eval();

-------------
Epoch: 0
Example: 0
Selection loss per example:
tensor([2.4584], grad_fn=<AddBackward0>)
Expected result:
tensor([254.0000], grad_fn=<SumBackward1>)
Per example answer loss scaled:
tensor([0.4774], dtype=torch.float64, grad_fn=<MulBackward0>)
Large answer loss mask:
tensor([1.])
Loss: 0.47743016481399536
-------------
Epoch: 1
Example: 0
Selection loss per example:
tensor([2.8765], grad_fn=<AddBackward0>)
Expected result:
tensor([266.7360], grad_fn=<SumBackward1>)
Per example answer loss scaled:
tensor([1.0514], dtype=torch.float64, grad_fn=<MulBackward0>)
Large answer loss mask:
tensor([1.])
Loss: 1.051405906677246
-------------
Epoch: 2
Example: 0
Selection loss per example:
tensor([3.7015], grad_fn=<AddBackward0>)
Expected result:
tensor([258.3333], grad_fn=<SumBackward1>)
Per example answer loss scaled:
tensor([0.0331], dtype=torch.float64, grad_fn=<MulBackward0>)
Large answer loss mask:
tensor([1.])
Loss: 0.03305523842573166
-------------
Epoch: 3
Example: 0
Selectio

## Inference

Let's investigate the model's predictions on the same table-question pair(s) on which we just fine-tuned it.

In [None]:
# Encode the first example by hand; unlike in the dataset class, the batch dimension is kept.
item = data.iloc[0]
cols = ['state', 'death', 'inIcuCurrently', 'hospitalizedCurrently']
table = pd.read_csv(table_csv_path + item.table_file).astype(str)
table = table[cols].head(9)

encoding = tokenizer(
    table=table,
    queries=item.question,
    answer_coordinates=item.answer_coordinates,
    answer_text=item.answer_text,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)
# The float answer gets a leading batch dimension of size 1.
encoding["float_answer"] = torch.tensor(item.float_answer).unsqueeze(0)

In [None]:
encoding = {k: v.to(device) for k,v in encoding.items()}
# Inference only: make sure dropout is off and skip building the autograd graph.
model.eval()
with torch.no_grad():
    outputs = model(**encoding)

Selection loss per example:
tensor([9.5816], grad_fn=<AddBackward0>)
Expected result:
tensor([258.3333], grad_fn=<SumBackward1>)
Per example answer loss scaled:
tensor([0.0331], dtype=torch.float64, grad_fn=<MulBackward0>)
Large answer loss mask:
tensor([1.])


In [None]:
# Turn the raw logits into predicted cell coordinates and aggregation indices.
predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
    encoding,
    outputs.logits,
    outputs.logits_aggregation,
)

In [None]:
# Predicted cell coordinates; each tuple is later used as a (row, column) position in the table.
predicted_answer_coordinates

[[(6, 2), (7, 2), (8, 2)]]

In [None]:
# Predicted aggregation indices; the final cell of this notebook maps them to names
# (0 NONE, 1 SUM, 2 AVERAGE, 3 COUNT).
predicted_aggregation_indices

[2]

We can also do inference all at once:

In [None]:
# Maps the aggregation index predicted by the model to a readable operator name (despite its
# name, the dict goes index -> aggregation). Built once, outside the loop.
aggregation2idx = {0: 'NONE', 1:'SUM', 2:'AVERAGE', 3:'COUNT'}
cols=['state','death','inIcuCurrently','hospitalizedCurrently']

# Inference only: make sure dropout is off.
model.eval()

for i in range(len(train_dataset)):
  item = data.iloc[i]
  table = pd.read_csv(table_csv_path + item.table_file).astype(str)

  table=table[cols][:9]
  # No answer information is passed here, so only the question and the table are encoded.
  encoding = tokenizer(table=table,
                            queries=item.question,
                            truncation=True,
                            padding="max_length",
                            return_tensors="pt"
  )
  encoding = {k: v.to(device) for k,v in encoding.items()}
  # forward pass to get the logits; no gradients are needed for prediction
  with torch.no_grad():
    outputs = model(**encoding)
  # use TapasTokenizer's function to convert them to predicted answer coordinates and aggregation indices
  predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(encoding, outputs.logits,
                                                                                                        outputs.logits_aggregation)

  # print the result!
  display(table)
  print("")
  print(item.question)
  print(f"Predicted aggregation: {aggregation2idx[predicted_aggregation_indices[0]]}")
  print("Predicted cell values:")
  # each coordinate is a (row, column) position in the table shown above
  answers = ', '.join(table.iat[coord] for coord in predicted_answer_coordinates[0])
  print(answers)
  print("------")

Selection loss per example:
tensor([10016.5762], grad_fn=<AddBackward0>)


Unnamed: 0,state,death,inIcuCurrently,hospitalizedCurrently
0,NC,11502.0,309.0,1179.0
1,NC,11502.0,309.0,1179.0
2,NC,11446.0,314.0,1226.0
3,NY,39029.0,999.0,4789.0
4,NY,38970.0,1012.0,4954.0
5,NY,38891.0,1030.0,5034.0
6,VA,9596.0,258.0,1127.0
7,VA,9519.0,263.0,1164.0
8,VA,9428.0,254.0,1222.0



Number of patients in ICU currently in Virginia?
Predicted aggregation: AVERAGE
Predicted cell values:
258.0, 263.0, 254.0
------
