## Environment Setup
Import the key libraries and set up the working environment.

In [None]:
import os
!pip install -q git+https://github.com/huggingface/transformers.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m86.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone


In [None]:
# Importing the libraries needed
import pandas as pd
import numpy as np
import json, pickle
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer,BertModel, BertForPreTraining

import logging
# Only emit log records at ERROR level or above to keep the cell output readable.
logging.basicConfig(level=logging.ERROR)
# Select the compute device: use the GPU when CUDA is available, otherwise fall back to the CPU.

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

## Loading the training and validation datasets

In [None]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import pandas as pd

# Authenticate
drive = None
def authenticate():
  """Authenticate the Colab user and (re)build the module-level PyDrive client `drive`."""
  global drive

  auth.authenticate_user()
  google_auth = GoogleAuth()
  google_auth.credentials = GoogleCredentials.get_application_default()
  drive = GoogleDrive(google_auth)

#Download files
def downloadFiles(fileIds):
  """Download Google Drive files into the working directory.

  :param fileIds: iterable of entries where entry[0] is the local file name to
                  write and entry[1] is the Google Drive file id to fetch
  """
  # Authentication is refreshed on every call before any file is requested.
  authenticate()

  for entry in fileIds:
    local_name, drive_id = entry[0], entry[1]
    remote_file = drive.CreateFile({"id": drive_id})
    remote_file.GetContentFile(local_name)

Load the datasets directly; these are the same datasets as the ones provided by the PDFVQA Kaggle competition.

In [None]:
# Download the training, validation and testing dataframes and document-info
# pickle files. A file that already exists locally is not downloaded again.
# The existence check uses os.path.isfile instead of open() + bare except, so no
# file handle is left open and unrelated errors are not silently swallowed.
for file_name, file_id in [
    ("train_dataframe.csv", "1-hU0-rt31_ZUZcTRZ3tXuUL0d_L5ALQM"),
    ("train_doc_info.pkl", "1-AhYdD-FvbgfgGvMsi8u4eVlGi5Y2M7V"),
    ("val_dataframe.csv", "1-xOr1-cCel-LPd75n5W68e_gptzFquWB"),
    ("val_doc_info.pkl", "1-az6pNeteeUaPlFCiEzkz73Z5G3rVG-m"),
    ("test_dataframe.csv", "1-searmQar_snnf3Aiz5dEmUDKZDHQj_c"),
    ("test_doc_info.pkl", "1-PZGZ8yhgeIdbYELmD9TbEJGD6hsPQlp"),
]:
  if not os.path.isfile(file_name):
    downloadFiles([[file_name, file_id]])

Load the extracted features. For the feature-extraction code, please refer to the feature extraction tutorial. You can use a visual feature extractor to obtain visual features and add them to your framework in a similar way.

In [None]:
# Download the pre-extracted BERT [CLS] feature files unless they already exist locally.
# os.path.isfile replaces the open() + bare except probe, which leaked a file handle.
for file_name, file_id in [
    ("train_bert_cls.pkl", "1-2BpxbwzZEhULlYxP2-1DsEo84DyyqyU"),
    ("val_bert_cls.pkl", "1Hvd-N2fMfxl1iE0wM6Fv4qtR87TKf9hK"),
    ("test_bert_cls.pkl", "1-8V_buBKca12w_IrO16ZFUsJA6Vfsgpk"),
]:
  if not os.path.isfile(file_name):
    downloadFiles([[file_name, file_id]])

In [None]:
# Download the pre-extracted visual feature files unless they already exist locally.
# os.path.isfile replaces the open() + bare except probe, which leaked a file handle.
for file_name, file_id in [
    ("val_visual_feats.pkl", "1Z9umISob9ar_5n5T-Cbhr4nbHuQCvVGm"),
    ("test_visual_feats.pkl", "1knSVmocw4-_FF98bFMdVSvhnUn3mPUvm"),
    ("train_visual_feats.pkl", "1SyEptlqqX-frq_1hSQTxUGGptk6OI9aQ"),
]:
  if not os.path.isfile(file_name):
    downloadFiles([[file_name, file_id]])

In [None]:
# Load the pre-extracted training features from the downloaded pickle files.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('train_bert_cls.pkl','rb') as f:
  train_bert_cls = pickle.load(f)

with open('train_visual_feats.pkl','rb') as f:
  train_visual_feats = pickle.load(f)

In [None]:
# Number of top-level entries (documents) in the training feature dictionary.
print(len(train_visual_feats))

800


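Here we attach two keys to every object in `train_visual_feats`: "bert_cls", which stores the extracted textual representation, and "visual_feats", which stores the object's visual feature taken from the page's `visual_list`.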

In [None]:
# Walk documents -> pages -> objects in storage order and attach to every object
# its textual feature (consumed sequentially from the flat bert_cls list) and its
# visual feature (taken from the page's visual_list at the object's position).
bert_cls_list = train_bert_cls['bert_cls'].tolist()
bid = 0  # running index into bert_cls_list across all documents
for doc_entry in train_visual_feats.values():
  for page_info in doc_entry['pages'].values():
    for slot, objt in enumerate(page_info['objects'].values()):
      objt['bert_cls'] = bert_cls_list[bid]
      objt['visual_feats'] = page_info['visual_list'][slot]
      bid += 1

Let's load the key information from training dataframe

In [None]:
import ast
# Keep only the columns the dataset class reads and parse the string-encoded
# global_id column into Python literals.
train_df = (
    pd.read_csv('train_dataframe.csv')[['question', 'global_id', 'pmcid']]
    .assign(global_id=lambda frame: frame['global_id'].apply(ast.literal_eval))
)

### Add Positional Embedding
Positional encoding is vital for indicating the sequence information of transformer inputs.

In [None]:
import math
import torch
# one dimensional feature embedding
def positionalencoding1d(d_model, feature_list):
    """
    Sinusoidal positional encoding for one or more positions.

    :param d_model: dimension of the model (must be even)
    :param feature_list: a single position (number) or a sequence of positions
    :return: numpy array of shape (number of positions, d_model); a single
             position yields shape (1, d_model). Even columns hold the sine
             terms and odd columns hold the cosine terms.
    :raises ValueError: if d_model is odd
    """
    if d_model % 2 != 0:
        raise ValueError("Cannot use sin/cos positional encoding with "
                         "odd dim (got dim={:d})".format(d_model))
    # Column vector of positions so it broadcasts against the row of frequencies below.
    positions = torch.as_tensor(feature_list, dtype=torch.float).reshape(-1, 1)
    div_term = torch.exp((torch.arange(0, d_model, 2, dtype=torch.float) *
                         -(math.log(10000.0) / d_model)))
    angles = positions * div_term
    pe = torch.zeros(positions.shape[0], d_model)
    pe[:, 0::2] = torch.sin(angles)
    pe[:, 1::2] = torch.cos(angles)
    # Round-trip through Python floats so the result is a float64 numpy array.
    return np.array(pe.tolist())
# Pre-compute the encodings of positions 0..399 and stack them into one array,
# one 768-dimensional row per position.
positional_encoding = np.array(
    [positionalencoding1d(768, position)[0] for position in range(400)]
)

## Data Preprocessing

Define the data processor for loading training samples. We use the pre-trained "bert-base-uncased" model to encode the question and a transformer to learn the contextual information between the question and the input object (RoI) sequence, so the inputs of each module need to be prepared carefully in the dataset class.

In [None]:
# Tokenizer matching the "bert-base-uncased" question encoder.
# NOTE(review): passing `truncation=True` here does not appear to activate truncation for later
# encode calls (the training log still warns "Truncation was not explicitly activated") — confirm intent.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", truncation=True, do_lower_case=True)
def convert_index2onehot(ids, max_len):
  """Return a list of `max_len` zeros with a 1 at every position listed in `ids`.

  Each entry of `ids` is converted with int() before it is used as an index.
  """
  one_hot = [0] * max_len
  for position in map(int, ids):
    one_hot[position] = 1
  return one_hot
class pdfvqa_training(Dataset):
    """Training dataset pairing each question with the object (RoI) sequence of its document.

    :param dataframe: frame with `question`, `global_id` (collection of answer
                      object ids) and `pmcid` (document id) columns
    :param doc_info: mapping str(pmcid) -> {'pages': {page: {'objects': {key: obj}}}}
                     where each obj carries 'global_id', 'bert_cls' and 'visual_feats'
    :param tokenizer: tokenizer exposing `encode_plus`
    :param max_len: number of object slots per sample (longer documents are
                    truncated, shorter ones are zero-padded)
    :param position_emb: positional encoding returned unchanged with every sample
    :param max_question_len: token length every question is padded/truncated to
    """
    def __init__(self, dataframe, doc_info, tokenizer, max_len, position_emb, max_question_len=100):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.question
        self.global_ids = self.data.global_id
        self.doc_info = doc_info
        self.padding_len = max_len
        self.doc_id = dataframe.pmcid
        self.position_emb = position_emb
        self.max_question_len = max_question_len
        # Convert the positional encoding once instead of on every __getitem__ call.
        self._position_emb_tensor = torch.as_tensor(position_emb, dtype=torch.float)

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # Positional (.iloc) access: the DataLoader hands out 0..len-1, which only
        # coincides with the frame's labels when its index is the default RangeIndex.
        text = self.text.iloc[index]
        answer_ids = self.global_ids.iloc[index]
        doc_id = str(self.doc_id.iloc[index])  # doc_info is keyed by the string form of pmcid

        # Encode the question into fixed-length inputs for the pre-trained language model.
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_question_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        bert_cls = []      # textual feature of every object, in reading order
        visual_feats = []  # visual feature of every object, in reading order
        target_id = []     # sequence positions of the objects that answer the question
        t_id = 0           # position of the current object in the input sequence

        for p in self.doc_info[doc_id]['pages']:
            page = self.doc_info[doc_id]['pages'][p]
            for obj in page['objects']:
                objt = page['objects'][obj]
                bert_cls.append(objt['bert_cls'])
                visual_feats.append(objt['visual_feats'])

                if objt['global_id'] in answer_ids:
                    target_id.append(t_id)
                t_id += 1

        # Answers beyond the truncated sequence cannot be pointed at; drop them so the
        # one-hot target below never indexes past padding_len.
        target_id = [t for t in target_id if t < self.padding_len]

        # With no reachable answer, point at the last slot (the "no answer" slot).
        if len(target_id) == 0:
            target_id.append(self.padding_len - 1)

        # Ensure all input sequences have the same size (number of RoIs).
        if len(bert_cls) >= self.padding_len:
            bert_cls = bert_cls[:self.padding_len]
            visual_feats = visual_feats[:self.padding_len]
        else:
            bert_cls.extend([[0.0]*768]*(self.padding_len-len(bert_cls)))
            visual_feats.extend([[0.0]*2048]*(self.padding_len-len(visual_feats)))

        # Multi-hot target over the object slots.
        target = [0] * self.padding_len
        for t in target_id:
            target[int(t)] = 1

        return {
            'ids': torch.tensor(ids, dtype=torch.long),  # question input_ids produced by the tokenizer
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(target, dtype=torch.float),
            'bert_cls': torch.tensor(bert_cls, dtype=torch.float),
            'visual_feats': torch.tensor(visual_feats, dtype=torch.float),
            'position_emb': self._position_emb_tensor,
        }
# Build the training dataset (400 object slots per document) and wrap it in a
# shuffling DataLoader that yields batches of 8 samples.
training_set = pdfvqa_training(train_df, train_visual_feats, tokenizer, 400, positional_encoding)
train_params = dict(batch_size=8, shuffle=True, num_workers=0)
training_loader = DataLoader(training_set, **train_params)

(…)cased/resolve/main/tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

(…)rt-base-uncased/resolve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

(…)bert-base-uncased/resolve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)base-uncased/resolve/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## Define the baseline models
We define a transformer-based baseline model to learn the contextual information of each RoI feature. A pointer-net style layer is built on top of the augmented feature representations.

In [None]:

import torch.nn.functional as F
class qa_models(torch.nn.Module):
    """Baseline question-answering model that scores every object (RoI) of a document.

    The question is encoded token by token with "bert-base-uncased". Every object is
    represented by its projected visual feature concatenated with its textual feature.
    The object sequence goes through a transformer encoder and then a transformer
    decoder that attends to the question tokens. A final linear layer produces one
    logit per object.
    """
    def __init__(self):
        super(qa_models, self).__init__()
        self.l1 = BertModel.from_pretrained("bert-base-uncased")  # bert-base encoder for the question representation
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.visual_projector = torch.nn.Linear(2048,768)  # maps 2048-d visual features to 768-d
        self.object_linear = torch.nn.Linear(768*2,768)  # fuses [visual ; textual] back to 768-d

        self.encoder_layer = torch.nn.TransformerEncoderLayer(d_model=768, nhead=8,batch_first=True) # A transformer encoder layer to contextualise the RoI features among themselves
        self.transformer_encoder = torch.nn.TransformerEncoder(self.encoder_layer, num_layers=2)

        self.decoder_layer = torch.nn.TransformerDecoderLayer(d_model=768, nhead=8,batch_first=True) # A transformer decoder layer to learn the coherence between questions and RoI features
        self.transformer_decoder = torch.nn.TransformerDecoder(self.decoder_layer, num_layers=2)
        self.classifier = torch.nn.Linear(768, 1)



    def forward(self, input_ids, attention_mask, token_type_ids, bert_cls, visual_feats, position_emb):
        """Return one logit per object slot.

        :param input_ids, attention_mask, token_type_ids: tokenised question, passed to BERT
        :param bert_cls: (batch, objects, 768) textual object features
        :param visual_feats: (batch, objects, 2048) visual object features
        :param position_emb: positional encoding added to the fused object features
        :return: (batch, objects) tensor of raw logits (no sigmoid applied)
        """
        # Per-token question representation from the bert-base backbone (first model output).
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        question = output_1[0]
        question = self.pre_classifier(question)
        question = torch.tanh(question)
        question = self.dropout(question)

        # Fuse the projected visual feature with the textual feature of every object.
        visual_feats = self.visual_projector(visual_feats)
        objt_emb = torch.cat((visual_feats,bert_cls),dim=2)
        objt_emb = self.object_linear(objt_emb)

        # Add the position encoding to the fused RoI representations.
        objt_emb = objt_emb + position_emb

        # Self-attention over the object sequence.
        encoder_output = self.transformer_encoder(objt_emb)

        # Cross-attention from objects to question tokens. Padded question tokens
        # (attention_mask == 0) are masked out so they cannot be attended to.
        question_padding = None if attention_mask is None else (attention_mask == 0)
        decoder_output = self.transformer_decoder(
            encoder_output, question, memory_key_padding_mask=question_padding
        )

        # Pointer-style scoring layer: one logit per object.
        output1 = self.classifier(decoder_output)
        output1 = output1.squeeze(2)

        return output1


In [None]:
# Instantiate the baseline and move it to the selected device. `.to` returns the
# module itself, so the trailing expression still displays the architecture.
model = qa_models().to(device)
model

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

qa_models(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

## Training

In [None]:
# Creating the loss function and optimizer
# BCE-with-logits loss over the per-object logits, optimised with Adam.
# The learning rate below is the value to tune.
loss_function = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

In [None]:
def calculate_exact_match_ratio(true_labels, predicted_labels):
    """Fraction of samples whose predicted label equals the true label exactly.

    :param true_labels: sequence of per-sample labels (e.g. multi-hot lists)
    :param predicted_labels: sequence of per-sample predictions, indexed in parallel
    :return: matches / len(true_labels); 0.0 when true_labels is empty
    """
    total = len(true_labels)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    exact_match = sum(1 for i in range(total) if true_labels[i] == predicted_labels[i])
    return exact_match / total

In [None]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
from sklearn.metrics import precision_score,f1_score
from tqdm import tqdm
def train(epoch):
  """Train the global `model` for `epoch` passes over `training_loader`.

  Uses the module-level `model`, `optimizer`, `loss_function`, `device` and
  `training_loader`. After every pass it prints the mean batch loss and the
  exact-match ratio between the thresholded predictions and the targets.

  :param epoch: number of passes over the training data
  """
  for etime in range(epoch):
    tr_loss = 0.0
    num_batches = 0
    predict_list = []
    target_list = []
    model.train()
    # tqdm's second positional argument is `desc`, so the label is passed by keyword.
    for data in tqdm(training_loader, desc="epoch {}/{}".format(etime + 1, epoch)):
      ids = data['ids'].to(device, dtype = torch.long)
      mask = data['mask'].to(device, dtype = torch.long)
      token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
      targets = data['targets'].to(device, dtype = torch.float)
      bert_cls = data['bert_cls'].to(device, dtype = torch.float)
      visual_feats = data['visual_feats'].to(device, dtype = torch.float)
      position_emb = data['position_emb'].to(device, dtype = torch.float)

      optimizer.zero_grad()
      outputs1 = model(ids, mask, token_type_ids, bert_cls, visual_feats, position_emb)
      loss = loss_function(outputs1, targets)
      loss.backward()
      optimizer.step()

      # Threshold the detached probabilities; only used for the epoch metric.
      preds = (torch.sigmoid(outputs1.detach()) > 0.5).to(torch.int)
      predict_list.extend(preds.to("cpu").tolist())
      target_list.extend(targets.to("cpu").to(torch.int).tolist())
      tr_loss += loss.item()
      num_batches += 1

    epoch_accu = calculate_exact_match_ratio(target_list,predict_list)
    epoch_loss = tr_loss / max(num_batches, 1)
    print("epoch {}: loss={:.6f} exact_match={}".format(etime + 1, epoch_loss, epoch_accu))
  return

In [None]:
# Run 10 training epochs; the exact-match ratio on the training data is printed after each one.
train(10)

  0%|          | 0/494 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  'bert_cls': torch.tensor(bert_cls, dtype=torch.float),
100%|██████████| 494/494 [06:17<00:00,  1.31it/s]


0.005315110098709187


100%|██████████| 494/494 [06:18<00:00,  1.31it/s]


0.09516578081498354


100%|██████████| 494/494 [06:15<00:00,  1.31it/s]


0.24727916983042267


100%|██████████| 494/494 [06:16<00:00,  1.31it/s]


0.26474310301189574


100%|██████████| 494/494 [06:16<00:00,  1.31it/s]


0.28170083523158695


100%|██████████| 494/494 [06:16<00:00,  1.31it/s]


0.2905593520627689


100%|██████████| 494/494 [06:16<00:00,  1.31it/s]


0.30372057706909644


100%|██████████| 494/494 [06:15<00:00,  1.32it/s]


0.3186535054416603


100%|██████████| 494/494 [06:15<00:00,  1.32it/s]


0.32067830928878766


100%|██████████| 494/494 [06:12<00:00,  1.33it/s]

0.3457352568969881





In [None]:
# Mount Google Drive so the checkpoint can be written there. The module is imported
# under an alias so it does not rebind the global `drive`, which holds the PyDrive
# client created by authenticate().
from google.colab import drive as colab_drive
colab_drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Persist the trained weights (state_dict only) to the mounted Google Drive.
torch.save(model.state_dict(), '/content/drive/MyDrive/textual_visual_epoch10.pth')


## Testing

Load the test features, then attach the "bert_cls" textual representation and the "visual_feats" visual feature to every object in `test_doc_info`, mirroring what was done for the training data.

In [None]:
# Load the pre-extracted test features from the downloaded pickle files.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('test_visual_feats.pkl','rb') as f:
  test_doc_info = pickle.load(f)

with open('test_bert_cls.pkl','rb') as f:
  test_bert_cls = pickle.load(f)

In [None]:
# Walk documents -> pages -> objects in storage order and attach to every object
# its textual feature (consumed sequentially from the flat bert_cls list) and its
# visual feature (taken from the page's visual_list at the object's position).
bert_cls_list = test_bert_cls['bert_cls'].tolist()
bid = 0  # running index into bert_cls_list across all documents
for doc_entry in test_doc_info.values():
  for page_info in doc_entry['pages'].values():
    for slot, objt in enumerate(page_info['objects'].values()):
      objt['bert_cls'] = bert_cls_list[bid]
      objt['visual_feats'] = page_info['visual_list'][slot]
      bid += 1

In [None]:
import ast
# Only the question text and the document id (pmcid) are read at inference time.
test_df = pd.read_csv('test_dataframe.csv')[['question', 'pmcid']]

In [None]:
# Tokenizer matching the "bert-base-uncased" question encoder (same call as in the training section).
# NOTE(review): passing `truncation=True` here does not appear to activate truncation for later
# encode calls (the inference log still warns "Truncation was not explicitly activated") — confirm intent.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", truncation=True, do_lower_case=True)
def convert_index2onehot(ids, max_len):
  """Return a list of `max_len` zeros with a 1 at every position listed in `ids`.

  Each entry of `ids` is converted with int() before it is used as an index.
  This repeats the definition from the data preprocessing section.
  """
  one_hot = [0] * max_len
  for position in map(int, ids):
    one_hot[position] = 1
  return one_hot
class pdfvqa_inference(Dataset):
    """Inference dataset pairing each question with the object (RoI) sequence of its document.

    :param dataframe: frame with `question` and `pmcid` (document id) columns
    :param doc_info: mapping str(pmcid) -> {'pages': {page: {'objects': {key: obj}}}}
                     where each obj carries 'global_id', 'bert_cls' and 'visual_feats'
    :param tokenizer: tokenizer exposing `encode_plus`
    :param max_len: number of object slots per sample (longer documents are
                    truncated, shorter ones are zero-padded)
    :param position_emb: positional encoding returned unchanged with every sample
    :param max_question_len: token length every question is padded/truncated to
    """
    def __init__(self, dataframe, doc_info, tokenizer, max_len, position_emb, max_question_len=100):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.question
        self.doc_info = doc_info
        self.padding_len = max_len
        self.doc_id = dataframe.pmcid
        self.position_emb = position_emb
        self.max_question_len = max_question_len
        # Convert the positional encoding once instead of on every __getitem__ call.
        self._position_emb_tensor = torch.as_tensor(position_emb, dtype=torch.float)

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # Positional (.iloc) access: the DataLoader hands out 0..len-1, which only
        # coincides with the frame's labels when its index is the default RangeIndex.
        text = self.text.iloc[index]
        doc_id = str(self.doc_id.iloc[index])  # doc_info is keyed by the string form of pmcid

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_question_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        bert_cls = []        # textual feature of every object, in reading order
        visual_feats = []    # visual feature of every object, in reading order
        global_id_list = []  # global id of every object, parallel to the features

        for p in self.doc_info[doc_id]['pages']:
            page = self.doc_info[doc_id]['pages'][p]
            for obj in page['objects']:
                objt = page['objects'][obj]
                bert_cls.append(objt['bert_cls'])
                visual_feats.append(objt['visual_feats'])
                global_id_list.append(objt['global_id'])

        # Truncate or pad so that every sample has exactly padding_len object slots.
        if len(bert_cls) >= self.padding_len:
            bert_cls = bert_cls[:self.padding_len]
            visual_feats = visual_feats[:self.padding_len]
            global_id_list = global_id_list[:self.padding_len]
        else:
            bert_cls.extend([[0.0]*768]*(self.padding_len-len(bert_cls)))
            visual_feats.extend([[0.0]*2048]*(self.padding_len-len(visual_feats)))
            # Padded slots get the id -2.
            global_id_list.extend([-2]*(self.padding_len-len(global_id_list)))
        # The last slot always reports the id -1, whatever it held before.
        global_id_list[-1] = -1

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'bert_cls': torch.tensor(bert_cls, dtype=torch.float),
            'visual_feats': torch.tensor(visual_feats, dtype=torch.float),
            'position_emb': self._position_emb_tensor,
            # NOTE: the misspelled key is read under this exact name by inference().
            'gloabl_id': torch.tensor(global_id_list, dtype=torch.long),
        }
# Build the test dataset (400 object slots per document) and wrap it in a
# non-shuffling DataLoader so predictions keep the dataframe's row order.
test_set = pdfvqa_inference(test_df, test_doc_info, tokenizer, 400, positional_encoding)
test_params = dict(batch_size=8, shuffle=False, num_workers=0)
testing_loader = DataLoader(test_set, **test_params)

In [None]:
def extracting(pre_ids, global_ids):
  """Translate per-slot 0/1 predictions into the global ids of the selected slots.

  :param pre_ids: one sequence of 0/1 flags per sample
  :param global_ids: one sequence of global ids per sample, parallel to `pre_ids`
  :return: for every sample, the list of global ids whose flag equals 1
  """
  def _selected(flags, gids):
    return [gids[slot] for slot, flag in enumerate(flags) if flag == 1]

  return [_selected(flags, global_ids[row]) for row, flags in enumerate(pre_ids)]

In [None]:
# Defining the inference function: run the trained model over the test set and collect its thresholded predictions
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

from tqdm import tqdm
def inference():
    """Run the global `model` over `testing_loader` without tracking gradients.

    Uses the module-level `model`, `device` and `testing_loader`.

    :return: (predict_list, global_id_list) — for every sample, the list of 0/1
             predictions per object slot (sigmoid > 0.5) and the parallel list
             of global ids supplied by the dataset
    """
    predict_list = []
    global_id_list = []
    model.eval()
    # No gradients are needed for prediction; skipping them avoids building the autograd graph.
    with torch.no_grad():
        # tqdm's second positional argument is `desc`, so the label is passed by keyword.
        for data in tqdm(testing_loader, desc="inference"):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            bert_cls = data['bert_cls'].to(device, dtype = torch.float)
            visual_feats = data['visual_feats'].to(device, dtype = torch.float)
            position_emb = data['position_emb'].to(device, dtype = torch.float)

            outputs1 = model(ids, mask, token_type_ids, bert_cls, visual_feats, position_emb)
            preds = (torch.sigmoid(outputs1) > 0.5).to(torch.int)
            predict_list.extend(preds.to("cpu").tolist())
            # The ids are only collected, never fed to the model, so they stay off the device.
            global_id_list.extend(data['gloabl_id'].to(dtype = torch.long).tolist())
    return predict_list, global_id_list

In [None]:
# Predict on the whole test set; both lists hold one entry per test question.
predict_list, global_id_list = inference()

  0%|          | 0/141 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 141/141 [01:40<00:00,  1.41it/s]


In [None]:
# NOTE(review): this writes the same model that was already saved as
# 'textual_visual_epoch10.pth' after train(10); the "epoch5" file name looks stale — confirm.
torch.save(model.state_dict(), '/content/drive/MyDrive/textual_visual_epoch5.pth')


In [None]:
# Convert the per-slot 0/1 predictions into lists of predicted global ids, one list per question.
output = extracting(predict_list, global_id_list)

In [None]:
# Assemble the submission table: a running row id plus the predicted global ids
# of every question, written to submission.csv without the frame index.
id_list = range(0, len(output))
df = pd.DataFrame(id_list, columns=['id']).assign(answer=output)

df.to_csv('submission.csv', index=False)
