# RPT (Research Paper Tagger)

In [1]:
import os
import zipfile
import json
import random
from tqdm import tqdm
import plotly
import plotly.express as px
import plotly.graph_objects as go

import numpy as np
import pandas as pd

from helpers import tokenize_and_format, flat_accuracy

import torch
from transformers import BertForSequenceClassification, DistilBertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup

In [2]:
# Seed Python's, NumPy's and PyTorch's random number generators with the same fixed value.
random.seed(0)
np.random.seed(0)

torch.manual_seed(0)
# Enforcement of deterministic algorithms is explicitly left off.
# NOTE(review): with this disabled, the seeds above may not be enough to get identical results across runs — confirm whether that matters here.
torch.use_deterministic_algorithms(False)
# Confirm that the GPU is detected; the assert stops the notebook here when CUDA is unavailable.

assert torch.cuda.is_available()

# Get the GPU device name and the number of visible GPUs.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")
# Device handle used by later cells when moving batches to the GPU.
device = torch.device("cuda")

Found device: NVIDIA GeForce RTX 2060 with Max-Q Design, n_gpu: 1


In [3]:
def _load_json(path):
    """Parse the whole file at ``path`` as one JSON document and return the result.

    The encoding is pinned to UTF-8 so parsing does not depend on the
    platform's default codec.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


# NOTE(review): the files carry a .jsonl extension but are parsed as single JSON
# documents (json.load), not line by line — confirm that matches how they were written.
training_data = _load_json("Data/Raw data/training_data.jsonl")
validation_data = _load_json("Data/Raw data/validation_data.jsonl")
test_data = _load_json("Data/Raw data/test_data.jsonl")

# Lookup tables between label strings and integer label IDs.
label_string_to_ID = _load_json("Data/Metadata/label_string_to_ID.jsonl")
label_ID_to_string = _load_json("Data/Metadata/label_ID_to_string.jsonl")

### Predictions using only abstract

In [4]:
# Every example is read as example[0][2] for the model input text (the abstract,
# per this section's heading) and example[1] for the label string.
training_inputs = [example[0][2] for example in training_data]
training_label_strings = [example[1] for example in training_data]

validation_inputs = [example[0][2] for example in validation_data]
validation_label_strings = [example[1] for example in validation_data]

test_inputs = [example[0][2] for example in test_data]
test_label_strings = [example[1] for example in test_data]

In [5]:
# Sequence-length setting handed to the tokenization helper.
# NOTE(review): how inputs longer or shorter than this are handled is decided inside helpers.tokenize_and_format — confirm there.
max_seq_length = 300

# Tokenize each split; the helper's two return values are kept as the token IDs and the attention masks.
training_input_ids, training_attention_masks = tokenize_and_format(training_inputs, max_seq_length)
validation_input_ids, validation_attention_masks = tokenize_and_format(validation_inputs, max_seq_length)
test_input_ids, test_attention_masks = tokenize_and_format(test_inputs, max_seq_length)

In [6]:
# Translate every label string to its integer ID (all three splits first, so a
# missing label surfaces before any tensor below is rebuilt).
training_label_IDs = [label_string_to_ID[label] for label in training_label_strings]
validation_label_IDs = [label_string_to_ID[label] for label in validation_label_strings]
test_label_IDs = [label_string_to_ID[label] for label in test_label_strings]

# Convert the lists into tensors: the tokenizer outputs are concatenated along
# dim 0, and the label ID lists become one tensor per split.
training_input_ids = torch.cat(training_input_ids, dim=0)
validation_input_ids = torch.cat(validation_input_ids, dim=0)
test_input_ids = torch.cat(test_input_ids, dim=0)

training_attention_masks = torch.cat(training_attention_masks, dim=0)
validation_attention_masks = torch.cat(validation_attention_masks, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)

training_label_IDs = torch.tensor(training_label_IDs)
validation_label_IDs = torch.tensor(validation_label_IDs)
test_label_IDs = torch.tensor(test_label_IDs)

In [7]:
def _as_examples(input_ids, attention_masks, label_ids, num_examples):
    """Return a list of (input_ids, attention_mask, label) triples, one per example index."""
    examples = []
    for row in range(num_examples):
        examples.append((input_ids[row], attention_masks[row], label_ids[row]))
    return examples


# One list of per-example triples per split; the count comes from the raw input lists.
train_set = _as_examples(training_input_ids, training_attention_masks, training_label_IDs, len(training_inputs))
val_set = _as_examples(validation_input_ids, validation_attention_masks, validation_label_IDs, len(validation_inputs))
test_set = _as_examples(test_input_ids, test_attention_masks, test_label_IDs, len(test_inputs))

#### Fine-tune the DistilBERT model

In [9]:
# Load the pretrained checkpoint with a sequence-classification head on top.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", # DistilBERT checkpoint with an uncased vocab (not the 12-layer BERT model).
    num_labels = 20, # The number of output labels; presumably the number of entries in label_string_to_ID (TODO confirm).
    output_attentions = False, # Whether the model returns attention weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Move the model's parameters to the GPU.
model.cuda()





# Index of this hyperparameter configuration; it selects the output directory.
hyperparameter_config_iter = 1

save_path = "Saved models/Hyperparameter configuration " + str(hyperparameter_config_iter)

# Refuse to reuse an existing directory so an earlier run's artifacts are never overwritten.
if os.path.exists(save_path):
    raise FileExistsError(
        "ERROR! Hyperparameter config " + str(hyperparameter_config_iter)
        + " already exists at '" + save_path + "'. "
        + "Increment hyperparameter_config_iter or remove that directory before re-running."
    )

# Creating the nested Plots directory also creates save_path itself.
os.makedirs(save_path + "/Plots")


# Fine-tuning hyperparameters

batch_size = 16
learning_rate = 5e-5  # args.learning_rate - default is 5e-5
adam_epsilon = 1e-8   # args.adam_epsilon  - default is 1e-8
epochs = 15

# NOTE(review): this is the AdamW class imported from transformers, which upstream
# has deprecated in favour of torch.optim.AdamW — confirm the defaults (e.g. weight
# decay) match before switching.
optimizer = AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = adam_epsilon
                )

# Persist every tunable value chosen in this cell, so the saved configuration
# is enough to tell runs apart and to reproduce the optimizer settings.
hyperparameter_dict = {
    'batch_size': batch_size,
    'epochs': epochs,
    'learning_rate': learning_rate,
    'adam_epsilon': adam_epsilon,
}

with open(save_path + "/Hyperparameters.json", 'w') as f:
    json.dump(hyperparameter_dict, f)

def save(model, optimizer, output_path):
    """Write the model's and the optimizer's state dicts to ``output_path`` as one checkpoint.

    The checkpoint is a dict with the keys 'model_state_dict' and
    'optimizer_state_dict', serialized with torch.save.
    """
    checkpoint = {}
    checkpoint['model_state_dict'] = model.state_dict()
    checkpoint['optimizer_state_dict'] = optimizer.state_dict()
    torch.save(checkpoint, output_path)

# function to get the classification accuracy on any data set (training, validation or test)
def get_performance(data_set, eval_model=None, eval_batch_size=None, eval_device=None):
    """Return the fraction of examples in ``data_set`` whose label is predicted correctly.

    Args:
        data_set: sliceable sequence of (input_ids, attention_mask, label)
            tensor triples, as built for train_set / val_set / test_set.
        eval_model: model to evaluate; defaults to the notebook-level ``model``.
        eval_batch_size: number of examples per forward pass; defaults to the
            notebook-level ``batch_size``.
        eval_device: device the batches are moved to; defaults to the
            notebook-level ``device``.

    Returns:
        Accuracy as a Python float in [0, 1].

    Raises:
        ValueError: if ``data_set`` is empty (accuracy would be 0 / 0).

    Side effects:
        Leaves the evaluated model in evaluation mode.
    """
    if len(data_set) == 0:
        raise ValueError("get_performance() needs at least one example; got an empty data set.")

    # Fall back to the notebook globals so existing one-argument calls keep working.
    eval_model = model if eval_model is None else eval_model
    eval_batch_size = batch_size if eval_batch_size is None else eval_batch_size
    eval_device = device if eval_device is None else eval_device

    # Put the model in evaluation mode
    eval_model.eval()

    total_correct = 0

    # No compute graph is needed during evaluation, since there is no backprop.
    with torch.no_grad():
        # Stepping by the batch size never produces an empty batch; the last
        # slice is simply shorter when the sizes do not divide evenly.
        for batch_start in range(0, len(data_set), eval_batch_size):
            batch = data_set[batch_start:batch_start + eval_batch_size]

            # Stack the per-example tensors into batch tensors on the target device.
            b_input_ids = torch.stack([data[0] for data in batch]).to(eval_device)
            b_input_mask = torch.stack([data[1] for data in batch]).to(eval_device)
            b_labels = torch.stack([data[2] for data in batch]).to(eval_device)

            # Only the logits are needed, so no labels are passed and no loss is computed.
            logits = eval_model(b_input_ids, attention_mask=b_input_mask).logits

            # Count the examples in this batch whose highest-scoring class equals the label.
            pred_flat = np.argmax(logits.detach().cpu().numpy(), axis=1).flatten()
            labels_flat = b_labels.to('cpu').numpy().flatten()
            total_correct += int(np.sum(pred_flat == labels_flat))

    return total_correct / len(data_set)



# training loop

# Best validation accuracy seen so far; -1 guarantees the first epoch is checkpointed.
max_val_acc = -1

# Filled in after training with the per-epoch curves collected below.
metric_vs_epoch = dict()

epoch_list = []
training_loss_list = []
training_acc_list = []

val_acc_list = []

# For each epoch...
for epoch_i in range(0, epochs):

    epoch_list.append(epoch_i + 1)

    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode (get_performance leaves it in eval mode).
    model.train()

    # Step through the training set batch_size examples at a time. Stepping by
    # the batch size yields exactly the non-empty batches, so the progress bar
    # total is right even when the set size is a multiple of batch_size.
    # NOTE(review): batches are taken in the stored order every epoch (no shuffling) — confirm this is intended.
    for batch_start in tqdm(range(0, len(train_set), batch_size)):

        batch = train_set[batch_start:batch_start + batch_size]

        # Stack the per-example tensors into batch tensors and move them to the GPU.
        b_input_ids = torch.stack([data[0] for data in batch]).to(device)
        b_input_mask = torch.stack([data[1] for data in batch]).to(device)
        b_labels = torch.stack([data[2] for data in batch]).to(device)

        # Clear the previously calculated gradient
        model.zero_grad()

        # Perform a forward pass; passing the labels makes the model return the loss.
        outputs = model(b_input_ids,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs.loss

        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

    # ========================================
    #               Validation
    # ========================================
    # After each training epoch, measure accuracy on both the training set
    # and the validation set.

    training_acc = get_performance(train_set)
    val_acc = get_performance(val_set)

    print(f"Total loss: {total_train_loss}")
    print(f"Validation accuracy: {val_acc}")

    val_acc_list.append(val_acc)
    training_acc_list.append(training_acc)

    training_loss_list.append(total_train_loss)

    # Checkpoint only on a strict improvement in validation accuracy, so ties
    # keep the earlier epoch's weights.
    if val_acc > max_val_acc:

        max_val_acc = val_acc

        model.save_pretrained(save_path + "/best validation accuracy model")
        save(model, optimizer, save_path + "/best validation accuracy.modelState")


print("")
print("Training complete!")

# Gather the per-epoch curves into the shared metrics dict and dump them as JSON.
metric_vs_epoch.update({
    "Epochs": epoch_list,
    "Training loss": training_loss_list,
    "Training accuracy": training_acc_list,
    "Validation accuracy": val_acc_list,
})

with open(save_path + "/Plots/Plot data.json", 'w') as f:
    json.dump(metric_vs_epoch, f)

# Training-loss curve, written as a standalone HTML file.
metric_vs_epoch_df = pd.DataFrame(
    metric_vs_epoch,
    columns = ["Epochs", "Training loss", "Training accuracy", "Validation accuracy"],
)
fig = px.line(
    metric_vs_epoch_df,
    x='Epochs',
    y="Training loss",
    title="Training loss vs epochs",
)
plotly.offline.plot(fig, filename = save_path + "/Plots/Training loss.html")

# Long-format table (one row per epoch and dataset) so both accuracy curves
# share one figure, coloured by the "Dataset" column.
accuracy_vs_epoch = {
    "Epochs": epoch_list + epoch_list,
    "Accuracy": training_acc_list + val_acc_list,
    "Dataset": ["Training"]*len(training_acc_list) + ["Validation"]*len(val_acc_list),
}
accuracy_vs_epoch_df = pd.DataFrame(accuracy_vs_epoch, columns = ["Epochs", "Accuracy", "Dataset"])

fig = px.line(
    accuracy_vs_epoch_df,
    x='Epochs',
    y='Accuracy',
    color='Dataset',
    markers=True,
    title="Training/Validation accuracy vs epochs",
)
plotly.offline.plot(fig, filename = save_path + "/Plots/Accuracy.html")

# Record the best validation accuracy reached during training.
with open(save_path + "/Best validation accuracy.txt", 'w') as f:
    f.write("Best validation accuracy: " + str(max_val_acc))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier


Training...


100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [00:38<00:00,  2.23it/s]


Total loss: 228.72888100147247
Validation accuracy: 0.44571428571428573

Training...


100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [00:37<00:00,  2.34it/s]


Total loss: 159.88833129405975
Validation accuracy: 0.5142857142857142

Training...


100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [00:37<00:00,  2.32it/s]


Total loss: 114.43488538265228
Validation accuracy: 0.5314285714285715

Training...


100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [00:37<00:00,  2.30it/s]


Total loss: 78.30347815155983
Validation accuracy: 0.5371428571428571

Training...


100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [00:38<00:00,  2.29it/s]


Total loss: 50.192843556404114
Validation accuracy: 0.5885714285714285

Training...


100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [00:38<00:00,  2.28it/s]


Total loss: 32.76849787682295
Validation accuracy: 0.5885714285714285

Training...


100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [00:38<00:00,  2.27it/s]


Total loss: 23.614997006952763
Validation accuracy: 0.5942857142857143

Training...


100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [00:38<00:00,  2.26it/s]


Total loss: 15.03026608005166
Validation accuracy: 0.6285714285714286

Training...


100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [00:38<00:00,  2.26it/s]


Total loss: 10.321120761334896
Validation accuracy: 0.6

Training...


100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [00:38<00:00,  2.25it/s]


Total loss: 8.06671422533691
Validation accuracy: 0.5542857142857143

Training...


100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [00:38<00:00,  2.25it/s]


Total loss: 6.692962887696922
Validation accuracy: 0.6

Training...


100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [00:38<00:00,  2.25it/s]


Total loss: 3.3365700813010335
Validation accuracy: 0.5942857142857143

Training...


100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [00:38<00:00,  2.24it/s]


Total loss: 3.185480300337076
Validation accuracy: 0.5657142857142857

Training...


100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [00:38<00:00,  2.24it/s]


Total loss: 3.4594324002973735
Validation accuracy: 0.5828571428571429

Training...


100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [00:38<00:00,  2.24it/s]


Total loss: 4.78477878915146
Validation accuracy: 0.6

Training complete!


### 4) Prediction using title, abstract, and the list of authors

In [None]:
"""
training_inputs = []
training_label_strings = []

validation_inputs = []
validation_label_strings = []

test_inputs = []
test_label_strings = []

for training_example in training_data:
    
    training_input = training_example[0][0] + ' [SEP] ' + training_example[0][2] + ' [SEP] ' + training_example[0][1].replace(' |', ',')
    training_inputs.append(training_input)
    
    training_label_strings.append(training_example[1])
    
for validation_example in validation_data:
    
    validation_input = validation_example[0][0] + ' [SEP] ' + validation_example[0][2] + ' [SEP] ' + validation_example[0][1].replace(' |', ',')
    validation_inputs.append(validation_input)
    
    validation_label_strings.append(validation_example[1])
    
for test_example in test_data:
    
    test_input = test_example[0][0] + ' [SEP] ' + test_example[0][2] + ' [SEP] ' + test_example[0][1].replace(' |', ',')
    test_inputs.append(test_input)
    
    test_label_strings.append(test_example[1])
"""

In [None]:
"""
training_input_ids, training_attention_masks = tokenize_and_format(training_inputs)
validation_input_ids, validation_attention_masks = tokenize_and_format(validation_inputs)
test_input_ids, test_attention_masks = tokenize_and_format(test_inputs)
"""

In [None]:
"""
training_label_IDs = []
validation_label_IDs = []
test_label_IDs = []

for training_label_string in training_label_strings:
    training_label_IDs.append(label_string_to_ID[training_label_string])
    
for validation_label_string in validation_label_strings:
    validation_label_IDs.append(label_string_to_ID[validation_label_string])
    
for test_label_string in test_label_strings:
    test_label_IDs.append(label_string_to_ID[test_label_string])
    
    
# Convert the lists into tensors.
training_input_ids = torch.cat(training_input_ids, dim=0)
training_attention_masks = torch.cat(training_attention_masks, dim=0)
training_label_IDs = torch.tensor(training_label_IDs)

validation_input_ids = torch.cat(validation_input_ids, dim=0)
validation_attention_masks = torch.cat(validation_attention_masks, dim=0)
validation_label_IDs = torch.tensor(validation_label_IDs)

test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)
test_label_IDs = torch.tensor(test_label_IDs)
"""

In [None]:
"""
train_set = [(training_input_ids[i], training_attention_masks[i], training_label_IDs[i]) for i in range(len(training_inputs))]
val_set = [(validation_input_ids[i], validation_attention_masks[i], validation_label_IDs[i]) for i in range(len(validation_inputs))]
test_set = [(test_input_ids[i], test_attention_masks[i], test_label_IDs[i]) for i in range(len(test_inputs))]
"""

In [None]:
"""
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 20, # The number of output labels.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.cuda()
"""

In [None]:
"""
# Fine-tuning hyperparameters

batch_size = 16
optimizer = AdamW(model.parameters(),
                  lr = 5e-5, # args.learning_rate - default is 5e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8
                )
epochs = 5


# function to get validation accuracy
def get_validation_performance(val_set):
    # Put the model in evaluation mode
    model.eval()

    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_loss = 0

    num_batches = int(len(val_set)/batch_size) + 1

    total_correct = 0

    for i in range(num_batches):

        end_index = min(batch_size * (i+1), len(val_set))

        batch = val_set[i*batch_size:end_index]

        if len(batch) == 0: continue

        input_id_tensors = torch.stack([data[0] for data in batch])
        input_mask_tensors = torch.stack([data[1] for data in batch])
        label_tensors = torch.stack([data[2] for data in batch])

        # Move tensors to the GPU
        b_input_ids = input_id_tensors.to(device)
        b_input_mask = input_mask_tensors.to(device)
        b_labels = label_tensors.to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            outputs = model(b_input_ids, 
                                    token_type_ids=None, 
                                    attention_mask=b_input_mask,
                                    labels=b_labels)
            loss = outputs.loss
            logits = outputs.logits

            # Accumulate the validation loss.
            total_eval_loss += loss.item()

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            # Calculate the number of correctly labeled examples in batch
            pred_flat = np.argmax(logits, axis=1).flatten()
            labels_flat = label_ids.flatten()
            num_correct = np.sum(pred_flat == labels_flat)
            total_correct += num_correct
        
    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_correct / len(val_set)
    return avg_val_accuracy



# training loop

# For each epoch...
for epoch_i in range(0, epochs):
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode.
    model.train()

    # For each batch of training data...
    num_batches = int(len(train_set)/batch_size) + 1

    for i in range(num_batches):
      
        print("Batch " + str(i) + " out of " + str(num_batches) + " batches.")
        
        end_index = min(batch_size * (i+1), len(train_set))

        batch = train_set[i*batch_size:end_index]

        if len(batch) == 0: continue

        input_id_tensors = torch.stack([data[0] for data in batch])
        input_mask_tensors = torch.stack([data[1] for data in batch])
        label_tensors = torch.stack([data[2] for data in batch])

        # Move tensors to the GPU
        b_input_ids = input_id_tensors.to(device)
        b_input_mask = input_mask_tensors.to(device)
        b_labels = label_tensors.to(device)

        # Clear the previously calculated gradient
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask, 
                            labels=b_labels)
        loss = outputs.loss
        logits = outputs.logits

        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Update parameters and take a step using the computed gradient.
        optimizer.step()
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set. Implement this function in the cell above.
    print(f"Total loss: {total_train_loss}")
    val_acc = get_validation_performance(val_set)
    print(f"Validation accuracy: {val_acc}")
    
print("")
print("Training complete!")
"""