# RPT (Research Paper Tagger)

In [1]:
import os
import zipfile
import json
import random
from tqdm import tqdm
import plotly
import plotly.express as px
import plotly.graph_objects as go

import numpy as np
import pandas as pd

from helpers import tokenize_and_format, flat_accuracy

import torch
from transformers import BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup

from sklearn.metrics import precision_recall_fscore_support, top_k_accuracy_score
from sklearn.manifold import TSNE

In [2]:
# Seed Python, NumPy and PyTorch with the same value, in that order.
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(0)
# Deterministic-algorithm enforcement is explicitly left switched off.
torch.use_deterministic_algorithms(False)

# Everything below runs on CUDA, so stop right here if no GPU is visible.
assert torch.cuda.is_available()
device = torch.device("cuda")

# Report which GPU was found and how many are visible.
n_gpu = torch.cuda.device_count()
device_name = torch.cuda.get_device_name()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")

Found device: NVIDIA GeForce RTX 2060 with Max-Q Design, n_gpu: 1


In [3]:
def _read_json(path):
    """Open `path` in text mode and return its content parsed by `json.load`.

    Each file is parsed as one JSON document, regardless of its `.jsonl` suffix.
    """
    with open(path, "r") as handle:
        return json.load(handle)


# Dataset splits.
training_data = _read_json("Data/Raw data/training_data.jsonl")
validation_data = _read_json("Data/Raw data/validation_data.jsonl")
test_data = _read_json("Data/Raw data/test_data.jsonl")

# Label lookup tables, one per direction.
label_string_to_ID = _read_json("Data/Metadata/label_string_to_ID.jsonl")
label_ID_to_string = _read_json("Data/Metadata/label_ID_to_string.jsonl")

### Predictions using only abstract

In [4]:
def _collect_inputs_and_labels(examples):
    """Return `(input_texts, label_strings)` for a list of raw examples.

    For every example, `example[0]` holds three text fields and `example[1]`
    the label string. The input text is field 0, field 2 and field 1 joined
    with '. ', after replacing every ' |' in field 1 with ','.
    """
    input_texts = []
    label_strings = []
    for example in examples:
        fields = example[0]
        pieces = (fields[0], fields[2], fields[1].replace(' |', ','))
        input_texts.append('. '.join(pieces))
        label_strings.append(example[1])
    return input_texts, label_strings


training_inputs, training_label_strings = _collect_inputs_and_labels(training_data)
validation_inputs, validation_label_strings = _collect_inputs_and_labels(validation_data)
test_inputs, test_label_strings = _collect_inputs_and_labels(test_data)

In [5]:
# Sequence length handed to tokenize_and_format for every split.
max_seq_length = 396


def _encode(texts):
    """Tokenize `texts` with the notebook-wide `max_seq_length`."""
    return tokenize_and_format(texts, max_seq_length)


# Each call yields the input IDs and attention masks of one split.
training_input_ids, training_attention_masks = _encode(training_inputs)
validation_input_ids, validation_attention_masks = _encode(validation_inputs)
test_input_ids, test_attention_masks = _encode(test_inputs)

In [6]:
# For each split: translate label strings to integer IDs and wrap them in a
# tensor, then join the per-example ID / mask tensors along dimension 0.

# Training split.
training_label_IDs = torch.tensor(
    [label_string_to_ID[name] for name in training_label_strings]
)
training_input_ids = torch.cat(training_input_ids, dim=0)
training_attention_masks = torch.cat(training_attention_masks, dim=0)

# Validation split.
validation_label_IDs = torch.tensor(
    [label_string_to_ID[name] for name in validation_label_strings]
)
validation_input_ids = torch.cat(validation_input_ids, dim=0)
validation_attention_masks = torch.cat(validation_attention_masks, dim=0)

# Test split.
test_label_IDs = torch.tensor(
    [label_string_to_ID[name] for name in test_label_strings]
)
test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)

In [7]:
def _bundle(input_ids, attention_masks, label_ids, n_examples):
    """Return a list of `(input_ids[k], attention_masks[k], label_ids[k])` for k < n_examples."""
    return [
        (input_ids[k], attention_masks[k], label_ids[k])
        for k in range(n_examples)
    ]


train_set = _bundle(
    training_input_ids, training_attention_masks, training_label_IDs, len(training_inputs)
)
val_set = _bundle(
    validation_input_ids, validation_attention_masks, validation_label_IDs, len(validation_inputs)
)
test_set = _bundle(
    test_input_ids, test_attention_masks, test_label_IDs, len(test_inputs)
)

### Get the classification metrics for best model

In [8]:
# Number of examples sent through the model per forward pass in get_outputs.
batch_size = 8

# Name of the saved run whose checkpoint is loaded below.
best_hyperparameter_configuration = "Hyperparameter configuration 1"
model_directory = f"Saved models/{best_hyperparameter_configuration}/best validation accuracy model/"

model = BertForSequenceClassification.from_pretrained(
    model_directory,
    local_files_only=True,      # read the checkpoint from disk only
    output_hidden_states=True,  # have the model return all hidden states
)

# Move the model's weights onto the GPU.
model.cuda()

def get_outputs(data_set):
    """Run the loaded classifier over `data_set` and collect its outputs.

    Uses the notebook-level `model`, `device` and `batch_size`.

    Args:
        data_set: Sequence of `(input_ids, attention_mask, label_id)` tensor
            triples, one per example.

    Returns:
        A tuple `(CLS_vectors, predictions, labels)` of NumPy arrays:
        the last hidden layer's vector at sequence position 0 for every
        example, the argmax class index of every example's logits, and the
        flattened label IDs, all in `data_set` order.

    Raises:
        ValueError: If `data_set` is empty.
    """
    if len(data_set) == 0:
        raise ValueError("get_outputs() needs at least one example in data_set")

    # Put the model in evaluation mode.
    model.eval()

    all_labels = []
    all_logits = []
    CLS_vectors = []

    # Step through the data in slices of at most `batch_size` examples.
    for start in range(0, len(data_set), batch_size):
        batch = data_set[start:start + batch_size]

        b_input_ids = torch.stack([example[0] for example in batch]).to(device)
        b_input_mask = torch.stack([example[1] for example in batch]).to(device)
        b_labels = torch.stack([example[2] for example in batch])

        # No gradients are needed for inference.
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        all_logits.append(outputs.logits.detach().cpu().numpy())
        all_labels.append(b_labels.cpu().numpy().flatten())

        # Read the hidden states by name rather than by position in the output,
        # so extra output fields cannot be picked up by mistake, and move each
        # batch to the CPU straight away so nothing accumulates on the GPU.
        CLS_vectors.append(outputs.hidden_states[-1][:, 0, :].detach().cpu())

    predictions = np.argmax(np.concatenate(all_logits), axis=1)
    labels = np.concatenate(all_labels)

    return (torch.cat(CLS_vectors).numpy(), predictions, labels)

In [9]:
# Run the loaded model over the test split. get_outputs returns, in order, the
# position-0 vectors of the last hidden layer, the argmax class predictions,
# and the label IDs.
CLS_vectors, test_predictions, test_labels = get_outputs(test_set)

In [10]:
# Project the CLS vectors down to two dimensions for plotting.
# random_state is pinned so that re-running this cell reproduces the same
# embedding instead of drawing fresh numbers from NumPy's global generator;
# the coordinates are looked up by value further down the notebook.
t_sne = TSNE(n_components=2, random_state=0)
t_sne_vectors = t_sne.fit_transform(X=CLS_vectors)



In [11]:
# One row per test example: its input text, its 2-D t-SNE coordinates, and the
# predicted / true class names looked up from the integer class IDs.
data_dictionary = {
    'Input': test_inputs,
    'X': t_sne_vectors[:, 0],
    'Y': t_sne_vectors[:, 1],
    'predictions': [label_ID_to_string[str(class_id)] for class_id in test_predictions],
    'labels': [label_ID_to_string[str(class_id)] for class_id in test_labels],
}

df = pd.DataFrame(data_dictionary)

In [29]:
# Write the whole frame to CSV: every row of df is exported, not only the
# misclassified ones, and the DataFrame index is written as the first column
# because `index` is left at its default.
df.to_csv("RPT_misclassified.csv")

In [30]:
# Find the point whose X coordinate is displayed as 6.839929. The displayed
# value is rounded to six decimals, so match within that precision instead of
# requiring exact floating-point equality.
df[np.isclose(df['X'], 6.839929, rtol=0.0, atol=1e-6)]

Unnamed: 0,Input,X,Y,predictions,labels
48,which linguist invented the lightbulb? presupp...,6.839929,7.583899,question answering,discourse and pragmatics


In [26]:
# Discrete colour sequence used to tell the predicted classes apart.
prediction_palette = [
    "black", "blueviolet", "chocolate", "darkgray", "dodgerblue",
    "greenyellow", "gold", "lightgreen", "brown", "cyan",
    "maroon", "olive", "red", "teal", "violet",
    "mediumblue", "mediumpurple", "pink", "crimson", "orange",
]

# t-SNE scatter of the test examples, coloured by predicted class.
fig = px.scatter(
    df,
    x="X",
    y="Y",
    color="predictions",
    color_discrete_sequence=prediction_palette,
)

fig.show()
# Optional exports, left disabled:
#plotly.offline.plot(fig, filename = "predictions_cluster.html")
#fig.write_image("predictions_cluster.png")

In [27]:
# Discrete colour sequence used to tell the true classes apart.
label_palette = [
    "black", "blueviolet", "chocolate", "darkgray", "dodgerblue",
    "greenyellow", "gold", "lightgreen", "brown", "cyan",
    "maroon", "olive", "red", "teal", "violet",
    "mediumblue", "mediumpurple", "pink", "crimson", "orange",
]

# t-SNE scatter of the test examples, coloured by true label.
fig = px.scatter(
    df,
    x="X",
    y="Y",
    color="labels",
    color_discrete_sequence=label_palette,
)
fig.show()
# Optional exports, left disabled:
#plotly.offline.plot(fig, filename = "labels_cluster.html")
#fig.write_image("labels_cluster.png")

In [45]:
# Print every misclassified test example whose true label is "nlp applications":
# the input text, then the predicted class, then the true class.
# Columns are selected by name rather than by position, so the output stays
# correct even if the frame gains or reorders columns (for instance an extra
# leading index column after a CSV round-trip).
misclassified = df[df['predictions'] != df['labels']]
nlp_application_errors = misclassified[misclassified['labels'] == "nlp applications"]

for input_text, predicted_label, true_label in zip(
    nlp_application_errors['Input'],
    nlp_application_errors['predictions'],
    nlp_application_errors['labels'],
):
    print(input_text)
    print("\n")
    print(predicted_label)
    print("\n")
    print(true_label)
    print("\n\n\n\n")

unsupervised extractive summarization-based representations for accurate and explainable collaborative filtering. we pioneer the first extractive summarization-based collaborative filtering model called escofilt. our proposed model specifically produces extractive summaries for each item and user. unlike other types of explanations, summary-level explanations closely resemble real-life explanations. the strength of escofilt lies in the fact that it unifies representation and explanation. in other words, extractive summaries both represent and explain the items and users. our model uniquely integrates bert, k-means embedding clustering, and multilayer perceptron to learn sentence embeddings, representation-explanations, and user-item interactions, respectively. we argue that our approach enhances both rating prediction accuracy and user/item explainability. our experiments illustrate that escofilt’s prediction accuracy is better than the other state-of-the-art recommender models. furthe

In [42]:
# Boolean mask: True where the predicted class is "nlp applications".
df['predictions'].eq("nlp applications")

0      False
1      False
2      False
3      False
4      False
       ...  
178    False
179    False
180    False
181    False
182    False
Name: predictions, Length: 183, dtype: bool