# Code to use previous generated trained model and get summmary for new files

In [25]:
import fitz
import nltk
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
import pandas as pd
import numpy as np
from transformers import TrainingArguments, Trainer
from datasets import load_metric
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from torch.autograd import Variable
from sklearn.model_selection import train_test_split
import os
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence, pad_packed_sequence
from sklearn.metrics import f1_score, accuracy_score
import matplotlib.pyplot as plt
from models import LSTMNet
from constants import BASE_DIR

In [29]:
PDF_FILE = os.path.join(BASE_DIR, 'papers', 'P19-1106.pdf')

### Extract data from new pdf file

In [32]:
def get_new_file_abstract(file_name):
    """ Extract abstract from pdf file text"""
    with fitz.open(file_name) as doc:
        for page in doc:
            blocks = page.get_text('blocks')    
            start_abstract = False
            for block in blocks:
                if block[4].lower().strip() == 'abstract':
                    start_abstract = True
                    continue
                if start_abstract:
                    abstract=block[4]
                    start_abstract = False
                    break
    return abstract

In [35]:
def get_new_file_sentences(file_name):
    """ Extract all sentences from pdf file text"""
    all_sentences = []
    with fitz.open(file_name) as doc:
        for page in doc:
            blocks = page.get_text('blocks')    
            for block in blocks:
                if len(block[4].split()) > 10 and '@' not in block[4] and 'Proceedings' not in block[4]:
                    all_sentences.extend(nltk.sent_tokenize(block[4]))
    return all_sentences


In [36]:
abstract = get_new_file_abstract(PDF_FILE)
sentences = get_new_file_sentences(PDF_FILE)

### Generate embeddings for sentences in file

In [37]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(sentences)

### Use trained model to get contributing statements for given file

In [38]:
lstm_net = torch.load(os.path.join(BASE_DIR, 'models', 'trained_model.pt'))
HIDDEN_SIZE = 200
THRESHOLD = 0.6
inputs = torch.Tensor(embeddings)
inputs = torch.unsqueeze(inputs, 0)
random_inputs = torch.zeros(inputs.size())
final_inputs = torch.cat((inputs, random_inputs), 0)
packed_input = pack_padded_sequence(final_inputs, torch.IntTensor([290,290]), batch_first=True, enforce_sorted=False)
h_0 = Variable(torch.zeros(1, final_inputs.shape[0], HIDDEN_SIZE))
c_0 = Variable(torch.zeros(1, final_inputs.shape[0], HIDDEN_SIZE))
outputs = lstm_net(packed_input, h_0, c_0)
outputs = torch.squeeze(outputs)
predicted = (outputs > THRESHOLD).int()

In [39]:
contrib_predictions = predicted[0]
contrib_predictions = (contrib_predictions == 1).nonzero()
contributing_sentences = [sentences[i[0]] for i in contrib_predictions]

### Generate abstract summary for contributing statements using Bart model

In [41]:
from transformers import BartForConditionalGeneration, BartTokenizer
import torch

model_name = "sshleifer/distilbart-cnn-12-6"
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)

In [42]:
batch = tokenizer(' '.join(contributing_sentences), truncation=True, padding="longest", return_tensors="pt").to(device)
translated = model.generate(**batch)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
print(tgt_text)

[' The present-day peer review process is not straightforward and demands the domain knowledge, expertise, and intelligence of human reviewers. We investigate the role of reviewers’sentiments embedded within peer review texts to predict the peer review outcome. We attribute this to the use of deep neural networks and augmentation of review sentiment information in our new architecture.']


### Calculate scores for various Rouge and Bert Score metrics

In [17]:
from datasets import load_metric

rouge_score = load_metric("rouge")
scores = rouge_score.compute(
    predictions=[tgt_text], references=[abstract]
)
print(scores['rouge1'].mid)
print(scores['rouge2'].mid)
print(scores['rougeL'].mid)

Score(precision=0.9137931034482759, recall=0.2245762711864407, fmeasure=0.36054421768707484)
Score(precision=0.6140350877192983, recall=0.14893617021276595, fmeasure=0.23972602739726026)
Score(precision=0.7068965517241379, recall=0.17372881355932204, fmeasure=0.27891156462585037)


In [18]:
from datasets import load_metric
bert_score_metric = load_metric("bertscore")
bert_scores = bert_score_metric.compute(
    predictions=[tgt_text], references=[abstract], lang='en'
)
precision = np.average(bert_scores['precision'])
recall = np.average(bert_scores['recall'])
f1_score = np.average(bert_scores['f1'])
print(f'Bert scores - Precision: {precision}, Recall: {recall}, F1 score: {f1_score}')

Bert scores - Precision: 0.8878797888755798, Recall: 0.7869012355804443, F1 score: 0.834346354007721
