In [None]:
!pip3 install torch
!pip3 install torchvision
!pip3 install sentencepiece
!pip3 install transformers
!pip3 install datasets
!pip install rouge-score
!pip install sacrebleu
!pip install git+https://github.com/google-research/bleurt.git

In [None]:
import datetime
import os
import time
import sys
import re
import numpy as np
import random
import pandas as pd
import nltk
import numpy as np
import json

import torch
# Fix PyTorch's global RNG seed so repeated runs start from the same state.
torch.manual_seed(49)
from datasets import load_metric, load_dataset 
from transformers import AutoTokenizer,  AutoModelForSeq2SeqLM

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import statistics


# Fetch the 'punkt' resource; presumably required by the nltk.sent_tokenize
# calls in the ROUGE cell further down — confirm against the installed nltk version.
nltk.download('punkt')

In [None]:
# Path or hub id of the fine-tuned seq2seq checkpoint to evaluate.
# TODO: this is left empty in the notebook and must be filled in before running.
model_checkpoint = ''

# Prefer the first GPU when one is visible, otherwise run on CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to(device)


In [None]:
# Read the tab-separated test split and display it.
test = pd.read_csv('data/c2t-big/test_c2t_big.csv', delimiter='\t')
test

In [None]:
# Pull the model inputs ('Data') and the reference texts ('Summaries')
# out of the test frame as plain Python lists.
test_data, test_summary = (list(test[column]) for column in ('Data', 'Summaries'))

## Beam Search

In [None]:
# Generate one hypothesis per test input with beam search
# (4 beams, at most 256 generated tokens) and collect them in `hypo_summary`.
hypo_summary = list()
for counter, text in enumerate(test_data):
    print(counter)
    # Move the inputs to the same `device` the model was placed on; the
    # original hard-coded 'cuda' here, which fails on CPU-only machines
    # even though `device` already falls back to "cpu".
    tokens = tokenizer.encode(
        'C2T: ' + text,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    ).to(device)
    generated = model.generate(tokens, num_beams=4, max_length=256)
    tgt_text = tokenizer.decode(
        generated[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    # Drop any leading/trailing square brackets and double quotes.
    summary = str(tgt_text).strip('[]""')
    print(summary)
    # Keep every hypothesis on a single line.
    hypo_summary.append(summary.replace('\n', ''))

In [None]:
# Persist the generated hypotheses, one per line.
# The original iterated over `hypo_summary_1`, which is never defined in this
# notebook; the beam-search cell stores its output in `hypo_summary`.
# The context manager closes the file even if a write fails, and the explicit
# UTF-8 encoding avoids locale-dependent failures on non-ASCII output.
with open('hypothesis.txt', 'w', encoding='utf-8') as hypo_file:
    for i in hypo_summary:
        hypo_file.write(i.replace('\n', '') + '\n')

## ROUGE and BLEU

In [None]:
metric = load_metric('rouge')


def _one_sentence_per_line(text):
    """Strip `text`, split it into sentences and join them with newlines."""
    return "\n".join(nltk.sent_tokenize(text.strip()))


# References and predictions are both reformatted to one sentence per line
# before being handed to the ROUGE metric.
ref = list(map(_one_sentence_per_line, test_summary))
beam_preds = list(map(_one_sentence_per_line, hypo_summary))
result = metric.compute(
    predictions=beam_preds,
    references=ref,
    use_stemmer=True,
)
# Report the mid F-measure of every ROUGE variant as a percentage.
beam_rouge = {key: value.mid.fmeasure * 100 for key, value in result.items()}
beam_rouge

In [None]:
# Rebind `metric` to sacreBLEU; compute_metrics below reads this global.
metric = load_metric("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from predictions and references.

    Args:
        preds: iterable of predicted strings.
        labels: iterable of reference strings.

    Returns:
        A tuple ``(preds, labels)`` where ``preds`` is a list of stripped
        strings and ``labels`` is a list of one-element lists, each holding
        the stripped reference for the prediction at the same position.
    """
    cleaned_preds = []
    for pred in preds:
        cleaned_preds.append(pred.strip())

    wrapped_labels = []
    for label in labels:
        # Each reference is wrapped in its own list.
        wrapped_labels.append([label.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(preds,ref):
    """Score predictions against references with the global `metric`.

    Args:
        preds: iterable of predicted strings.
        ref: iterable of reference strings, aligned with `preds`.

    Returns:
        The value stored under the ``'score'`` key of the metric result.
    """
    # Whitespace-strip both sides and wrap each reference in a list.
    cleaned_preds, wrapped_refs = postprocess_text(preds, ref)
    scores = metric.compute(predictions=cleaned_preds, references=wrapped_refs)
    return scores['score']

In [None]:
# Score the beam-search hypotheses against the reference summaries.
r = compute_metrics(preds=hypo_summary, ref=test_summary)
print(r)

## BLEURT score

In [None]:
# Rebind `metric` to BLEURT.
# NOTE(review): confirm that `module_type` and `checkpoint` are actually
# honoured by load_metric for this metric and not silently ignored.
metric = load_metric(
    'bleurt',
    module_type="metric",
    checkpoint="bleurt-base-128",
)

In [None]:
# Score every hypothesis against its reference with the loaded metric.
bleurt = metric.compute(references=test_summary, predictions=hypo_summary)

In [None]:
# Average the per-example values stored under 'scores'.
bleurt_scores = list(bleurt['scores'])
print(statistics.mean(bleurt_scores))

## GPT-2 Perplexity

In [None]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from tqdm import tqdm

In [None]:
# Load GPT-2 (medium) as the language model used to score perplexity.
# NOTE: this rebinds `device`, `model` and `tokenizer`, replacing the seq2seq
# model and tokenizer loaded at the top of the notebook.
# Fall back to CPU when no GPU is available instead of hard-coding "cuda",
# which would fail on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "gpt2-medium"
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

In [None]:
# Concatenate all hypotheses, separated by blank lines, and tokenize the
# result as a single sequence.
joined_hypotheses = "\n\n".join(hypo_summary)
encodings = tokenizer(joined_hypotheses, return_tensors="pt")

In [None]:
# Sliding-window perplexity of the concatenated hypotheses under `model`.
# Each step scores the next `stride` tokens, with up to `max_length` tokens
# of the sequence visible to the model as context.
max_length = model.config.n_positions
stride = 512

# One (loss * number of scored tokens) entry per window.
nlls = []
for i in tqdm(range(0, encodings.input_ids.size(1), stride)):
    # The window ends `stride` tokens past `i` (clipped to the sequence end)
    # and starts at most `max_length` tokens before that end.
    begin_loc = max(i + stride - max_length, 0)
    end_loc = min(i + stride, encodings.input_ids.size(1))
    trg_len = end_loc - i  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    # Everything except the last `trg_len` positions is set to -100;
    # presumably the model's loss ignores that label value, so those tokens
    # act as context only — confirm against the model's documentation.
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)
        # outputs[0] is presumably the loss averaged over the scored tokens;
        # multiplying by trg_len turns it back into a per-window total
        # (NOTE(review): verify this scaling is exact for the first window).
        neg_log_likelihood = outputs[0] * trg_len

    nlls.append(neg_log_likelihood)

# After the loop `end_loc` equals the total number of tokens, so this is
# exp(summed negative log-likelihood / sequence length).
ppl = torch.exp(torch.stack(nlls).sum() / end_loc)

In [None]:
# Display the perplexity computed in the previous cell.
ppl

## NUBIA

In [None]:
!git clone https://github.com/wl-research/nubia.git
import os
os.chdir('nubia')
!pip install -r requirements.txt

In [None]:
#device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
from nubia_score import Nubia
import statistics
# Rebind `metric` to a NUBIA scorer; the metric objects loaded earlier under
# this name are no longer reachable through it.
metric = Nubia()

In [None]:
# Per-example accumulators for the overall NUBIA score and four of its features.
beam_nubia_score = []
beam_logical_agreement = []
beam_semantic_relation = []
beam_irrelevancy = []
beam_contradiction = []

In [None]:
# Score every (hypothesis, reference) pair with NUBIA and collect the overall
# score plus the individual features.

# The grammar accumulators are appended to below but were never created
# anywhere in the notebook, so they are initialised here.
beam_grammar_ref = list()
beam_grammar_hyp = list()

count = 0
# The original iterated over `hypo_summary_1`, which is never defined; the
# beam-search hypotheses live in `hypo_summary`.
# NOTE(review): the hypothesis is passed as the first argument and the
# reference as the second — confirm this matches the argument order
# expected by Nubia.score.
for i, j in zip(hypo_summary, test_summary):
  x = metric.score(i, j, get_features=True)
  beam_nubia_score.append(x['nubia_score'])
  count += 1
  beam_logical_agreement.append(x['features']['logical_agreement'])
  beam_semantic_relation.append(x['features']['semantic_relation'])
  beam_irrelevancy.append(x['features']['irrelevancy'])
  beam_contradiction.append(x['features']['contradiction'])
  beam_grammar_ref.append(x['features']['grammar_ref'])
  beam_grammar_hyp.append(x['features']['grammar_hyp'])
  print(count)

In [None]:
# Print the mean of each collected NUBIA series, in this order: overall score,
# logical agreement, semantic relation, irrelevancy, contradiction.
for series in (
    beam_nubia_score,
    beam_logical_agreement,
    beam_semantic_relation,
    beam_irrelevancy,
    beam_contradiction,
):
    print(statistics.mean(series))