In [3]:
# !pip install transformers
# !pip install wandb
# !pip install datasets
# !pip install bert_score
# !pip install evaluate
# !pip install accelerate
# !pip install gradio --upgrade
# !pip install --upgrade torch

In [1]:
import pandas as pd
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, RobertaTokenizer, AutoModelForQuestionAnswering
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import wandb
from datasets import Dataset
import os
from collections import defaultdict
import numpy as np
from torch.utils.data import DataLoader
from transformers import AdamW
import evaluate
from transformers import default_data_collator
from transformers import Trainer
from tqdm.auto import tqdm
from transformers import TrainingArguments
from accelerate import Accelerator
from transformers import get_scheduler
import gradio as gr

os.environ["TOKENIZERS_PARALLELISM"] = "false"

tqdm.pandas()

In [2]:
# device (turn on GPU acceleration for faster execution)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

## Pre-processing

In [3]:
errorUuid = {"ad9271b7-9983-42f5-9bd9-fdfcb171ddaa":[[[4, 37],[4, 222]]]}
def parse_spoiler(x):
    spoiler = []
    if x['uuid'] in errorUuid:
        x['spoilerPositions'] = errorUuid[x['uuid']]

    for s in x['spoilerPositions']:
        st,en = s[0],s[1]
        spoiler.append(x['targetParagraphs'][st[0]][st[1]:en[1]])
        
    return spoiler

def findPosTags(x):    
    tokPos = []
    for pos in x['spoilerPositions']:
        st,en = pos
        idx = 0
        for i,p in enumerate([x['targetTitle']] + x['targetParagraphs']):
            if i==st[0]+1:
                start_ind = idx+st[1]
                end_ind = idx + en[1]
                
                tokPos.append([start_ind,end_ind])
                break
            if i==0:
                idx+=len(p)+3
            else:
                idx+=len(p)+1
        
    return tokPos

def read_prep(path,train=True):
    with open(path, 'rb') as json_file:
        json_list = list(json_file)

    results = []
    for json_str in json_list:
        result = json.loads(json_str)
        results.append(result)
    df = pd.DataFrame(results)
    df['tags'] = df.tags.apply(lambda x:x[0],1)
    df['postText'] = df.postText.apply(lambda x:x[0],1)    
    
    # Parsing for faulty spoiler ids
    df['spoilerParsed'] = df.apply(parse_spoiler,1)
    df['mergedParas'] = df['targetParagraphs'].apply(lambda x:" ".join(x),1)
    df.mergedParas = df.targetTitle + " - " + df.mergedParas
    df['tokPos'] = df.apply(findPosTags,1)
    df['label'] = df['tags'].map({"phrase":0,"passage":1,"multi":2})
    
    return df
    

In [4]:
df_train = read_prep("./data/train.jsonl")
df_valid = read_prep("./data/validation.jsonl")

## Task 1: Spoiler Classification

In [7]:
config = dict(
    epochs=5,
    classes=3,
    batch_size=4,
    learning_rate=1e-5,
    model="roberta-base")

In [8]:
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer = AutoTokenizer.from_pretrained(config['model'], do_lower_case=True)
model = AutoModelForSequenceClassification.from_pretrained(config['model'],num_labels=config['classes'])

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

In [9]:
def tokenize(examples):
    return tokenizer(examples['postText'],
                    examples['mergedParas'],
                    truncation=True,
                    max_length=512,
                    padding=True,
                    return_tensors='pt')

In [10]:
train_data = Dataset.from_pandas(df_train[['postText',"mergedParas","label"]])
val_data = Dataset.from_pandas(df_valid[['postText',"mergedParas","label"]])

In [11]:
train_dataset = train_data.map(tokenize,batched=True,remove_columns=["postText","mergedParas"])
train_dataset.set_format("torch")

  0%|          | 0/4 [00:00<?, ?ba/s]

In [12]:
val_dataset = val_data.map(tokenize,batched=True,remove_columns=["postText","mergedParas"])
val_dataset.set_format("torch")

  0%|          | 0/1 [00:00<?, ?ba/s]

In [13]:
def compute_metrics(eval_preds):
    preds = np.argmax(eval_preds.predictions,1)
    
    return {"accuracy":(preds == eval_preds.label_ids).astype(np.float32).mean().item() }

In [14]:
training_args = TrainingArguments(
    output_dir="./trainer_task1_bert",
    learning_rate=config['learning_rate'],
    per_device_train_batch_size=config['batch_size'],
    per_device_eval_batch_size=config['batch_size'],
    num_train_epochs=config['epochs'],
    weight_decay=0.01,
    evaluation_strategy ="epoch",
    save_strategy="no"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics = compute_metrics,
)

trainer.train()

***** Running training *****
  Num examples = 3200
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 4000
  Number of trainable parameters = 124647939
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mrajasvi[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0303,0.862932,0.6
2,0.791,0.684117,0.73125
3,0.7062,0.788917,0.75
4,0.5978,1.042842,0.71875
5,0.5083,1.156266,0.73


***** Running Evaluation *****
  Num examples = 800
  Batch size = 4
***** Running Evaluation *****
  Num examples = 800
  Batch size = 4
***** Running Evaluation *****
  Num examples = 800
  Batch size = 4
***** Running Evaluation *****
  Num examples = 800
  Batch size = 4
***** Running Evaluation *****
  Num examples = 800
  Batch size = 4


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=4000, training_loss=0.711951374053955, metrics={'train_runtime': 746.6664, 'train_samples_per_second': 21.429, 'train_steps_per_second': 5.357, 'total_flos': 4209814683648000.0, 'train_loss': 0.711951374053955, 'epoch': 5.0})

In [16]:
trainer.save_model("./models_task1/")

Saving model checkpoint to ./models_task1/
Configuration saved in ./models_task1/config.json
Model weights saved in ./models_task1/pytorch_model.bin
tokenizer config file saved in ./models_task1/tokenizer_config.json
Special tokens file saved in ./models_task1/special_tokens_map.json


## Task 2: Spoiler Generation

## Phrase/Passage Spoiler Generation using QA model

In [5]:
def convert2squadFormat(df):
    df_fin = df[['uuid','targetTitle','postText',"mergedParas","tokPos","spoiler"]]
    df_fin["asnwers"] = df_fin.apply(lambda x: {'text':x['spoiler'], "answer_start":[x['tokPos'][0][0]]},1)
    df_fin = df_fin.drop(columns=["tokPos","spoiler"])
    df_fin.columns = ["id","title","question","context","answers"]
    
    return df_fin

In [6]:
bleu = evaluate.load("bleu")
squad_metric = evaluate.load("squad")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

In [7]:
# Based on spoiler type you want to train, "phrase" can be replaced with "passage"
spoiler_type = "phrase"
train_df = df_train[df_train.tags==spoiler_type]
val_df = df_valid[df_valid.tags==spoiler_type]

len(train_df),len(val_df)

(1274, 322)

In [8]:
train_df = convert2squadFormat(train_df)
val_df = convert2squadFormat(val_df)

train_data = Dataset.from_pandas(train_df.reset_index(drop=True), split="train")
val_data = Dataset.from_pandas(val_df.reset_index(drop=True), split="test")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fin["asnwers"] = df_fin.apply(lambda x: {'text':x['spoiler'], "answer_start":[x['tokPos'][0][0]]},1)


In [9]:
def preprocess_training_examples(examples):
    questions = examples['question']
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [10]:
def preprocess_validation_examples(examples):
    questions = examples["question"]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [11]:
def compute_metrics(start_logits, end_logits, features, examples, predictOnly=False):
    example_to_features = defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_spoilers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        spoilers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    spoilers.append(answer)

        # Select the answer with the best score
        if len(spoilers) > 0:
            best_answer = max(spoilers, key=lambda x: x["logit_score"])
            predicted_spoilers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_spoilers.append({"id": example_id, "prediction_text": ""})
            
    predicted_texts = [i['prediction_text'] for i in predicted_spoilers]
    
    if predictOnly:
        return predicted_texts
    
    actual_spoilers_squad = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    actual_spoilers = [i['answers']['text'][0] for i in actual_spoilers_squad]    
    
    squad_metrics_eval = squad_metric.compute(predictions=predicted_spoilers, references=actual_spoilers_squad)
    bleu_eval = bleu.compute(predictions=predicted_texts, references=actual_spoilers)
    
    return [squad_metrics_eval,bleu_eval],actual_spoilers,predicted_texts

In [20]:

model_name = "csarron/bert-base-uncased-squad-v1"

model_name = "deepset/minilm-uncased-squad2"

model_name = "deepset/roberta-base-squad2"

# Para Generation
# config = dict(
# max_length = 512,
# stride = 128,
# n_best = 15,
# max_answer_length = 100,
# batch_size = 8,
# epochs = 20,
# learning_rate = 1e-6,
# model_name = model_name,
# spoiler_type = "passage"
# )

# Phrase Generation
config = dict(
max_length = 512,
stride = 128,
n_best = 25,
max_answer_length = 30,
batch_size = 8,
epochs = 10,
learning_rate = 1e-6,
model_name = model_name,
spoiler_type = "phrase"
)


max_length = config["max_length"]
stride = config["stride"]
n_best = config["n_best"]
max_answer_length = config["max_answer_length"]



tokenizer = AutoTokenizer.from_pretrained(config["model_name"])
model = AutoModelForQuestionAnswering.from_pretrained(config["model_name"])


In [21]:
train_dataset = train_data.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=train_data.column_names,
)

validation_dataset = val_data.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=val_data.column_names,
)

len(train_dataset), len(validation_dataset)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

(2618, 695)

In [22]:
train_dataset.set_format("torch")
validation_set = validation_dataset.remove_columns(["example_id", "offset_mapping"])
validation_set.set_format("torch")

train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=config["batch_size"],
)

eval_dataloader = DataLoader(
    validation_set, collate_fn=default_data_collator, batch_size=8 
)

optimizer = AdamW(model.parameters(), lr=config["learning_rate"])
accelerator = Accelerator(fp16=True)
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

num_train_epochs = config["epochs"]
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
progress_bar = tqdm(range(num_training_steps))
wandb.init(
  project="clickbait_task_2", entity="rajasvi",
  config=config,
)

all_metrics = []
max_bleu = 0.35
for epoch in range(num_train_epochs):
    # Training
    model.train()
    train_loss = 0
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        train_loss+=loss.item()
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    

    # Evaluation
    model.eval()
    start_logits = []
    end_logits = []
    accelerator.print("Evaluation!")
    
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        start_logits.append(accelerator.gather(outputs.start_logits).cpu().numpy())
        end_logits.append(accelerator.gather(outputs.end_logits).cpu().numpy())

    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)
    start_logits = start_logits[: len(validation_dataset)]
    end_logits = end_logits[: len(validation_dataset)]

    metrics,theoretical_texts,predicted_texts = compute_metrics(
        start_logits, end_logits, validation_dataset, val_data
    )
    all_metrics.append(metrics)
    wandb.log({
        "epoch":epoch,
        "Train Loss": train_loss,
        "exact_match": metrics[0]['exact_match'],
        "f1": metrics[0]['f1'],
        "bleu": metrics[1]['bleu']        
    })
    if metrics[1]['bleu']>max_bleu:
        model.save_pretrained(f"./models_task2_{spoiler_type}/")
        max_bleu = metrics[1]['bleu']
    print(f"epoch {epoch}:", metrics)
    


  0%|          | 0/6560 [00:00<?, ?it/s]

VBox(children=(Label(value='0.009 MB of 0.009 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train Loss,█▅▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁
bleu,▁▄▅▆▇▇▇▇▇████▇██████
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
exact_match,▁▄▅▆█▇▇▇████████████
f1,▁▃▅▆▇▇▇▇▇███████████

0,1
Train Loss,766.12697
bleu,0.31384
epoch,19.0
exact_match,14.28571
f1,41.7913


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668914631009103, max=1.0…

Evaluation!


  0%|          | 0/87 [00:00<?, ?it/s]

  0%|          | 0/322 [00:00<?, ?it/s]

epoch 0: [{'exact_match': 8.385093167701863, 'f1': 27.781328788305274}, {'bleu': 0.21098499339957522, 'precisions': [0.31653285756764793, 0.22517176764522173, 0.21264367816091953, 0.2061873487729001], 'brevity_penalty': 0.8923593123789505, 'length_ratio': 0.897757608115323, 'translation_length': 6726, 'reference_length': 7492}]
Evaluation!


  0%|          | 0/87 [00:00<?, ?it/s]

  0%|          | 0/322 [00:00<?, ?it/s]

epoch 1: [{'exact_match': 9.937888198757763, 'f1': 31.328850163220313}, {'bleu': 0.23682079211961637, 'precisions': [0.3136375288387593, 0.22820855614973262, 0.2138268156424581, 0.20552147239263804], 'brevity_penalty': 1.0, 'length_ratio': 1.0413774693005873, 'translation_length': 7802, 'reference_length': 7492}]
Evaluation!


  0%|          | 0/87 [00:00<?, ?it/s]

  0%|          | 0/322 [00:00<?, ?it/s]

epoch 2: [{'exact_match': 11.490683229813664, 'f1': 34.68306952507956}, {'bleu': 0.25529958408122627, 'precisions': [0.3292465182716343, 0.2471840574328506, 0.23289083644799588, 0.22413329750067187], 'brevity_penalty': 1.0, 'length_ratio': 1.1213294180459157, 'translation_length': 8401, 'reference_length': 7492}]


## Alternative Model: Passage Spoiler Generation through Ranking Based Models

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import nltk
from nltk import sent_tokenize
from itertools import chain

nltk.download('punkt')
bleu = evaluate.load("bleu")
squad_metric = evaluate.load("squad")

[nltk_data] Downloading package punkt to /home/rvsharma/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

In [6]:
df_para = df_valid[df_valid.tags=="passage"][["uuid","targetTitle","postText","targetParagraphs","spoiler","tokPos"]].reset_index(drop=True)
df_para["targetParagraphsWithTitle"] = df_para.apply(lambda x: [x['targetTitle']] + x['targetParagraphs'] if x['targetTitle']!=x['postText'] else x['targetParagraphs'],1)
df_para["answers"] = df_para.apply(lambda x: {'text':x['spoiler'], "answer_start":[x['tokPos'][0][0]]},1)
df_para["sents"] = df_para.targetParagraphsWithTitle.apply(lambda x: list(chain.from_iterable([sent_tokenize(y) for y in x])),1)

In [20]:
name = "cross-encoder/ms-marco-TinyBERT-L-2-v2"
# name = "cross-encoder/stsb-TinyBERT-L-4"

model = AutoModelForSequenceClassification.from_pretrained(name)
tokenizer = AutoTokenizer.from_pretrained(name)

best_paras = []
model.eval()

for ind, row in tqdm(df_para.iterrows(), total=len(df_para)):
    paras = row['targetParagraphs']
    query = [row['postText']]*len(paras)
    features = tokenizer(query,paras,  padding=True, truncation=True, return_tensors="pt")
    
    with torch.no_grad():
#         print([x for x in model(**features).logits])
        scores = [x.item() for x in model(**features).logits]
    scores = list(zip(scores,paras))
    scores.sort(reverse=True)
    best_paras.append([x[1] for x in scores[:5]])
#     break
        


  0%|          | 0/322 [00:00<?, ?it/s]

In [21]:
best_sents = []

for i in tqdm(range(len(best_paras))):
    candidates = []
    for p in best_paras[i][:1]:
        candidates+=sent_tokenize(p)
        
    query  = [df_para.iloc[i]["postText"]]*len(candidates)
    features = tokenizer(query, candidates,  padding=True, truncation=True, return_tensors="pt")
    

    with torch.no_grad():
        scores = [x.item() for x in model(**features).logits]
    scores = list(zip(scores, candidates))
    scores.sort(reverse=True)
    best_sents.append(scores[0][1])

    

  0%|          | 0/322 [00:00<?, ?it/s]

In [22]:
theoretical_texts_squad = [{"id": ex["uuid"], "answers": ex["answers"]} for ind,ex in df_para.iterrows()]
predictions_squad = [{"id":df_para.iloc[i]["uuid"],"prediction_text":sent} for i,sent in enumerate(best_sents)]
theoretical_texts = [i['answers']['text'][0] for ind,i in df_para.iterrows()]  

In [23]:
bleu.compute(predictions=best_sents, references=theoretical_texts)

{'bleu': 0.12175138133006554,
 'precisions': [0.22077136984645146,
  0.10944772927412254,
  0.09754818408286352,
  0.09322381930184805],
 'brevity_penalty': 1.0,
 'length_ratio': 1.1039775760811532,
 'translation_length': 8271,
 'reference_length': 7492}

In [24]:
squad_metric.compute(predictions=predictions_squad, references=theoretical_texts_squad)

{'exact_match': 3.7267080745341614, 'f1': 19.911430180638625}