In [7]:
from datasets import load_dataset
import numpy as np

# Load the "v20230518" configuration of Multi-LexSum, then keep only the test
# examples whose "summary/short" field is populated.
multi_lexsum = load_dataset("allenai/multi_lexsum", name="v20230518")
modified_dataset = multi_lexsum["test"].filter(
    # Identity check (`is not None`) is the correct way to test for a missing value.
    lambda example: example["summary/short"] is not None
)

In [8]:
import torch
from summarizer import Summarizer

from tqdm import tqdm
# from summarizer.sbert import SBertSummarizer
# model_summ = SBertSummarizer("paraphrase-MiniLM-L6-v2")
# Detect the available compute device and build the extractive summarizer (`model_summ`).
# Use CUDA when torch reports it as available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# DistilBERT-backed extractive summarizer.
# NOTE(review): `hidden=[-1, -2]` with `hidden_concat=True` presumably builds the
# sentence embeddings from the last two hidden layers, concatenated -- confirm
# against the bert-extractive-summarizer documentation.
model_summ = Summarizer(
    "distilbert-base-uncased",
    hidden = [-1, -2],
    hidden_concat = True,
    gpu_id = 0,
)


In [9]:
import warnings

# Install a process-wide filter that silences every warning.
warnings.filterwarnings("ignore")


def get_extractive_summary(doc, limit_sentences = 10):
    """Run the shared ``model_summ`` summarizer on a single document.

    Args:
        doc: The document handed unchanged to ``model_summ``
            (presumably its raw text -- confirm against the dataset schema).
        limit_sentences: Value forwarded to ``model_summ`` as ``num_sentences``.

    Returns:
        Whatever ``model_summ`` returns when called with ``use_first=False`` and
        ``return_as_list=True`` (expected to be a list of sentences -- verify
        against the summarizer library).
    """
    summarizer_options = {
        "use_first": False,
        "return_as_list": True,
        "num_sentences": limit_sentences,
    }
    return model_summ(doc, **summarizer_options)

In [10]:
def select_docs(docket, limit_docket_docs = 10):
    """Cap a docket at ``limit_docket_docs`` documents, keeping its head and tail.

    A docket with at most ``limit_docket_docs`` documents is returned unchanged
    (the same object, not a copy). A longer docket is reduced to its first and
    last documents; with the default limit of 10 that is the first 5 and the
    last 5. When the limit is odd, the extra document is taken from the head.

    Args:
        docket: Sliceable sequence of documents (e.g. a list).
        limit_docket_docs: Maximum number of documents to keep.

    Returns:
        ``docket`` itself when it is short enough, otherwise a new sequence of
        exactly ``limit_docket_docs`` documents.

    Raises:
        ValueError: If ``limit_docket_docs`` is negative.
    """
    if limit_docket_docs < 0:
        raise ValueError("limit_docket_docs must be non-negative")

    total_docs = len(docket)
    # Compare against the requested limit rather than a hard-coded 10.
    if total_docs <= limit_docket_docs:
        return docket

    tail_count = limit_docket_docs // 2
    head_count = limit_docket_docs - tail_count
    # Slice the tail from an explicit start index: `docket[-0:]` would return
    # the whole docket when tail_count is 0.
    return docket[:head_count] + docket[total_docs - tail_count:]

In [11]:
import json
import os

# --- Configuration -----------------------------------------------------------
limit_docket = len(modified_dataset)  # lower this to process only the first N dockets
limit_docket_docs = 10                # forwarded to select_docs for every docket
limit_summary_sentences = 10          # sentences requested per document summary
selection_tag = "first5last5"         # label for the document-selection strategy
path = f"extracted_sums/extracted_sums_json_{selection_tag}_bert"

# Pair each docket's source documents with its metadata.
# `docket_metadata` is not used in the loop below.
iterator = zip(
    modified_dataset["sources"][:limit_docket],
    modified_dataset["sources_metadata"][:limit_docket],
)

# makedirs also creates the parent "extracted_sums/" directory when it is
# missing and does nothing when the output directory already exists.
os.makedirs(path, exist_ok = True)

for docket_id, (docket, docket_metadata) in enumerate(tqdm(iterator, total = limit_docket)):
    output_file = f"{path}/{docket_id}.json"
    # Resume support: skip dockets already written by a previous run.
    if os.path.exists(output_file):
        continue

    documents = select_docs(docket, limit_docket_docs = limit_docket_docs)
    summaries = [
        get_extractive_summary(doc, limit_sentences = limit_summary_sentences)
        for doc in documents
    ]

    # Write to a temporary file and rename it afterwards, so an interrupted run
    # never leaves a truncated "<id>.json" that the resume check would skip.
    partial_file = f"{output_file}.tmp"
    with open(partial_file, "w", encoding = "utf-8") as handle:
        json.dump(summaries, handle, indent = 2)
    os.replace(partial_file, output_file)

100%|██████████| 616/616 [3:04:09<00:00, 17.94s/it]  
