In [2]:
from datasets import load_dataset
import numpy as np

# Load the Multi-LexSum release identified by the "v20230518" configuration name.
multi_lexsum = load_dataset("allenai/multi_lexsum", name="v20230518")
# Keep only the test examples that carry a short reference summary.
# `is not None` is the identity check PEP 8 recommends for comparisons with None.
modified_dataset = multi_lexsum["test"].filter(lambda x: x["summary/short"] is not None)

In [3]:
import tiktoken
import torch
import pytextrank

import spacy

from tqdm import tqdm

# tiktoken encoding registered for this OpenAI model name; the prompt cell
# uses it to count tokens.
tokenizer = tiktoken.encoding_for_model("gpt-3.5")

# Use CUDA when torch reports an available GPU, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Small English spaCy pipeline with the "textrank" component appended as the
# final stage (the factory is made available by `import pytextrank`).
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank", last = True)
# Raise the pipeline's maximum accepted text length so long documents can be parsed.
nlp.max_length = 3_000_000


/home/keddie/anaconda3/envs/facilex_caselaw/lib/python3.11/site-packages


In [4]:
# user_prompt = "Summarize concisely the following legal texts. Include as many relevant facts as possible. A fact is relevant if it mentions plaintiffs, counsel, type of action, filing date, name of the court, description of class, defendants, statutory basis, sought remedy, judges, consolidated class, whether it is a class action, date of decree, citations, duration of decrees, last action in case."
# Fixed prompt texts.
system_prompt = "You are a legal expert. You must answer concisely and truthfully, including only information that is relevant to the conversation. Stay faithful to the original text and keep the exact wording as found in the text as closely as possible. Only include facts relevant to the text, without any filler words."
user_prompt = "Summarize concisely the following legal texts."

# Length budgets; presumably token counts for the model context window and
# the generated summary - TODO confirm against the cell that consumes them.
max_context_length = 16384
max_output_length = 130

# Token count of each fixed prompt, measured with the tokenizer defined earlier.
length_user_prompt, length_system_prompt = (
    len(tokenizer.encode(prompt_text)) for prompt_text in (user_prompt, system_prompt)
)
print(length_user_prompt, length_system_prompt, sep = "\n")

11
60


In [5]:
# Number of source documents attached to each docket, in dataset order.
source_counts = [len(docket_sources) for docket_sources in modified_dataset["sources"]]
# Position of the docket with the most source documents (first one on ties).
max_id = np.argmax(source_counts)
# Displayed value: how many documents that largest docket contains.
source_counts[max_id]

101

In [14]:
def get_extractive_summary(parsed_doc, limit_phrases = 4, limit_sentences = 4):
    """Pick the sentences of a parsed document that best cover its top-ranked phrases.

    Each considered phrase contributes its rank, normalised so the ranks sum
    to 1. A sentence is scored by the Euclidean norm of the normalised ranks
    of the phrases it does NOT contain, so a lower score means better
    coverage. Sentences are returned in ascending score order; ties keep
    document order.

    Args:
        parsed_doc: Object exposing `.sents` (items with `.start`, `.end`,
            `.text`) and `._.phrases` (items with `.rank` and `.chunks`,
            each chunk having `.start` and `.end`).
        limit_phrases: Number of leading phrases to consider. A falsy value
            (None or 0) considers every phrase.
        limit_sentences: Maximum number of sentences to return. None returns
            every sentence; values below zero are treated as zero.

    Returns:
        A list of sentence texts, best-covering first, holding at most
        `limit_sentences` items.
    """
    # Materialise once so sentence ids stay aligned between scoring and lookup.
    sentences = list(parsed_doc.sents)
    # Per sentence: token bounds plus the ids of the phrases found inside it.
    sentence_bounds = [(sentence.start, sentence.end, set()) for sentence in sentences]

    phrase_ranks = []
    for phrase_id, phrase in enumerate(parsed_doc._.phrases):
        phrase_ranks.append(phrase.rank)

        for chunk in phrase.chunks:
            # Attribute the chunk to the first sentence that fully contains it.
            for sent_start, sent_end, sent_phrases in sentence_bounds:
                if chunk.start >= sent_start and chunk.end <= sent_end:
                    sent_phrases.add(phrase_id)
                    break

        if limit_phrases and phrase_id + 1 >= limit_phrases:
            break

    # Normalise the ranks; skip the division when there is nothing to
    # normalise (no phrases, or all ranks zero) to avoid producing NaNs.
    unit_vector = np.asarray(phrase_ranks, dtype = float)
    sum_ranks = np.sum(unit_vector)
    if sum_ranks > 0:
        unit_vector = unit_vector / sum_ranks

    # Distance of each sentence: only phrases absent from the sentence count.
    distances = [
        np.sqrt(np.sum([
            unit_vector[phrase_id] * unit_vector[phrase_id]
            for phrase_id in range(len(unit_vector))
            if phrase_id not in sent_phrases
        ]))
        for _, _, sent_phrases in sentence_bounds
    ]

    # Stable sort: equally scored sentences stay in document order.
    ranked_ids = sorted(range(len(sentences)), key = lambda sent_id: distances[sent_id])

    # Truncate by slicing. The earlier implementation collected
    # limit_sentences + 1 items and always dropped the last one, which lost a
    # wanted sentence whenever the document had limit_sentences or fewer.
    if limit_sentences is not None:
        ranked_ids = ranked_ids[:max(limit_sentences, 0)]

    return [sentences[sent_id].text for sent_id in ranked_ids]

In [12]:
from collections import defaultdict

# Seed NumPy's global generator, which select_docs draws from.
np.random.seed(42)
# Selection is layered: one document per type first, then weighted top-ups.
def select_docs(docket, docket_metadata, limit_docket_docs = 10):
    """Sample documents from a docket while favouring rare document types.

    Phase 1 takes one document from every document type, chosen uniformly
    at random when a type has several. This phase is not capped, so the
    result can exceed `limit_docket_docs` when there are more types than
    the limit.

    Phase 2 keeps drawing until `limit_docket_docs` documents are selected
    or nothing is left. A type is drawn with probability proportional to
    1 / (documents still available for it), then one of its documents is
    drawn uniformly, without replacement.

    Args:
        docket: Sequence of documents.
        docket_metadata: Mapping whose "doc_type" entry lists the type of
            each document, aligned with `docket`.
        limit_docket_docs: Target size used by phase 2.

    Returns:
        List of selected documents, in selection order.
    """
    by_type = defaultdict(list)
    for document, kind in zip(docket, docket_metadata["doc_type"]):
        by_type[kind].append(document)

    # Shallow copy: `remaining` shares its lists with `by_type`, so a pop on
    # one is visible through the other. Only the key sets diverge.
    remaining = by_type.copy()
    selected = []

    # Phase 1: one representative per document type.
    for kind, candidates in by_type.items():
        if len(candidates) == 1:
            selected.append(candidates[0])
            remaining.pop(kind)
        else:
            selected.append(candidates.pop(np.random.randint(0, len(candidates))))

    # Phase 2: inverse-frequency weighted top-up.
    while len(selected) < limit_docket_docs and remaining:
        # Normalised inverse counts: types with fewer documents left are favoured.
        inverse_sizes = np.asarray([1/len(candidates) for candidates in remaining.values()])
        chosen_kind = np.random.choice(list(remaining), p = inverse_sizes/np.sum(inverse_sizes))

        # Uniform draw inside the chosen type; pop so it cannot be drawn again.
        pool = remaining[chosen_kind]
        selected.append(pool.pop(np.random.randint(0, len(pool))))

        # Drop exhausted types so they stop receiving probability mass.
        if not pool:
            remaining.pop(chosen_kind)

    return selected

In [16]:
import json
from pathlib import Path

# Build an extractive summary for a sample of documents from every docket and
# write one JSON file per docket (a list holding one sentence list per
# selected document).
#
# Changes from the earlier version of this cell:
#   * each output file is written inside a `with` block, so its handle is
#     closed straight away instead of leaking one open file per docket;
#   * the output directory is created up front, so a fresh checkout does not
#     fail on the first write;
#   * the handle on "extracted.txt" (opened in append mode, never written to)
#     is gone.
output_dir = Path("extracted_sums") / f"extracted_sums_json_{'random_selection'}"
output_dir.mkdir(parents = True, exist_ok = True)

limit_docket = len(modified_dataset)
limit_docket_docs = 10
iterator = zip(modified_dataset["sources"][:limit_docket], modified_dataset["sources_metadata"][:limit_docket])
for docket_id, (docket, docket_metadata) in enumerate(tqdm(iterator, total = limit_docket)):
    documents = select_docs(docket, docket_metadata, limit_docket_docs = limit_docket_docs)

    # One extractive summary (list of sentences) per selected document.
    summaries = [
        get_extractive_summary(nlp(doc), limit_phrases = None, limit_sentences = 5)
        for doc in documents
    ]

    with open(output_dir / f"{docket_id}.json", "w") as output_file:
        json.dump(summaries, output_file, indent = 2)

100%|██████████| 616/616 [2:02:58<00:00, 11.98s/it]  


In [24]:
# Read the stored summaries back; the context manager closes the file handle.
# NOTE(review): no visible cell writes "extracted.txt" (the extraction loop
# writes per-docket JSON files instead) - confirm this file is still current.
with open("extracted.txt", "r") as extracted_file:
    extracted_summaries = json.load(extracted_file)

extracted_summaries

{'0': ['On top of being unlawfully imprisoned for failure to pay debts owed to the City, Plaintiffs Mitchell and Williams were, pursuant to City policy, coerced with longer unlawful prison terms by City officials if they did not "volunteer" to labor in the City jail under disgusting conditions for an extra credit of $25 per day toward their debts .. It is the policy and practice of the City to tell inmates that their time in City jail can be further reduced if they agree to "work oft" their debts to the City While in jail by laboring at janitorial and other work for the City at a rate of $25 per day toward their debts.\n Not only does the City charge additional fees only to those people who are indigent and cannot afford to pay their costs, fees, and fines in full immediately and place such indigent people on "pay only" probation when the cases of wealthier people would be closed, but by imposing imprisonment, threats of imprisonment, indeterminate "probation," and other restrictions o