In [1]:
import json
import re
import numpy as np
import torch

from sentence_transformers import SentenceTransformer

In [2]:
# assumption that data contains non-relevant elements of code/URL
def sanitize_data(text: str) -> str:
    """Strip HTML tags, web addresses and ``&nbsp;`` entities from ``text``.

    Args:
        text: Raw text that may contain HTML markup and URLs.

    Returns:
        The text with tags, URLs and ``&nbsp;`` removed (each replaced by the
        empty string) and with leading/trailing whitespace stripped. Interior
        whitespace is left as-is, so a removed URL can leave a double space.
    """
    # Tags are removed first: a URL inside an attribute (e.g. href="...") would
    # otherwise make the URL pattern run up to the next "<" and swallow the
    # visible text that follows the opening tag.
    text = re.sub(r"<.*?>", "", text) # remove HTML elements
    # Match either an explicit http(s) scheme or a literal "www." prefix; the
    # dot is escaped so ordinary words containing "www" are not deleted.
    text = re.sub(r"(?:https?://|www\.)[^\s<]+", "", text) # remove potential websites
    text = re.sub(r"&nbsp;", "", text) # remove leftover formatting

    return text.strip()

In [5]:
device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # for embedding using transformers
embedding_model: SentenceTransformer = SentenceTransformer("../../models/multi-qa-mpnet-base-dot-v1", device = device)
if device.type == "cuda":
    # fp16 only on GPU: on the CPU fallback, half-precision kernels may be
    # unavailable or slow depending on the installed torch version.
    embedding_model = embedding_model.half()

# Record fields noted by the original author (only uniqueId and summaryEn are
# read below; the other two are presumably further keys of each record):
# uniqueId
# summaryEn
# euProvisions
# jurisdiction

# json of all available cases on which to search for similar ones;
# iterated below as a list of per-case dicts
with open("corpus.json", "r", encoding = "utf-8") as corpus_file:
    corpus_data: list = json.load(corpus_file)

# Clean every summary in place, so corpus_data holds the sanitized text afterwards.
for search_text in corpus_data:
    search_text["summaryEn"] = sanitize_data(search_text["summaryEn"])

search_texts = [case["summaryEn"] for case in corpus_data]

# One embedding per summary, in the same order as corpus_data.
search_embedded: np.ndarray = embedding_model.encode(search_texts)
output_data = [
    {"uniqueId": case["uniqueId"], "embedding": embedding.tolist()}
    for case, embedding in zip(corpus_data, search_embedded)
]

# The context manager guarantees the file is flushed and closed before any later cell reads it.
with open("corpus_embedded.json", "w", encoding = "utf-8") as output_file:
    json.dump(output_data, output_file, indent = 2)