In [9]:
import json
import os
import pandas as pd
import re

path_data = "../data/caselaw_data/"

main_attributes = json.load(open(path_data + "2301.json", "rb"))
os.mkdir("../data/caselaw_emb/") if "caselaw_emb" not in os.listdir("../data/") else 1

data = {key: [] for key in main_attributes}

for case in os.listdir(path_data):
    file = json.load(open(path_data + case, "rb"))
    for attribute in data.keys():
        data[attribute].append(file[attribute])
    
df = pd.DataFrame(data)
df["jurisdiction"] = df["jurisdiction"].apply(lambda x: x["label"])
relevant_cols = ["title", "summaryEn", "euCaselaw", "euProvisions", "jurisdiction"]
df = df[relevant_cols]
df = df[(df["euProvisions"].str.len() > 0)]
df = df.reset_index(drop = True)
df["summaryEn"] = df["summaryEn"].apply(lambda x: re.sub(r"<.*?>", "", x)) # remove html elements
df["summaryEn"] = df["summaryEn"].apply(lambda x: re.sub(r"&nbsp;", "", x)) # remove html elements
df.head()

Unnamed: 0,title,summaryEn,euCaselaw,euProvisions,jurisdiction
0,"VSRH, Kž eun 27/2017-4",The case concerns the crime of fraud committed...,[],"[{'celex': '32002F0584', 'name': '2002/584/JHA...",Croatia
1,"Rechtbank Amsterdam, 11-06-2020, ECLI:NL:RBAMS...",The case concerns the crime of [assault.] prov...,"[{'celex': '62016CJ0367', 'name': 'Judgment of...","[{'celex': '32002F0584', 'name': '2002/584/JHA...",Netherlands
2,Wyrok Sądu Najwyższego z dnia 4 lipca 2013 r. ...,The application of detention on remand in the ...,[],"[{'celex': '32002F0584', 'name': '2002/584/JHA...",Poland
3,"Rechtbank Amsterdam, 14-09-2023, ECLI:NL:RBAMS...",The case concerns the crime of [unknown] provi...,[],"[{'celex': '32018R1805', 'name': 'Regulation (...",Netherlands
4,Juzgado Central de Instrucción núm. 4. Auto 88...,The case concerns the crimes of The case conce...,[],"[{'celex': '32002F0584', 'name': '2002/584/JHA...",Spain


In [10]:
import numpy as np
import torch

from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer, BertModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [11]:
# !rm -r ../data/caselaw_emb/

In [12]:
def pool_embeddings(data, tokenized, pad_tok_id):
  if "attention_mask" in tokenized:
    attention_mask = tokenized["attention_mask"]
  else: # apparently ErnieM does NOT have attenion IDs in the tokenized output, so I am "computing" them myself - like in all other models, the model should not pay attention to [PAD] tokens, so they are ignored/not paid attention to
    print("Oh no")
    token_ids = tokenized["input_ids"][0]
    padding_ids = len([tok for tok in token_ids if tok == pad_tok_id]) # count how many [PAD] tokens there are
    attention_mask = torch.ones((tokenized["input_ids"].shape)).to(device)
    if padding_ids > 0:
        attention_mask[:,-padding_ids:] = 0
    attention_mask = torch.tensor(attention_mask).to(device)
    
  attention_expanded = attention_mask.unsqueeze(-1).expand(data.size()).float()
  data_attention = data * attention_expanded
  return torch.sum(data_attention, 1) / torch.clamp(attention_expanded.sum(1), min=1e-9) # to not divide by 0

def encode_sentence_bert(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding="max_length", return_attention_mask=True)
    
    pad_tok_id = tokenizer("[PAD]")
    pad_tok_id = pad_tok_id["input_ids"][1]
    
    aux = {}
    for key in inputs.keys(): # cast tokenized input to GPU
      aux[key] = inputs[key].to(device)
    inputs = aux

    outputs = model(**inputs)

    outputs = pool_embeddings(outputs[0], inputs, pad_tok_id)

    last_hidden_state = outputs.cpu().detach().numpy()  # The last hidden-state is the first element of the output tuple

    return last_hidden_state[0]

In [18]:
import spacy
from nltk.corpus import stopwords

stop_words = stopwords.words("english")
lemmatizer = spacy.load("en_core_web_sm")

In [31]:
import pickle
import string

from nltk.tokenize import word_tokenize
import fasttext

from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

def get_sentence_embedding_fasttext(tokens, model):
    aux = []
    for token in tokens:
        aux.append(model.get_word_vector(token))
    
    aux = np.asarray(aux)

    return np.mean(aux, axis = 0)


def get_embedding_all_fasttext(data, model):
    match model:
        case "facilex":
            if "fasttext_facilex.bin" in os.listdir("../models/"):
                fasttext_model = fasttext.load_model(f"../models/fasttext_{model}.bin")
            else:
                fasttext_model = train_fasttext()
        case _:
            fasttext_model = fasttext.load_model(f"../models/{model}.bin")
            
    data["summaryEn"] = data["summaryEn"].apply(summary_preprocessing)
    data["summaryEn"] = data["summaryEn"].str.split(" ")
    all_vectors = data["summaryEn"].apply(get_sentence_embedding_fasttext, args = (fasttext_model, ))        

    return np.asarray(all_vectors.tolist())

def train_fasttext():
    with open("processed_text_fasttext.txt", "w") as file:
        for text in df["summaryEn"].apply(summary_preprocessing):
            file.write(text)
            file.write("\n")
    fasttext_model = fasttext.train_unsupervised("processed_text_fasttext.txt", dim = 500, epoch = 20)
    fasttext_model.save_model("../models/fasttext_facilex.bin")

    return fasttext_model

def summary_preprocessing(text):
    text = re.sub(r"[^\w ]+", " ", text)
    text = re.sub(r" {2,}", " ", text)
    text = text.strip(string.punctuation)
    text = text.lower()
    text = lemmatizer(text)
    text = [token.lemma_ for token in text]
    text = " ".join([word for word in text if word not in stop_words])

    return text

def summary_preprocessing_transformer(text):
    text = re.sub(r"[^\w ]+", " ", text)
    text = re.sub(r" {2,}", " ", text)
    text = text.strip(string.punctuation)
    text = text.lower()

    return text

all_tfidf = TfidfVectorizer(binary = True, ngram_range = (1,2), tokenizer = word_tokenize, lowercase = True)
all_tfidf.fit(np.asarray(df["summaryEn"].apply(summary_preprocessing).tolist()))

def get_sentence_embedding_tfidf(data):
    data["summaryEn"] = data["summaryEn"].apply(summary_preprocessing)
    all_vectors = np.asarray(all_tfidf.transform(np.asarray(data["summaryEn"].tolist())).toarray())

    return all_vectors

# train_fasttext()

In [14]:
# !rm -r ../data/caselaw_emb/

In [32]:
embd_name_map = {
    frozenset([get_sentence_embedding_tfidf]): "tfidf", 
    frozenset([get_embedding_all_fasttext, "facilex"]): "fasttext_facilex",
    frozenset([get_embedding_all_fasttext, "cc.en.300"]): "fasttext_ccen"
}

for jurisdiction in tqdm(df["jurisdiction"].unique()):
    for embedding_method in [get_sentence_embedding_tfidf, [get_embedding_all_fasttext, "facilex"], [get_embedding_all_fasttext, "cc.en.300"]][:1]:
        aux = df.copy(True)
        if not type(embedding_method) == list:
            emb_vec = embedding_method(aux[aux["jurisdiction"] == jurisdiction])
        else:
            emb_vec = embedding_method[0](aux[aux["jurisdiction"] == jurisdiction], embedding_method[1])

        os.mkdir(f"../data/caselaw_emb/{jurisdiction}") if jurisdiction not in os.listdir("../data/caselaw_emb/") else 1
        pickle.dump(emb_vec, open(f"../data/caselaw_emb/{jurisdiction}/emb_{embd_name_map[frozenset([embedding_method] if type(embedding_method) != list else embedding_method)]}.pickle", "wb"))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["summaryEn"] = data["summaryEn"].apply(summary_preprocessing)
100%|██████████| 8/8 [00:07<00:00,  1.07it/s]


In [16]:
from tqdm import tqdm

models = ["distiluse-base-multilingual-cased-v2", "paraphrase-multilingual-mpnet-base-v2", "bert-base-uncased", "multi-qa-mpnet-base-dot-v1", "all-mpnet-base-v2"]
model_name_map = {
    "distiluse-base-multilingual-cased-v2": "multi_distiluse", 
    "paraphrase-multilingual-mpnet-base-v2": "multi_mpnet", 
    "bert-base-uncased": "bert_uncased", 
    "multi-qa-mpnet-base-dot-v1": "multiqa_mpnet_dot", 
    "all-mpnet-base-v2": "mpnet"
}

for jurisdiction in tqdm(df["jurisdiction"].unique()):
    for model_name in models:
        aux = df.copy(True)
        aux["summaryEn"] = aux["summaryEn"].apply(summary_preprocessing_transformer)
        if "bert" not in model_name:
            model = SentenceTransformer("../models/" + model_name).to(device)
            emb_vec = np.asarray(aux[aux["jurisdiction"] == jurisdiction]["summaryEn"].apply(model.encode).tolist())
        else:
            model = BertModel.from_pretrained("../models/" + model_name).to(device)
            tokenizer = BertTokenizer.from_pretrained("../models/" + model_name)

            emb_vec = np.asarray(aux[aux["jurisdiction"] == jurisdiction]["summaryEn"].apply(encode_sentence_bert, args = (model, tokenizer,)).tolist())

        os.mkdir(f"../data/caselaw_emb/{jurisdiction}") if jurisdiction not in os.listdir("../data/caselaw_emb/") else 1
        aux_df = df[df["jurisdiction"] == jurisdiction].copy(True)
        aux_df[f"embedding_{model_name}"] = list(emb_vec)
        aux_df.to_pickle(f"../data/caselaw_emb/{jurisdiction}/emb_{model_name_map[model_name]}.pickle")


100%|██████████| 8/8 [05:22<00:00, 40.37s/it]
