In [1]:
import numpy as np
import torch

from transformers import AutoTokenizer, AutoModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [2]:
!mkdir model
%cd model
!git clone https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2
!git clone https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2
%cd ..

mkdir: cannot create directory ‘model’: File exists
/home/keddie/Desktop/job_nlp/FACILEX/case_law/model
fatal: destination path 'paraphrase-multilingual-mpnet-base-v2' already exists and is not an empty directory.
fatal: destination path 'distiluse-base-multilingual-cased-v2' already exists and is not an empty directory.
/home/keddie/Desktop/job_nlp/FACILEX/case_law


In [3]:
import pandas as pd
import pickle
import os
import re

from nltk.corpus import stopwords

data_path = "CJEU/inputdata/full_texts_all_cases/"
stop_words = pickle.load(open("stopwords.pickle", "rb"))
stop_words.extend(stopwords.words("english"))

def preprocessing(text: str):
    text = text.lower()
    text = re.sub(r"[\n]+", "", text)
    text = re.sub(r" {2,}", " ", text)

    for stop_token in stop_words:
        text = re.sub(" " + stop_token + r"[,.; ]", " ", text)
    text = re.sub(r"[+]+", "", text)

    return text

texts = []
file_dict = {}
for case in os.listdir(data_path):
    #  if case.split("_")[-1][:-4] in sampled_idx:
    texts.append(open(data_path + case).read())
    file_dict[case.split("_")[-1][:-4]] = texts[-1]

values = []
key = {}
counter = 0
for k,v in file_dict.items():
    values.append(v)
    key[k] = counter
    counter+=1

df = pd.DataFrame({"text": texts})
df["text"] = df["text"].apply(preprocessing)
df.head()

Unnamed: 0,text
0,case c-543/deutsche telekom agvbundesrepublik ...
1,avis juridique important|61997jjudgment septem...
2,avis juridique important|61982jjudgment (fourt...
3,avis juridique important|61973jjudgment may 19...
4,avis juridique important|61995jjudgment (sixth...


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer

def tfidf_encoding(data, test_data):
    stemmer = PorterStemmer()

    def tfidf_processing(text: str):
        final_text = ""
        for sentence in sent_tokenize(text):
            tokens = word_tokenize(sentence)
            tokens = [stemmer.stem(token) for token in tokens]
            final_text += " ".join(tokens) + " "
        return final_text

    text = data["text"].apply(tfidf_processing)

    tfidf_model = TfidfVectorizer(use_idf = True, stop_words = stop_words)
    tfidf_data = tfidf_model.fit_transform(text)

    text = test_data["text"].apply(tfidf_processing)
    test_data = tfidf_model.transform(text)

    return tfidf_data, test_data

In [5]:
from sentence_transformers import SentenceTransformer

In [6]:
import sys

def encode_sentence(sentence: str) -> torch.tensor:
    global idx
    sys.stdout.write("\r" + f"{idx}/{len(df)}")
    idx += 1
    global model

    return model.encode(sentence)

def trans_encode(data, test_data):
    return data["text"].apply(encode_sentence).to_list(), test_data["text"].apply(encode_sentence).to_list()

In [7]:
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import scipy as sp

def get_sample_cases(topic):
    data = pd.read_csv("CJEU/inputdata/sampled_cases.csv")
    relevant_rows = data[data['source_case_topic'] == topic]
    return relevant_rows['source'].tolist()

def find_similar(tfidf_matrix, index, top_n):
    cosine_similarities = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten()
    related_docs_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index]
    return [(index, cosine_similarities[index]) for index in related_docs_indices][0:top_n]


print("* Importing sample cases...")
# Celex numbers of reference cases
publichealth = get_sample_cases('public health')
socialpolicy = get_sample_cases('social policy')
dataprotection = get_sample_cases('data protection')

* Importing sample cases...


In [8]:
test_data = publichealth + socialpolicy + dataprotection

texts = []
file_dict = {}
for case in os.listdir(data_path):
    texts.append(open(data_path + case).read())
    file_dict[case.split("_")[-1][:-4]] = texts[-1]

texts = [text for case_id, text  in file_dict.items() if case_id in test_data]
test_df = pd.DataFrame({"text": texts})
test_df["text"] = test_df["text"].apply(preprocessing)
test_df

Unnamed: 0,text
0,case c-543/deutsche telekom agvbundesrepublik ...
1,judgment (tenth chamber)october (*)‛protection...
2,\t\t\t\tarrêt de la cour \t\t\t c-135/commiss...
3,judgment (third chamber)june (*)‛reference — p...
4,judgment (first chamber)october (*)(appeal — r...
...,...
172,avis juridique important|61993jjudgment februa...
173,order april – vischim v commission(case c-459/...
174,avis juridique important|61984jjudgment april ...
175,judgment (sixth chamber)march (*)‛social polic...


In [9]:
print("* Import citations for cases...")
citations = pd.read_csv('CJEU/inputdata/all_cases_citations.csv')
print(" Successfully imported citations!")

def find_cited_cases(celexnumber):
    global citations
    relevantsource = citations[citations['source'] == celexnumber]
    return relevantsource['target'].tolist()

def exists_citation_link_between(celexnumber1,celexnumber2):
    global citations
    relevantsource1 = citations[citations['source'] == celexnumber1]
    relevantsource2 = citations[citations['source'] == celexnumber2]
    if celexnumber2 in relevantsource1['target'].tolist() or celexnumber1 in relevantsource2['target'].tolist():
        return 1
    return 0

* Import citations for cases...
 Successfully imported citations!


In [18]:
dict_res = {}
models = ["distiluse-base-multilingual-cased-v2", "paraphrase-multilingual-mpnet-base-v2"]
encoded_dict = {}

# Keep a record of document to index
def get_doc_index(docid):
    global key
    rowid = key[docid]
    return rowid
    
# Keep a record of document to index
def get_doc_row(docid, data):
    global key
    rowid = key[docid]
    row = data[rowid,:]
    return row

# Keep a record of document to index
def get_doc_id(rowid):
    global key
    for k, v in key.items():    
        if v == rowid:
            return k
    return -1

# Function to convert entire similarity results to case ID references
def convert_to_case_references(tfidf_result):
    result = []
    for item in tfidf_result:
        case_reference = get_doc_id(item[0]) # convert to case reference
        similarity_value = item[1]
        result.append((case_reference,similarity_value))
    return result

def lookup_similar_cases(sample_cases, n, topic, data):
    global results
    for item in sample_cases:
        index = get_doc_index(item)                         # Look up this cases index in the TFIDF matrix
        similar_cases = find_similar(data, index, n)  # Look up top n similar cases for this case
        similar_cases_references = convert_to_case_references(similar_cases)
        for reference in similar_cases_references:
            results.append([item,reference[0],reference[1],'tfidf',exists_citation_link_between(item,reference[0]),topic])

for encoding_method in ["tfidf", "distiluse", "mpnet"]:
# for encoding_method in ["tfidf"]:
    results = []
    idx = 0

    data = None
    if encoding_method == "tfidf":
        encoding_func = tfidf_encoding
    elif encoding_method == "distiluse":
        model_path = f"model/{models[0]}"
        model = SentenceTransformer(model_path).to(device)
        encoding_func = trans_encode

    else:
        model_path = f"model/{models[1]}"
        model = SentenceTransformer(model_path).to(device)
        encoding_func = trans_encode

    print(encoding_func.__name__)
    data, test_data = encoding_func(df, test_df)
    encoded_dict[encoding_method] = (data, test_data)

    # print("* Computing similar cases...")
    # # 1. Public Health
    # lookup_similar_cases(publichealth,20,'public health', data)
    # # # 2. Social Policy
    # lookup_similar_cases(socialpolicy,20,'social policy', data)
    # # # 3. Data Protection
    # lookup_similar_cases(dataprotection,20,'data protection', data)

    # dict_res[encoding_method] = results

tfidf_encoding
trans_encode
14004/13828trans_encode
14004/13828

In [27]:
all_results = []
models = ["distiluse-base-multilingual-cased-v2", "paraphrase-multilingual-mpnet-base-v2"]

# Keep a record of document to index
def get_doc_index(docid):
    global key
    rowid = key[docid]
    return rowid
    
# Keep a record of document to index
def get_doc_row(docid, data):
    global key
    rowid = key[docid]
    row = data[rowid,:]
    return row

# Keep a record of document to index
def get_doc_id(rowid):
    global key
    for k, v in key.items():    
        if v == rowid:
            return k
    return -1

# Function to convert entire similarity results to case ID references
def convert_to_case_references(tfidf_result):
    result = []
    for item in tfidf_result:
        case_reference = get_doc_id(item[0]) # convert to case reference
        similarity_value = item[1]
        result.append((case_reference,similarity_value))
    return result

def lookup_similar_cases(sample_cases, n, topic, data, method):
    global results
    for item in sample_cases:
        index = get_doc_index(item)                         # Look up this cases index in the TFIDF matrix
        similar_cases = find_similar(data, index, n)  # Look up top n similar cases for this case
        similar_cases_references = convert_to_case_references(similar_cases)
        for reference in similar_cases_references:
            results.append([item,reference[0],reference[1],method,exists_citation_link_between(item,reference[0]),topic])

for encoding_method in ["tfidf", "distiluse", "mpnet"]:
# for encoding_method in ["tfidf"]:
    results = []
    idx = 0

    (data, test_data) = encoded_dict[encoding_method]

    print("* Computing similar cases...")
    # 1. Public Health
    lookup_similar_cases(publichealth,20,'public health', data, encoding_method)
    # # 2. Social Policy
    lookup_similar_cases(socialpolicy,20,'social policy', data, encoding_method)
    # # 3. Data Protection
    lookup_similar_cases(dataprotection,20,'data protection', data, encoding_method)

    all_results += results

* Computing similar cases...
* Computing similar cases...
* Computing similar cases...


In [34]:
dict_res = {column: np.asarray(all_results)[:, idx] for column, idx in zip(["source_case","similar_case","similarity_score","method","citation_link","source_case_topic"], range(0,6))}
df = pd.DataFrame(dict_res)
df.to_csv("results.csv", index = False)
df.head()

Unnamed: 0,source_case,similar_case,similarity_score,method,citation_link,source_case_topic
0,62003CJ0453,62006CO0421,0.7273723559334702,tfidf,1,public health
1,62003CJ0453,61984CJ0028,0.6648683721282975,tfidf,0,public health
2,62003CJ0453,61984CJ0195,0.6189330729443988,tfidf,0,public health
3,62003CJ0453,61990CJ0039,0.5797361556748202,tfidf,0,public health
4,62003CJ0453,62002CJ0145,0.5667513340809365,tfidf,0,public health


In [8]:
def rank(val,a):
    if sp.sparse.issparse(a):
        return a[a>=val].shape[1] #if a is sparse
    return len(a[a>=val]) # if a is dense

def reciprocal_rank(l1_vecs, l2_vecs):
    '''Mean reciprocal rank'''
    if torch.is_tensor(l1_vecs):
        l1_vecs, l2_vecs = l1_vecs.cpu().detach().numpy(), l2_vecs.cpu().detach().numpy()
        
    sim = cosine_similarity(l1_vecs, l2_vecs)

    return sum([1/rank(sim[i,:sim.shape[1]],sim[i]) for i in range(sim.shape[0])])/sim.shape[0]


for encoding_method, (data, test_data) in encoded_dict.items():
    print(reciprocal_rank(data, test_data))

0.005649717514123272


In [None]:
pickle.dump(dict_res, open("res.pickle", "wb"))

In [9]:
np.sum(np.asarray(results)[:, 4].astype(int))

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [None]:
#### MEAN-POOLING WITH ATTENTION FOR NON-SENTENCE TRANSFORMERS
# from transformers import BatchEncoding

# tokenizer = AutoTokenizer.from_pretrained(model_path, fast_tokenizer = True)
# model = AutoModel.from_pretrained(model_path).to(device)

# def encode_sentence_transformer(sentence: list) -> np.ndarray:
#     """
#     Given a list of tokens, compute its dense vector using a BERT model. The resulting 512 tokens are mean-pooled, 
#     taking into consideration their presence in the attention mask.
#     """
#     sentence = " ".join(sentence)
#     tokenized_sentence = tokenizer(sentence, max_length = 512, padding = "max_length", truncation = True, return_tensors = "pt", return_attention_mask = True)
    
#     # cast all tensors to the same device -> speeds up prediction time and any subsequent computations
#     aux = {}
#     for key, value in tokenized_sentence.items():
#         aux[key] = value.to(device)
#     tokenized_sentence = BatchEncoding(aux)
    
#     model_output = model(**tokenized_sentence)

#     # pool the values of the last hidden state using a weighted averaged that ignores all tokens with 0 in the attention mask
#     last_hidden_state = model_output.last_hidden_state[0]
#     attention_mask = tokenized_sentence.attention_mask[0]
#     attention_mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
#     data_attention = last_hidden_state * attention_mask
#     sentence_vector = torch.sum(data_attention, 0)/torch.clamp(attention_mask.sum(0), min = 1e-9)

#     return np.array(sentence_vector.cpu().detach().numpy())
