In [2]:
import json
import pandas as pd
import os
import re

path_data = "../../data/caselaw_data/"

# The keys of one reference case define which attributes are collected for every case.
with open(os.path.join(path_data, "2301.json"), "rb") as reference_file:
    main_attributes = json.load(reference_file)

data = {key: [] for key in main_attributes}

# Only JSON files are read, in sorted order: os.listdir returns entries in
# arbitrary order and may also list non-case entries (e.g. checkpoint folders),
# which would otherwise make the row order irreproducible or crash json.load.
for case in sorted(os.listdir(path_data)):
    if not case.endswith(".json"):
        continue
    # Context manager so every file handle is closed as soon as it is parsed.
    with open(os.path.join(path_data, case), "rb") as case_file:
        file = json.load(case_file)
    for attribute in data:
        data[attribute].append(file[attribute])

df = pd.DataFrame(data)
# select only relevant data and drop any data points that have no EU provisions mentioned
relevant_cols = ["title", "summaryEn", "euCaselaw", "euProvisions", "jurisdiction"]
df = df[relevant_cols]
df = df[(df["euProvisions"].str.len() > 0)]
df = df.reset_index(drop = True)

def get_citation_to_eu_instrument(item):
    """Build a dotted citation string for one cited EU instrument.

    Args:
        item: dict with a "celex" string and an "itemsBase" list of dicts,
            each carrying a "label" and a "value" string.

    Returns:
        The CELEX number followed by one ".<label>_<value>" segment for every
        itemsBase entry whose value is not empty, e.g.
        "32002F0584.Article_8.Paragraph_1.Point_c". If no entry has a value,
        the bare CELEX number is returned.
    """
    segments = []
    for mention in item["itemsBase"]:
        # Entries without a value carry no citation detail and are skipped.
        if mention["value"] == "":
            continue
        segments.append(mention["label"] + "_" + mention["value"])

    if not segments:
        return item["celex"]
    return item["celex"] + "." + ".".join(segments)

def _article_level_citation(citation):
    """Return "<celex>.<label>_<value>" built from the first itemsBase entry only.

    Falls back to the bare CELEX number when itemsBase is empty or its first
    entry has an empty value, so a citation without details no longer raises
    IndexError. (The first entry appears to be the article — TODO confirm
    against the data schema.)
    """
    items = citation["itemsBase"]
    if not items or items[0]["value"] == "":
        return citation["celex"]
    return f"{citation['celex']}.{items[0]['label']}_{items[0]['value']}"


def _sanitise_summary(text):
    """Strip links, HTML tags and non-breaking-space entities from a summary."""
    # remove any potential links; the dot after "www" is escaped so it only
    # matches a literal ".", and both http:// and https:// prefixes are accepted
    text = re.sub(r"(?:https?://)?www\.[^\s<]+", "", text)
    # remove html elements
    text = re.sub(r"<.*?>", "", text)
    # a non-breaking space is still a space: replace it rather than delete it,
    # otherwise the neighbouring words get glued together
    return re.sub(r"&nbsp;", " ", text)


# denest the jurisdiction, CELEX, and EU citations
df["jurisdiction"] = df["jurisdiction"].apply(lambda x: x["label"])
df["celex"] = df["euProvisions"].apply(lambda x: {citation["celex"] for citation in x})
df["citation_all"] = df["euProvisions"].apply(lambda x: {get_citation_to_eu_instrument(citation) for citation in x})
df["citation_article"] = df["euProvisions"].apply(lambda x: {_article_level_citation(citation) for citation in x})

# drop EU provisions, as all information was extracted from it
df = df.drop(["euProvisions"], axis = 1)

# sanitise summaries
df["summaryEn"] = df["summaryEn"].apply(_sanitise_summary)
df.head()

Unnamed: 0,title,summaryEn,euCaselaw,jurisdiction,celex,citation_all,citation_article
0,"VSRH, Kž eun 27/2017-4",The case concerns the crime of fraud committed...,[],Croatia,{32002F0584},{32002F0584.Article_8.Paragraph_1.Point_c},{32002F0584.Article_8}
1,"Rechtbank Amsterdam, 11-06-2020, ECLI:NL:RBAMS...",The case concerns the crime of [assault.] prov...,"[{'celex': '62016CJ0367', 'name': 'Judgment of...",Netherlands,{32002F0584},"{32002F0584.Article_3.Paragraph_3, 32002F0584....","{32002F0584.Article_3, 32002F0584.Article_2}"
2,Wyrok Sądu Najwyższego z dnia 4 lipca 2013 r. ...,The application of detention on remand in the ...,[],Poland,{32002F0584},{32002F0584.Article_27},{32002F0584.Article_27}
3,"Rechtbank Amsterdam, 14-09-2023, ECLI:NL:RBAMS...",The case concerns the crime of [unknown] provi...,[],Netherlands,{32018R1805},"{32018R1805.Article_4.Paragraph_1, 32018R1805....","{32018R1805.Article_4, 32018R1805.Article_8}"
4,Juzgado Central de Instrucción núm. 4. Auto 88...,The case concerns the crimes of The case conce...,[],Spain,{32002F0584},{32002F0584},{32002F0584}


In [31]:
# Build one record per case. `celex` holds a set, which has no defined order, so
# taking "the first element" of it would export an arbitrary instrument whenever a
# case cites several. min() picks the lexicographically smallest CELEX number,
# which is deterministic and identical to the old result for single-element sets.
data_json = [
    {"summaryEn": summary, "jurisdiction": jurisdiction, "euProvisions": min(provision)}
    for (summary, jurisdiction, provision) in zip(df["summaryEn"], df["jurisdiction"], df["celex"])
]

# Context manager so the corpus file is flushed and closed once it is written.
with open("corpus.json", "w") as corpus_file:
    json.dump(data_json, corpus_file, indent = 2)