In [None]:
# https://linkeddata.overheid.nl/front/portal/document-viewer?ext-id=ECLI:NL:CBB:2021:1
# ECLI:NL:CBB:2021:1
# https://www.geeksforgeeks.org/reading-and-writing-xml-files-in-python/

# Data Collection

## Imports 

In [40]:
from bs4 import BeautifulSoup 
from datetime import datetime
from zipfile import ZipFile 
import glob
import pandas as pd
from tqdm.notebook import trange, tqdm
import os
import logging

# Raise pandas' display limits so wide frames and long text cells stay readable
pd.set_option('display.max_columns', 500)
pd.set_option('max_colwidth', 400)
pd.options.display.width=None

# Specify the root folder for data files
root = '/content/drive/MyDrive/Universiteit Utrecht/Business Informatics/Thesis Project/Implementation/'

# Folder with one sub-folder per year, each holding that year's zip archives
data_folder = root + 'Data/Cases per Year'

# Note: for some reason Drive won't add/update the log file until the runtime
# has been reset (or closed, presumably)
logging.basicConfig(filename=root+'data_collection.log',
                    format='%(asctime)s - %(message)s',
                    datefmt='%d-%m-%Y %H:%M:%S', 
                    level=logging.INFO)

## Helper functions

In [41]:
def parse_xml(file_name):
  """Parse one case document and return its three main sections.

  Args:
    file_name: The XML markup handed to BeautifulSoup. Despite its name this
      is the document content, not a path; the callers in this notebook pass
      the bytes returned by ``ZipFile.read``.

  Returns:
    A tuple ``(case_rdf, case_summary, case_description)`` with the results
    of looking up the ``rdf:RDF``, ``inhoudsindicatie`` and ``uitspraak``
    elements. When no ``uitspraak`` element is found, the result of looking
    up ``conclusie`` is returned in its place.
  """
  # Pass the raw markup to the BeautifulSoup XML parser
  file_xml = BeautifulSoup(file_name, "xml")

  # Find the RDF info, case summary and case description
  case_rdf = file_xml.find('rdf:RDF')
  case_summary = file_xml.find('inhoudsindicatie')
  case_description = file_xml.find('uitspraak')

  # For some cases the term 'conclusie' is used instead of 'uitspraak'.
  # Use an identity test: '== None' would go through the element's own
  # equality method instead of simply checking for absence.
  if case_description is None:
    case_description = file_xml.find('conclusie')

  return (case_rdf, case_summary, case_description)

In [42]:
def get_document_attributes(case_rdf, file_name):
  """Collect the metadata attributes of one case from its RDF element.

  Args:
    case_rdf: The ``rdf:RDF`` element of the case (as returned by
      ``parse_xml``), or ``None`` when the document has no RDF section.
    file_name: Name of the file the case was read from. Currently unused;
      kept so existing callers keep working.

  Returns:
    A dict with one entry per attribute:
      * single-value attributes hold the tag text, or ``'NOT_FOUND'`` when
        the tag is absent;
      * multi-value attributes hold a (possibly empty) tuple of tag texts;
      * ``issue_date`` and ``judgment_date`` hold a ``datetime.date`` and
        ``modified`` holds a ``datetime.datetime``, or ``'NOT_FOUND'`` when
        the tag is absent or does not match the expected format.
  """
  not_found = 'NOT_FOUND'
  rdf_tags = {}

  # Cardinalities are taken from chapter 12 of the pdf:
  # Technische-documentatie-Open-Data-van-de-Rechtspraak.pdf

  # Tag pointer dictionary for single tag items
  # Cardinality: 0/1 - 1
  tag_pointer_dict = {
      'identifier':'dcterms:identifier', # 1 - 1
      #'format':'dcterms:format', # 1 - 1
      'seat_location':'dcterms:spatial', # 0 - 1
      'publisher':'dcterms:publisher', # 1 - 1
      #'language':'dcterms:language', # 1 - 1
      'creator':'dcterms:creator', # 1 - 1
      'case_type':'dcterms:type' # 1 - 1
  }

  # Tag pointer dictionary for multi tag items
  # Cardinality 0/1 - many
  multi_tag_pointer_dict = {
      'jurisdiction':'dcterms:subject', # 0 - many
      'case_number':'psi:zaaknummer', # 1 - many
      'procedures':'psi:procedure', # 0 - many
      'references':'dcterms:references', # 0 - many
      'relation':'dcterms:relation' # 0 - many
  }

  # Date type cases (%Y-%m-%d)
  date_tag_pointer_dict = {
      'issue_date':'dcterms:issued', # 1 - 1
      'judgment_date':'dcterms:date' # 1 - 1
  }

  # Datetime type cases (%Y-%m-%dT%H:%M:%S)
  datetime_tag_pointer_dict = {
      'modified':'dcterms:modified' # 1 - 1
  }

  def find_first(pointer):
    """Return the first element for `pointer`, or None when there is none."""
    if case_rdf is None:
      return None
    return case_rdf.find(pointer)

  def parse_moment(pointer, date_format):
    """Parse the first `pointer` element with `date_format`; None on failure."""
    element = find_first(pointer)
    try:
      return datetime.strptime(element.text, date_format)
    except (AttributeError, TypeError, ValueError):
      # Missing tag (element is None) or text in an unexpected format
      return None

  # Start with the single tag items
  for tag, pointer in tag_pointer_dict.items():
    element = find_first(pointer)
    rdf_tags[tag] = not_found if element is None else element.text

  # Multi tag items: every occurrence is kept, in document order.
  # A missing RDF section yields empty tuples instead of raising.
  for tag, pointer in multi_tag_pointer_dict.items():
    mentions = [] if case_rdf is None else case_rdf.find_all(pointer)
    rdf_tags[tag] = tuple(mention.text for mention in mentions)

  # Date tags. These end up with dtype 'object' in a DataFrame because they
  # are plain datetime.date values (possibly mixed with 'NOT_FOUND').
  for tag, pointer in date_tag_pointer_dict.items():
    moment = parse_moment(pointer, '%Y-%m-%d')
    rdf_tags[tag] = not_found if moment is None else moment.date()

  # Datetime tags
  for tag, pointer in datetime_tag_pointer_dict.items():
    moment = parse_moment(pointer, '%Y-%m-%dT%H:%M:%S')
    rdf_tags[tag] = not_found if moment is None else moment

  return rdf_tags

## Collect all attributes and write to CSV
Last edit: 7-4-2021

In [47]:
# Every year present in the data set (1912 does not exist)
all_years = [year for year in range(1911, 2022) if year != 1912]

# Years handled by this run; set to `all_years` for a full run
years = range(2020, 2021)

# Column layout of the two CSV files written per year
meta_columns = ["identifier",
                "missing_parts",
                "case_type",
                "case_number",
                "jurisdiction",
                "creator",
                "judgment_date",
                "relation",
                "procedures",
                "seat_location",
                "references",
                "publisher",
                "issue_date",
                "modified"
                #"language",
                #"format"
                ]
content_columns = ["identifier",
                   "summary",
                   "description"]

for current_year in tqdm(years, desc='Total years'):
  # Get all zip file names. Sorted so that the archive number written to the
  # log refers to the same archive on every run.
  zip_files = sorted(glob.glob(data_folder + '/' + str(current_year) + '/*'))

  # Each archive appends its rows to the same two files for this year.
  # NOTE: because of mode='a', re-running this cell for a year that was
  # already exported duplicates its rows; remove the old CSVs first.
  year_path = data_folder + "/../CSV per year/" + str(current_year)
  meta_csv_path = year_path + "_cases_meta.csv"
  content_csv_path = year_path + "_cases_content.csv"

  archive_progress = tqdm(zip_files,
                          desc='Archives in ' + str(current_year),
                          leave=False)
  for archive_number, zip_file in enumerate(archive_progress, start=1):
    # Rows are collected in plain lists and turned into a DataFrame once per
    # archive; appending to a DataFrame per case copies the frame every time.
    meta_rows = []
    content_rows = []

    # The context manager closes the archive handle when the block ends
    with ZipFile(zip_file, 'r') as archive:
      for file_name in tqdm(archive.namelist(), desc='Current archive', leave=False):
        # Read the content of the zipped file (XML) into the bs4 parser
        case_rdf, case_summary, case_description = parse_xml(archive.read(file_name))

        # Parse case_rdf to get document attributes
        case_meta_info = get_document_attributes(case_rdf, file_name)

        # The identifier (ECLI) is the primary key shared by both files
        case_content = {"identifier": case_meta_info["identifier"]}

        # Names of the parts this case lacks; stored as meta information
        missing = ()

        # Find the summary of the document
        if case_summary is not None:
          case_content["summary"] = case_summary.get_text("|||", strip=True)
        else:
          case_content["summary"] = "NOT_FOUND"
          missing += ("summary",)

        # Find the full description of the case
        if case_description is not None:
          case_content["description"] = case_description.get_text("|||", strip=True)
        else:
          case_content["description"] = "NOT_FOUND"
          missing += ("description",)

        # Add missing parts information to the meta row
        case_meta_info["missing_parts"] = missing

        meta_rows.append(case_meta_info)
        content_rows.append(case_content)

    cases_meta_df = pd.DataFrame(meta_rows, columns=meta_columns)
    cases_content_df = pd.DataFrame(content_rows, columns=content_columns)

    # Finally, save the fresh dfs to the csv files; the header is written
    # only when the file does not exist yet.
    cases_meta_df.to_csv(
          meta_csv_path,
          mode='a',
          index=False,
          header=not os.path.exists(meta_csv_path))

    cases_content_df.to_csv(
          content_csv_path,
          mode='a',
          index=False,
          header=not os.path.exists(content_csv_path))

    logging.info(f'{current_year} - {archive_number} has been parsed and saved')



HBox(children=(FloatProgress(value=0.0, description='Total years', max=1.0, style=ProgressStyle(description_wi…

HBox(children=(FloatProgress(value=0.0, description='Archives in 2020', max=12.0, style=ProgressStyle(descript…

HBox(children=(FloatProgress(value=0.0, description='Current archive', max=11228.0, style=ProgressStyle(descri…

HBox(children=(FloatProgress(value=0.0, description='Current archive', max=12583.0, style=ProgressStyle(descri…

HBox(children=(FloatProgress(value=0.0, description='Current archive', max=11623.0, style=ProgressStyle(descri…

HBox(children=(FloatProgress(value=0.0, description='Current archive', max=9289.0, style=ProgressStyle(descrip…

HBox(children=(FloatProgress(value=0.0, description='Current archive', max=7202.0, style=ProgressStyle(descrip…

HBox(children=(FloatProgress(value=0.0, description='Current archive', max=9287.0, style=ProgressStyle(descrip…

HBox(children=(FloatProgress(value=0.0, description='Current archive', max=11191.0, style=ProgressStyle(descri…

HBox(children=(FloatProgress(value=0.0, description='Current archive', max=11448.0, style=ProgressStyle(descri…

HBox(children=(FloatProgress(value=0.0, description='Current archive', max=9640.0, style=ProgressStyle(descrip…

HBox(children=(FloatProgress(value=0.0, description='Current archive', max=11261.0, style=ProgressStyle(descri…

HBox(children=(FloatProgress(value=0.0, description='Current archive', max=12141.0, style=ProgressStyle(descri…

HBox(children=(FloatProgress(value=0.0, description='Current archive', max=12935.0, style=ProgressStyle(descri…




In [46]:
# Scratch check: adding a one-item tuple to an empty tuple gives ('ab',)
a = ()

a + ("ab",)

('ab',)

In [5]:
# Scratch check: an empty tuple displays as ()
ab = ()
ab

()

## Get meta-information about the Data

In [None]:
# TEMP for project description table
# importing required modules
from zipfile import ZipFile
import glob
import pandas as pd
from tqdm.notebook import trange, tqdm

pd.set_option('display.max_columns', 500)
pd.set_option('max_colwidth', 400)
pd.options.display.width=None

data_folder = "/content/drive/MyDrive/Universiteit Utrecht/" \
"Business Informatics/Thesis Project/Implementation/Data/Cases per Year"

# One dict of counts per processed year; counts_df is rebuilt from this list
# after every year so the CSV on disk always reflects the progress so far.
count_records = []
counts_df = pd.DataFrame()

years = list(range(1911, 2022))
years.remove(1912) # 1912 does not exist

# Index of 2010 in `years`: this run only covers 2010 and later
# (use 90 to start at 2002)
offset = 98
for current_year in tqdm(years[offset:], desc='Total years'):
  cy_case_count = 0
  cy_missing_descriptions = 0
  cy_missing_summaries = 0
  cy_complete_case = 0
  cy_short_summary = 0

  log = "###\nStarting year {}".format(current_year)

  # Get all zip file names
  zip_files = glob.glob(data_folder + '/' + str(current_year) + '/*')

  for zip_file in tqdm(zip_files, desc='Archives in ' + str(current_year), leave=False):
    # The context manager closes the archive handle when the block ends
    with ZipFile(zip_file, 'r') as archive:
      archive_filenames = archive.namelist()
      cy_case_count += len(archive_filenames)

      for file_name in tqdm(archive_filenames, desc='Current archive', leave=False):
        # Read the content of the zipped file (XML) into the bs4 parser
        case_rdf, case_summary, case_description = parse_xml(archive.read(file_name))

        has_summary = case_summary is not None
        has_description = case_description is not None

        if not has_summary:
          cy_missing_summaries += 1
        else:
          stripped_sum = case_summary.text.strip()
          number_of_words = len(stripped_sum.split())

          if number_of_words <= 1:
            # Empty element or a placeholder such as '-' or 'n.v.t': treat
            # it as missing rather than as a short summary.
            cy_missing_summaries += 1
          elif number_of_words <= 10:
            log += "\n{} has less than 10 words: {}.".format(file_name, stripped_sum)
            cy_short_summary += 1

        # NOTE: "complete" only looks at whether both elements are present;
        # a placeholder summary counted as missing above still counts here.
        if has_description and has_summary:
          cy_complete_case += 1

        if not has_description:
          cy_missing_descriptions += 1
          if has_summary:
            # Summary without description: print the file for inspection
            print(file_name)

  count_records.append({
      "current_year": current_year,
      "case_count": cy_case_count,
      "missing_desc": cy_missing_descriptions,
      "missing_sums": cy_missing_summaries,
      "complete_cases": cy_complete_case,
      "short_summary": cy_short_summary,
  })
  counts_df = pd.DataFrame(count_records)

  # Write year's log to log file
  with open(data_folder + "/../" + "nachtrun_log.txt", "a") as logfile:
    logfile.write(log + "\n\n\n")

  # Finally, save the counts gathered so far to a csv file
  counts_df.to_csv(data_folder + "/../" +  "counts_lawsuits_nachtrunvanaf2010.csv", index=False)



HBox(children=(FloatProgress(value=0.0, description='Total years', max=12.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=0.0, description='Archives in 2010', max=12.0, style=ProgressStyle(descript…

HBox(children=(FloatProgress(value=0.0, description='Current archive', max=13830.0, style=ProgressStyle(descri…

NameError: ignored

In [None]:
# Rearrange the columns and append both splits
import pandas as pd
df1 = pd.read_csv(data_folder + "/../" + "counts_lawsuits_nachtrun_tm2009.csv")
df2 = pd.read_csv(data_folder + "/../" + "counts_lawsuits_nachtrunvanaf2010.csv")

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
data_all_years = pd.concat([df1, df2])

# Fixed column order for the combined table
data_all_years = data_all_years[['current_year','case_count','complete_cases','missing_desc','missing_sums', 'short_summary']]

data_all_years.to_csv(data_folder + "/../" +  "lawsuits_counts_allyears.csv", index=False)

# Load Data 
Last edit: 7-4-2021

For debugging etc.

In [48]:
import random

# Load the export of the first year in `years`. Which year that is depends
# on the cell that assigned `years` most recently.
meta_csv_path = f"{data_folder}/../CSV per year/{years[0]}_cases_meta.csv"
content_csv_path = f"{data_folder}/../CSV per year/{years[0]}_cases_content.csv"

year_meta_df = pd.read_csv(meta_csv_path)
year_content_df = pd.read_csv(content_csv_path)

In [51]:
# Pick a random row label. randrange excludes its upper bound, whereas
# randint(0, len(df)) could return len(df), which is not a valid label and
# makes .loc raise a KeyError.
sample = random.randrange(len(year_content_df))
print(sample)
#print(year_meta_df.dtypes)
year_content_df.loc[sample, :]

99235


identifier                                                                                                                                                                                                                                                                                                                                                                                             ECLI:NL:RBLIM:2020:8119
summary                                                            Het verzoek om voorlopige voorzieningen, inhoudende de verleende subsidie stop te zetten en de al betaalde subsidiegelden terug te vorderen, wordt afgewezen, omdat het gevraagde buiten de omvang van het geding in de hoofdzaak valt, waarin enkel het wijzigingsbesluit voorligt over de wijziging van het projectplan waarvoor de subsidie is verleend.
description    RECHTBANK limburg|||Bestuursrecht|||zaaknummer: AWB 20/2219|||uitspraak van de voorzieningenrechter van 8 oktober 2020 op het verzoek om voorlopige voorzie

In [None]:
import fastparquet

# Target files, stored next to the CSV export of the same year
pickle_path = f"{data_folder}/../CSV per year/{years[0]}_cases_content.p"
parquet_path = f"{data_folder}/../CSV per year/{years[0]}_cases_content.parquet"

# Save the dataset as pickle
year_content_df.to_pickle(pickle_path)

# Save the dataset as parquet
year_content_df.to_parquet(parquet_path)

1

# Notes and ideas
Last edit: 7-4-2021

It would be nice to check whether every document has the structure rdf – inhoudsindicatie – uitspraak/conclusie. This is easy to check.

The data documentation should be checked in order to find the right cardinalities between elements.