# Splitting EU Texts into Articles

In [1]:
#!pip install bs4

In [2]:
from bs4 import BeautifulSoup
import os 
import codecs
import re

## Functions and preparation

In [3]:
### structural components of legal texts
# Each list holds the exact paragraph strings that mark a structural heading.
# They are generated rather than typed out so that a missing comma or a
# copy-paste slip cannot silently drop or merge entries.

_ROMAN_NUMERALS = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII']

# Article headings 'Article 1' ... 'Article 349'.
articles_enumerated = ['Article {}'.format(i) for i in range(1, 350)]

# Section headings 1-7, mixed case and upper case, with and without
# surrounding newlines.
sections = (
    [f'\nSection {n}\n' for n in range(1, 8)]
    + [f'\nSECTION {n}\n' for n in range(1, 8)]
    + [f'Section {n}' for n in range(1, 8)]
    + [f'SECTION {n}' for n in range(1, 8)]
)

# Chapter headings I-VII / 1-7, with and without surrounding newlines.
chapters = (
    [f'CHAPTER {r}' for r in _ROMAN_NUMERALS[:7]]
    + [f'CHAPTER {n}' for n in range(1, 8)]
    + [f'\nCHAPTER {r}\n' for r in _ROMAN_NUMERALS[:7]]
    + [f'\nCHAPTER {n}\n' for n in range(1, 8)]
)

# Title headings I-VIII / 1-8.
titles = (
    [f'TITLE {r}' for r in _ROMAN_NUMERALS]
    + [f'TITLE {n}' for n in range(1, 9)]
)

### Function to split the laws
The function `process_text` takes an HTML file of an EU law, as can be downloaded from EUR-Lex (https://eur-lex.europa.eu/homepage.html), and splits it into articles. It assumes that the files are stored in the directory `texts`, in one subfolder per time period (e.g. `texts/2020-2024`). Refer to `download_searches.py` for downloading EU laws.

In [4]:
## get all EU text
# Time period to process; it selects the subfolder of 'texts' that is read.
#tp = '2000-2019'
tp = '2020-2024'

# Keep only HTML files: other directory entries (e.g. '.ipynb_checkpoints',
# '.DS_Store') cannot be parsed as laws. Sorting makes the processing order
# reproducible, because os.listdir returns entries in arbitrary order.
texts = sorted(
    name for name in os.listdir(f'texts/{tp}')
    if name.lower().endswith('.html')
)

In [5]:
def process_text(text, tp):
    """Split one EU law (HTML file) into front matter, recitals and articles.

    The file ``texts/{tp}/{text}`` is parsed and its ``<p>`` elements are
    written, one per line, into text files under ``processed/{tp}/<stem>/``,
    where ``<stem>`` is ``text`` without its file extension:

    * ``<stem>_front.txt``    - paragraphs before the first marker,
    * ``<stem>_Whereas.txt``  - paragraphs from the 'Whereas:' line onwards,
    * ``<stem>_Title_J_Chapter_K_Section_L_Article_NNN.txt`` - one file per
      paragraph that exactly matches an entry of ``articles_enumerated``.

    ``NNN`` is a running count of article headings seen in this file (zero
    padded to three digits), not the number parsed from the heading text.

    Args:
        text: File name of the law inside ``texts/{tp}``.
        tp: Name of the time-period subfolder, e.g. ``'2020-2024'``.

    Returns:
        None. A message is printed when the file is processed, not found, or
        has no ``<body>`` element.
    """
    # Create proper paths
    input_path = f"texts/{tp}/{text}"
    output_base_dir = f"processed/{tp}"

    # Ensure the output base directory exists
    os.makedirs(output_base_dir, exist_ok=True)

    # Read and parse the legal text; the context manager closes the input
    # file even if parsing fails.
    try:
        with codecs.open(input_path, 'r', 'utf-8') as source:
            soup = BeautifulSoup(source, 'html.parser')
    except FileNotFoundError:
        print(f"File not found: {input_path}")
        return

    # Only use body text. A document without <body> cannot be split; report
    # it instead of failing on None.
    body = soup.find('body')
    if body is None:
        print(f"No <body> element found, skipped: {input_path}")
        return
    paragraphs = body.find_all('p')

    i = 0  # Article counter
    j = 0  # Title counter
    k = 0  # Chapter counter
    l = 0  # Section counter

    # Strip the extension without assuming it is exactly five characters long.
    stem = os.path.splitext(text)[0]

    # Create output directory for this specific text
    text_output_dir = f"{output_base_dir}/{stem}"
    os.makedirs(text_output_dir, exist_ok=True)

    # Paragraph prefixes that mark the closing formula of the act.
    closing_prefixes = ('Done at Luxembourg', 'Done at Brussels', 'Done at Strasbourg')

    # Everything up to the first marker goes into the front file.
    out = open(f"{text_output_dir}/{stem}_front.txt", "w", encoding='utf-8')
    try:
        # An explicit iterator lets heading branches consume the following
        # paragraph. The first three paragraphs are skipped.
        # NOTE(review): presumably these are page-header boilerplate - confirm.
        paragraphs_iter = iter(paragraphs[3:])

        for paragraph in paragraphs_iter:
            # Normalise non-breaking spaces for matching only; the original
            # paragraph text is what gets written out.
            string = paragraph.text.replace(u'\xa0', u' ')

            # Catch whereas: switch to the recitals file. The 'Whereas:' line
            # itself falls through and is written into the new file.
            if string == 'Whereas:':
                out.close()
                out = open(f"{text_output_dir}/{stem}_Whereas.txt", "w", encoding='utf-8')

            # Structural headings: bump the counter and also skip the next
            # paragraph (presumably the heading's caption - confirm). The
            # default of None prevents StopIteration from escaping when the
            # heading is the last paragraph.
            if string in titles:
                j += 1
                # Resets chapter index
                k = 0
                next(paragraphs_iter, None)
                continue

            if string in chapters:
                k += 1
                next(paragraphs_iter, None)
                continue

            if string in sections:
                # NOTE(review): the section counter is never reset on a new
                # chapter or title, unlike the chapter counter - confirm that
                # this is intended.
                l += 1
                next(paragraphs_iter, None)
                continue

            # Catch ending: stop at the signature block / closing formula.
            if string == 'For the European Parliament' or string.startswith(closing_prefixes):
                break

            # An article heading starts a new output file; the heading line is
            # written as the first line of that file.
            if string in articles_enumerated:
                out.close()
                i += 1
                article_filename = (
                    f"{text_output_dir}/{stem}_Title_{j}_Chapter_{k}"
                    f"_Section_{l}_Article_{i:03d}.txt"
                )
                out = open(article_filename, "w", encoding='utf-8')

            out.write(paragraph.text + '\n')
    finally:
        # Always release the current output file, also on errors.
        out.close()

    print(f"Processed: {text}")

## Processing the text
This part executes the function to split the laws into articles and saves them in the folder `processed`. All articles of each law are stored in a separate folder that is labeled with the respective CELEX number.

In [6]:
# Split every collected law of the selected period into article files.
# Both `texts` and `tp` must already be defined by the cells above.
for text in texts:
    process_text(text=text, tp=tp)

Processed: EU_32021L0338.html
Processed: EU_32021L2118.html
Processed: EU_32021R0168.html
Processed: EU_32021R0240.html
Processed: EU_32021R0241.html
Processed: EU_32021R0267.html
Processed: EU_32021R0444.html
Processed: EU_32021R0522.html
Processed: EU_32021R0523.html
Processed: EU_32021R0690.html
Processed: EU_32021R0691.html
Processed: EU_32021R0692.html
Processed: EU_32021R0693.html
Processed: EU_32021R0694.html
Processed: EU_32021R0695.html
Processed: EU_32021R0696.html
Processed: EU_32021R0697.html
Processed: EU_32021R0782.html
Processed: EU_32021R0783.html
Processed: EU_32021R0785.html
Processed: EU_32021R0817.html
Processed: EU_32021R0818.html
Processed: EU_32021R0819.html
Processed: EU_32021R0821.html
Processed: EU_32021R0836.html
Processed: EU_32021R0888.html
Processed: EU_32021R0947.html
Processed: EU_32021R1056.html
Processed: EU_32021R1057.html
Processed: EU_32021R1058.html
Processed: EU_32021R1059.html
Processed: EU_32021R1060.html
Processed: EU_32021R1077.html
Processed: