# Ingesting Turkish Legislation Data from MulteciHukuku.net
This notebook extracts national legislation documents from the Turkish-language web resource, MulteciHukuku.net, for later use in a RAG pipeline.

## Step 1: Extracting Legislation Document URLs
In order to scrape national legislation document contents, we first need to assemble a list of available documents. We'll do so by extracting the title and href of all relevant documents listed on the "National Legislation" (*Ulusal Mevzuat*) page.

In [4]:
import os
import json
import requests
from bs4 import BeautifulSoup


def stringify_children(node):
    """
    Serialize the children of a node into a single string, maintaining order.

    Args:
        node: An object exposing a ``children`` iterable, e.g. a
            BeautifulSoup ``Tag``.

    Returns:
        The concatenation of ``str(child)`` for every child, in order.
        An empty string when the node has no children.
    """
    # Text nodes are already yielded by ``node.children``, so the node itself
    # must not be prepended: doing so emitted the node's full outer markup and
    # then the same content a second time whenever ``node.string`` was set.
    return ''.join(str(child) for child in node.children)


def extract_urls(url, timeout=30):
    """
    Extract the individual document urls from a given URL.

    Args:
        url: Address of the listing page to scrape.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``{"name": ..., "link": ...}`` dicts, one per legislation
        card that has both a heading and a link. The list is empty when the
        page does not contain the legislation container.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server responds with an HTTP error status.
    """
    # Without a timeout a stalled server would hang the notebook indefinitely.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Find the element that contains the legislation cards
    container = soup.find("div", id="nav-national-legislation")
    if container is None:
        return []

    urls = []
    for card in container.select('div[class*="ps-document-card-type-legislation"]'):
        heading = card.find("h2")
        anchor = card.find("a")
        link = anchor.get("href") if anchor is not None else None

        # Skip malformed cards rather than aborting the whole extraction.
        if heading is None or link is None:
            continue

        urls.append(
            {
                "name": heading.get_text(strip=True),
                "link": link,
            })

    return urls


def save_urls_to_file(filename, url):
    """
    Extract and save URLs to a JSON file.

    Args:
        filename: Base name (without extension) of the JSON file to write
            inside ``../data/processed/legislation_data``.
        url: Address of the listing page passed to ``extract_urls``.
    """
    extracted_urls = extract_urls(url)

    # Ensure the 'legislation_data' subdirectory exists, or create it if not
    output_dir = "../data/processed/legislation_data"
    os.makedirs(output_dir, exist_ok=True)

    # ``ensure_ascii=False`` writes Turkish characters verbatim, so the file
    # must be opened as UTF-8 explicitly; the platform default encoding may
    # not be able to represent them.
    output_path = os.path.join(output_dir, f"{filename}.json")
    with open(output_path, "w", encoding="utf-8") as outfile:
        json.dump(extracted_urls, outfile, indent=4, ensure_ascii=False)

In [5]:
# Listing page for national legislation ("Ulusal Mevzuat") documents.
NATIONAL_LEGISLATION_URL = "https://multecihukuku.net/mevzuat/?document-type=ulusal-mevzuat"

# Persist the extracted document titles and links as a JSON file.
save_urls_to_file(
    filename="National_Legislation_TR",
    url=NATIONAL_LEGISLATION_URL,
)

# For the next steps, also keep a plain list of the document links in memory.
legislation_urls = extract_urls(NATIONAL_LEGISLATION_URL)
urls = [entry["link"] for entry in legislation_urls]

In [6]:
# Sanity check: show the first extracted link to confirm the scrape worked
first_url = urls[0]
print(first_url)

https://multecihukuku.net/ulusal-mevzuat/acil-saglik-hizmetleri-yonetmeligi/


## Step 2: Extract Legislation Document Contents
Next, we'll define a function that loops through the list of URLs and extracts the contents of each legislation document. The structure of these pages varies, but every document includes at least one of the following:
- Block text
- Text split into one or more 'accordion' sections

Finally, we'll save the resulting documents in a JSON file for later use in a RAG pipeline.


In [7]:
def extract_legislation_data(url, timeout=30):
    """
    Extract legislation data from a given URL.

    Args:
        url: Address of a single legislation document page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with three keys:
        ``"title"`` is the page title up to the first ``" - "`` (an empty
        string when the page has no usable ``<title>``);
        ``"content"`` is a list holding the block text as an HTML string (if
        present) followed by one ``{"section", "content"}`` dict per
        accordion item;
        ``"url"`` is the address that was scraped.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server responds with an HTTP error status.
    """
    # Without a timeout a single stalled page would hang the whole batch.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of storing an error page as legislation.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Extract the page title up to the specified delimiter; tolerate pages
    # whose <title> is missing or has no single string value.
    title_tag = soup.title
    raw_title = title_tag.string if title_tag is not None else None
    page_title = raw_title.split(" - ")[0] if raw_title else ""

    # Initialize a list to hold all content
    all_content = []

    # Find the element containing the legislation block text and add it to all_content
    block_text = soup.find("section", class_="paragraph-text")
    if block_text is not None:
        all_content.append(stringify_children(block_text))

    # Find accordion elements and add their content to all_content
    accordion = soup.find("div", class_="accordion")
    if accordion is not None:
        for card in accordion.find_all("div", class_="accordion-item"):
            title_element = card.find("button")
            content_element = card.find("div", class_="accordion-body")

            # A missing button or body yields an empty string rather than
            # aborting the extraction of the whole document.
            title = title_element.get_text(strip=True) if title_element is not None else ""
            content_html = stringify_children(content_element) if content_element is not None else ""

            # Each accordion section is stored as its own dict
            all_content.append({"section": title, "content": content_html})

    # Combine all content and attach the source URL
    return {
        "title": page_title,
        "content": all_content,
        "url": url,
    }



def extract_from_urls(urls):
    """
    Scrape every URL in ``urls`` and collect the results in order.

    Returns a list with one ``extract_legislation_data`` result per input URL.
    """
    return [extract_legislation_data(page_url) for page_url in urls]

def save_data_to_file(filename, urls):
    """
    Extract and save legislation content from a list of URLs to a JSON file.

    Args:
        filename: Base name (without extension) of the JSON file to write
            inside ``../data/processed/legislation_data``.
        urls: Iterable of document URLs passed to ``extract_from_urls``.
    """
    data = extract_from_urls(urls)

    # Ensure the 'legislation_data' subdirectory exists, or create it if not
    output_dir = "../data/processed/legislation_data"
    os.makedirs(output_dir, exist_ok=True)

    # ``ensure_ascii=False`` writes Turkish characters verbatim, so the file
    # must be opened as UTF-8 explicitly; the platform default encoding may
    # not be able to represent them.
    output_path = os.path.join(output_dir, f"{filename}.json")
    with open(output_path, "w", encoding="utf-8") as outfile:
        json.dump(data, outfile, indent=4, ensure_ascii=False)


In [8]:
# Scrape every collected document URL and persist the contents as JSON.
save_data_to_file(
    filename="National_Legislation_Content_TR",
    urls=urls,
)