In [28]:
import os
import json
import requests
from bs4 import BeautifulSoup


def stringify_children(node):
    """
    Serialize the children of a node into a single string, in document order.

    Only the children are rendered; the node's own opening/closing markup is
    never part of the result.

    Args:
        node: Any object exposing an iterable ``children`` attribute
            (e.g. a BeautifulSoup ``Tag``).

    Returns:
        str: The concatenation of ``str(child)`` for every child, or an empty
        string when the node has no children.
    """
    # Do not prepend `node` itself when `node.string` is set: the text already
    # appears among `node.children`, so including the node would emit the
    # content twice (once wrapped in the node's own tags, once as the child).
    return ''.join(str(child) for child in node.children)


def extract_urls(url, timeout=30):
    """
    Extract the individual document urls from a given URL.

    Args:
        url (str): Address of the listing page to scrape.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        list[dict]: One ``{"name": ..., "link": ...}`` entry per legislation
        card, in page order. Empty when the card container is not on the page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Without a timeout a stalled server would block the notebook forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Find the element that contains the legislation cards
    container = soup.find("div", class_="row gy-32 targeted-row")
    if container is None:
        return []

    urls = []
    for card in container.select('div[class*="ps-document-card-type-legislation"]'):
        heading = card.find("h2")
        anchor = card.find("a", href=True)
        # A card without a title or a link cannot produce a usable entry.
        if heading is None or anchor is None:
            continue

        urls.append(
            {
                "name": heading.get_text(strip=True),
                "link": anchor["href"],
            })

    return urls


def save_urls_to_file(filename, url, output_dir="../data/processed/legislation_data"):
    """
    Extract the document URLs from a listing page and save them as JSON.

    Args:
        filename (str): Name of the output file, without the ``.json`` suffix.
        url (str): Listing page passed on to ``extract_urls``.
        output_dir (str): Directory the JSON file is written to; created if
            it does not exist yet.

    Returns:
        None. The result is written to ``<output_dir>/<filename>.json``.
    """
    extracted_urls = extract_urls(url)

    # Ensure the output directory exists, or create it if not
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, f"{filename}.json")

    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding must be pinned instead of depending on the platform default.
    with open(output_path, "w", encoding="utf-8") as outfile:
        json.dump(extracted_urls, outfile, indent=4, ensure_ascii=False)

In [29]:
# Scrape the national-legislation listing and store its document links as JSON.
save_urls_to_file(
    filename="National_Legislation_TR",
    url="https://multecihukuku.net/mevzuat/?document-type=ulusal-mevzuat",
)

In [44]:
import os
import json
import requests
from bs4 import BeautifulSoup


def stringify_children(node):
    """
    Serialize the children of a node into a single string, in document order.

    Only the children are rendered; the node's own opening/closing markup is
    never part of the result.

    Args:
        node: Any object exposing an iterable ``children`` attribute
            (e.g. a BeautifulSoup ``Tag``).

    Returns:
        str: The concatenation of ``str(child)`` for every child, or an empty
        string when the node has no children.
    """
    # Do not prepend `node` itself when `node.string` is set: the text already
    # appears among `node.children`, so including the node would emit the
    # content twice (once wrapped in the node's own tags, once as the child).
    return ''.join(str(child) for child in node.children)


def extract_legislation_data(url, timeout=30):
    """
    Extract legislation data from a given URL.

    Args:
        url (str): Address of the legislation page to scrape.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        list[dict]: A single-element list holding a dict with the keys
        ``"title"`` (page title up to the first ``" - "``),
        ``"block_content"`` (HTML of the ``paragraph-text`` section) and
        ``"accordion_content"`` (list of ``{"section", "content"}`` dicts,
        one per accordion item). Parts missing from the page are returned as
        empty strings / an empty list.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Without a timeout a stalled server would block the notebook forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page as legislation.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Extract the page title up to the specified delimiter; the <title> tag
    # may be absent or may not hold a single string.
    title_text = soup.title.string if soup.title is not None else None
    page_title = title_text.split(" - ")[0] if title_text else ''

    # Find the element containing the introductory legislation text
    block_text = soup.find("section", class_="paragraph-text")
    block_text_content = stringify_children(block_text) if block_text is not None else ''

    # Find the accordion holding the individual sections of the legislation
    accordion = soup.find("div", class_="accordion")
    cards = accordion.find_all("div", class_="accordion-item") if accordion is not None else []

    card_data = []
    for card in cards:
        # The section heading is the text of the card's toggle button
        title_element = card.find("button")
        title = title_element.get_text(strip=True) if title_element is not None else ''

        # The section body is kept as an HTML string
        content_element = card.find("div", class_="accordion-body")
        content_html = stringify_children(content_element) if content_element is not None else ''

        card_data.append(
            {
                "section": title,
                "content": content_html,
            })

    return [{
        "title": page_title,
        "block_content": block_text_content,
        "accordion_content": card_data
    }]

In [46]:
# Preview the scraped structure for a single legislation page.
extract_legislation_data(
    "https://multecihukuku.net/ulusal-mevzuat/ailenin-korunmasi-ve-kadina-yonelik-siddetin-onlenmesi-kanunu-ilgili-maddeler/"
)

[{'title': 'Ailenin Korunması ve Kadına Karşı Şiddetin Önlenmesine Dair Kanun',
  'block_content': '\n<div class="container">\n<div class="row">\n<div class="col-12 m-auto col-lg-10">\n<p><strong>Kanun Numarası: 6284<br/>\n</strong><strong>Kabul Tarihi: 8/3/2012<br/>\n</strong><strong>Yayımlandığı R.Gazete: Tarih: 20/3/2012\xa0\xa0\xa0\xa0 Sayı:\xa0 28239<br/>\n</strong><strong>Yayımlandığı Düstur: Tertip: 5\xa0 Cilt: 52</strong></p>\n</div>\n</div>\n</div>\n',
  'accordion_content': [{'section': 'BİRİNCİ BÖLÜM - Amaç, Kapsam, Temel İlkeler ve Tanımlar',
    'content': '<p><strong>Amaç, kapsam ve temel ilkeler</strong></p>\n<p><strong>MADDE 1 –</strong> (1) Bu Kanunun amacı; şiddete uğrayan veya şiddete uğrama tehlikesi bulunan kadınların, çocukların, aile bireylerinin ve tek taraflı ısrarlı takip mağduru olan kişilerin korunması ve bu kişilere yönelik şiddetin önlenmesi amacıyla alınacak tedbirlere ilişkin usul ve esasları düzenlemektir.</p>\n<p>(2) Bu Kanunun uygulanmasında ve gereke