# Adding Hints to the Family Tree Proyect

In [1]:
%pip install BeautifulSoup4
%pip install markdownify

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import os
import requests
import time
from typing import Optional
from typing import Tuple
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from markdownify import markdownify as md

In [5]:
host = 'https://www.familysearch.org'
base_dir = 'data/raw'
bs_parser = 'html.parser'
delay_seconds = 5
site = 'source-linker-learning-center'

if not os.path.exists(base_dir):
    os.makedirs(base_dir)

In [6]:
# Function to get the article page from URL and return status code and response text as tuple of strings
def get_page(
    url: str,
    delay_seconds: int = 30,
    headers: Optional[dict[str, str]] = None,
    encoding: str = "utf-8",
    timeout: int = 30,
) -> Tuple[str, str]:
    if headers is None:
        headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-US,en;q=0.9",
        "Cache-Control": "no-cache",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    }
    # Get Response from URL and return status code and response text
    response = requests.get(url, headers=headers, timeout=timeout)
    time.sleep(delay_seconds)
    if encoding:
        response.encoding = encoding
    return response.status_code, response.text


# Get Response from URL with raise exception if status code is not 200
def get_response_from_url(url: str, delay_seconds: int = 30) -> str:
    status_code, response = get_page(url, delay_seconds)
    if status_code != 200:
        raise Exception(f"Failed to get response from {url}")
    return response

In [7]:
def _is_article(url):
    """A talk URL has 6 components (first component is empty) and last component does not end in -session.
    """  
    path_components = urlparse(url).path.split('/')
    return len(path_components) == 6 and not path_components[-2].endswith('fieldops')

In [8]:
def get_article_urls(base_url):
    """Find and return article URLs from the base URL."""
    dir_html = get_response_from_url(base_url, delay_seconds)
    soup = BeautifulSoup(dir_html, bs_parser)
    return set(urljoin(base_url, a['href']) for a in soup.find_all('a',href=True)\
        if _is_article(urljoin(base_url, a['href'])))

In [9]:
# Get URLs from the host page and save them to an array
dir_url = f"{host}/en/help/helpcenter/{site}"
urls = get_article_urls(dir_url)
urls = list(urls)
print(dir_url, len(urls))

https://www.familysearch.org/en/help/helpcenter/source-linker-learning-center 22


In [10]:
# urls = [
#     'https://example.com/page1',
#     'https://example.com/page2',
#     # añade más URLs según sea necesario
# ]

In [11]:
def clean_html(soup):
    for img in soup.find_all("img"):
        img.decompose()
    for span in soup.find_all("div", class_="Enhancement"):
        span.decompose()
    for div in soup.find_all("div", class_="ArticlePage-byline"):
        div.decompose()

    #     # Limpiar espacios en blanco adicionales en el HTML
    # for tag in soup.find_all(True):  # True encuentra todas las etiquetas
    #     tag.string = tag.get_text(strip=True)
    
    return soup

In [12]:
def extract_information(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, bs_parser)
        content = soup.find("div", class_="ArticlePage-wrapper")
        if content:
            cleaned_content = clean_html(content)
            # Eliminar líneas vacías adicionales
            cleaned_content = str(cleaned_content)
            cleaned_content = "\n".join(line.strip() for line in cleaned_content.splitlines() if line.strip())
            return md(cleaned_content, heading_style="ATX")
        return "No content found"
    except requests.exceptions.RequestException as e:
        return f"Error: {e}"

In [13]:
def faq_main_page(base_url):
    try:
        dir_html = get_response_from_url(base_url, delay_seconds)
        soup = BeautifulSoup(dir_html, bs_parser)
        content = soup.find_all("div", class_="PromoFAQ-content")
        if content:
            # Convertir el contenido a texto antes de pasarlo a `md`
            content_str = "\n".join([str(item) for item in content])
            return md(str(content_str), heading_style="ATX")
        return "No content found"
    except requests.exceptions.RequestException as e:
        return f"Error: {e}"

In [14]:
def get_article_path(i):
    """Return the file path for saving the talk."""
    path_components = urlparse(urls[i]).path.split('/')
    title = [word for word in path_components[-1].split('-')]
    if len(title) > 3:
        return f"{title[0]}-{title[1]}-{title[2]}-{title[3]}.md"
    return f"{title[0]}-{title[1]}-{title[2]}.md"
    # year, month, title = path_components[3:6]
    #path_components[3:6] = ['2024', '04', '11oaks']
    # return os.path.join(base_dir, f"{year}-{month}-{title}.json")
    # os.path.join(base_dir) = 'data/raw'

In [15]:
def one_file_content(content, filename = "data.md"):
    file_path = os.path.join(base_dir, filename)
    with open(file_path, "w", encoding="utf-8") as file:
        for idx, info in enumerate(content):
            file.write(f"### Section {idx+1}\n")
            file.write(f"{info}")

In [16]:
def create_readme(full_content, filename="data.md"):
    one_file_content(full_content)
    for idx, content in enumerate(full_content):
        filename = get_article_path(idx)
        if os.path.exists(filename):
            continue
        print("    ", filename)
        file_path = os.path.join(base_dir, filename)
        with open(file_path, "w", encoding="utf-8") as file:  
            file.write(f"{content}")

In [17]:
# Extraer información de todas las URLs
all_content = [extract_information(url) for url in urls]
all_content.append(faq_main_page(dir_url)) 
urls.append(dir_url)
# Crear archivo markdown con el contenido extraído
create_readme(all_content)

     how-tagging-works.md
     understanding-the-source-linker.md
     creating-new-people-in.md
     not-your-family-find.md
     how-do-i-handle.md
     attaching-a-source-to.md
     viewing-the-source-image.md
     what-to-do-when.md
     new-look-and-feel.md
     what-are-the-benefits.md
     editing-information-in-your.md
     viewing-the-record-or.md
     understanding-the-focus-person.md
     view-records-and-tree.md
     entering-a-reason-statement.md
     what-is-a-source.md
     using-drag-and-drop.md
     what-are-record-hints.md
     using-source-linker-to.md
     attaching-a-source-to.md
     detaching-sources-that-should.md
     determining-if-a-source.md
     source-linker-learning-center.md
