## Data Collection

#### Project Description
In this notebook, we collect using crawling and clean data for building a chatbot. The data will be retrieved from multiple web sources, cleaned, and prepared for further use in training a language model.


In [10]:
# Magic commands
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv

#### Importing Libraries

In [11]:
import os
import requests
import time
from typing import Optional
from typing import Tuple
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from markdownify import markdownify as md
import pandas as pd
import json

#### Configuration and Setup for Data Collection

This block of code sets up the basic configuration needed for the data collection process. Additionally, it ensures that the directory for storing data exists, creating it if necessary.


In [65]:
host = 'https://www.familysearch.org'
base_dir = '../data/raw'
bs_parser = 'html.parser'
delay_seconds = 5
site = 'source-linker-learning-center'
site2 = 'article/unfinished-attachments-how-to-add-others-on-record-to-family-tree#attaching-individuals-from-the-record-to-a-new-profile-in-family-tree'

if not os.path.exists(base_dir):
    os.makedirs(base_dir)

Get the article page from URL and return status code and response text as tuple of strings

In [13]:
def get_page(
    url: str,
    delay_seconds: int = 30,
    headers: Optional[dict[str, str]] = None,
    encoding: str = "utf-8",
    timeout: int = 30,
) -> Tuple[str, str]:
    if headers is None:
        headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-US,en;q=0.9",
        "Cache-Control": "no-cache",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    }
    # Get Response from URL and return status code and response text
    response = requests.get(url, headers=headers, timeout=timeout)
    time.sleep(delay_seconds)
    if encoding:
        response.encoding = encoding
    return response.status_code, response.text


# Get Response from URL with raise exception if status code is not 200
def get_response_from_url(url: str, delay_seconds: int = 30) -> str:
    status_code, response = get_page(url, delay_seconds)
    if status_code != 200:
        raise Exception(f"Failed to get response from {url}")
    return response

In [14]:
def _is_article(url):
    """A talk URL has 6 components (first component is empty) and last component does not end in -session.
    """  
    path_components = urlparse(url).path.split('/')
    return len(path_components) == 6 and not path_components[-2].endswith('fieldops')

In [15]:
def get_article_with_urls(base_url):
    """
    Find and return article URLs from the base URL, recursively following links found in each page.
    
    Parameters:
        - base_url: The URL to fetch and scan for articles.
        - visited: A set of URLs that have already been visited to avoid repetition.
        - max_depth: Maximum recursion depth to prevent infinite loops.
        - depth: Current recursion depth.
        
    Returns:
        A set of all found URLs.
    """
    # Set to store all unique URLs
    article_urls = set()
    dir_html = get_response_from_url(base_url, delay_seconds)
    soup = BeautifulSoup(dir_html, bs_parser)
    
    # Step 1: Collect URLs from the main page
    for a in soup.find_all('a', href=True):
        url = urljoin(base_url, a['href'])
        if _is_article(url):
            article_urls.add(url)
    
    # Step 2: For each article URL, fetch and collect the links inside those pages
    for article_url in list(article_urls):  # Create a list to avoid modifying the set while iterating
        inner_html = get_response_from_url(article_url, delay_seconds)
        if not inner_html:
            continue  # Skip if the inner page couldn't be fetched
        
        inner_soup = BeautifulSoup(inner_html, bs_parser)
        for a in inner_soup.find_all('a', href=True):
            # Check if the 'href' attribute exists and is not empty
            if 'href' in a.attrs and a['href']:
                inner_url = urljoin(article_url, a['href'])
                if _is_article(inner_url):
                    article_urls.add(inner_url)
    
    return article_urls

In [16]:
def get_article_main_urls(base_url):
    """Find and return article URLs from the base URL."""
    dir_html = get_response_from_url(base_url, delay_seconds)
    soup = BeautifulSoup(dir_html, bs_parser)
    return set(urljoin(base_url, a['href']) for a in soup.find_all('a',href=True)\
        if _is_article(urljoin(base_url, a['href'])))

Get URLs from the host page and save them to an array

In [17]:
# Website 1: Get URLs with internal links
dir_url = f"{host}/en/help/helpcenter/{site}"
urls_with_links = get_article_with_urls(dir_url)
# Website 2: Get URLs without following internal links
dir_url2 = f"{host}/en/help/helpcenter/{site2}"
urls_no_links = get_article_main_urls(dir_url2)
# Combine both sets of URLs
urls = urls_with_links.union(urls_no_links)
print(dir_url, len(urls))

https://www.familysearch.org/en/help/helpcenter/source-linker-learning-center 40


In [18]:
urls = list(urls)
print(len(urls))

40


In [19]:
# Set the display options for Pandas
pd.set_option('display.width', 1000)  # Adjust the overall width
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_colwidth', None)  # Disable column width limitation
pd.set_option('display.expand_frame_repr', False)  # Prevent line wrapping of the DataFrame
# Create a DataFrame to hold the URLs
df = pd.DataFrame(urls, columns=["URL"])
# Print the DataFrame
df

Unnamed: 0,URL
0,https://www.familysearch.org/en/help/helpcenter/article/how-do-i-attach-record-hints-in-family-tree
1,https://www.familysearch.org/en/help/helpcenter/article/how-do-i-attach-source-from-source-box
2,https://www.familysearch.org/en/help/helpcenter/article/attaching-a-source-to-a-person-with-multiple-spouses
3,https://www.familysearch.org/en/help/helpcenter/article/how-do-i-attach-the-other-people-in-a-record-hint-in-family-tree
4,https://www.familysearch.org/en/help/helpcenter/article/detaching-sources-that-should-not-be-attached
5,https://www.familysearch.org/en/help/helpcenter/article/understanding-the-focus-person-in-source-linker
6,https://www.familysearch.org/en/help/helpcenter/article/how-do-i-add-missing-family-members-to-family-tree-from-record-hints
7,https://www.familysearch.org/en/help/helpcenter/article/how-do-i-create-a-new-source-in-source-box
8,https://www.familysearch.org/en/help/helpcenter/article/using-source-linker-to-attach-sources
9,https://www.familysearch.org/en/help/helpcenter/article/how-do-i-correct-parent-child-relationships-in-family-tree


Save the information in JSON in the processed folder

In [20]:
# Save the DataFrame to a CSV file
processed_dir = '../data/processed'
# Create the directory if it doesn't exist
if not os.path.exists(processed_dir):
    os.makedirs(processed_dir)

# Convert URLs into a list of dictionaries
urls_list = [{"url": url} for url in urls]  # Each URL as a dictionary

# Step 1: Save URLs to a JSON file
def save_urls_to_json(url_list, filename):
    with open(filename, "w", encoding="utf-8") as json_file:
        json.dump(url_list, json_file, ensure_ascii=False, indent=4)

json_file_path = os.path.join(processed_dir, 'downloaded_urls.json')
save_urls_to_json(urls_list, json_file_path)
print(f"URLs saved to {json_file_path} as JSON.")

URLs saved to ../data/processed\downloaded_urls.json as JSON.


Clean the content removing the unecessary tags and return a clean data

In [138]:
def clean_html(soup):
    """
    Clean HTML from unwanted elements.
    """  
    # # Attempt to find and extract the title if it's within the body
    # title = soup.find("h1", class_="ArticlePage-headline")
    # if title:
    #     title_text = title.get_text(" ", strip=True)
    #     title.replace_with(f"# {title_text}\n")
        
    content_container = soup.find("div", class_="KnowledgeArticleBodyItem-body")
    if content_container:
        for img in content_container.find_all("img"):
            img.decompose()
        videos = content_container.find_all("div", class_="Enhancement")
        if videos:
            for video in videos:
                iframe = video.find("iframe")
                if iframe:
                    src = iframe.get("src")
                    video.replace_with(f"Video: {src}\n")
                else:
                    video.decompose()
        for div in content_container.find_all("div", class_="ArticlePage-byline"):
            div.decompose()
        for div in content_container.find_all("div", class_="ArticlePage-actions"):
            div.decompose()
        # Get the text from the paragraphs and replace the paragraphs with the text
        # paragraphs = content_container.find_all("p")
        # for paragraph in paragraphs:
        #     new_text = paragraph.get_text(" ", strip=True)  # Add space between words
        #     paragraph.replace_with(new_text)
    return soup

In [130]:
def extract_information(url):
    """
    Extract information from the URL.
    """  
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, bs_parser)
        content = soup.find("div", class_="ArticlePage-wrapper")
        # Extract the title of the page
        title_tag = soup.find("h1", class_="ArticlePage-headline")
        title = title_tag.get_text() if title_tag else "Untitled Article"
        # Extract the published date of the page
        date_tag = soup.find("div", class_="ArticlePage-datePublished")
        date = date_tag.get_text() if date_tag else "Unknown Publication Date"
        if content:
            cleaned_content = clean_html(content)
            cleaned_content = str(cleaned_content)
            cleaned_content = "\n".join(line.strip() for line in cleaned_content.splitlines() if line.strip())
            markdown_content = md(cleaned_content, heading_style="ATX")
            # Return the cleaned markdown content along with metadata (title, url, date)
            return {
                "title": title,
                "date": date,  
                "url": url,
                "content": markdown_content
            }
        return None
    except requests.exceptions.RequestException as e:
        return {"error": str(e)}

In [120]:
def faq_main_page(base_url):
    try:
        dir_html = get_response_from_url(base_url, delay_seconds)
        soup = BeautifulSoup(dir_html, bs_parser)
        content = soup.find_all("div", class_="PromoFAQ-content")
        if content:
            # Convert content to a structured list
            faq_data = [item.get_text(strip=True) for item in content]
            return faq_data
        return "No FAQ content found"
    except requests.exceptions.RequestException as e:
        return f"Error: {e}"

In [121]:
def get_article_path(i,url):
    """
    Return the file path for saving the article.
    """
    path_components = urlparse(url).path.split('/')
    title = [word for word in path_components[-1].split('-')]
    if len(title) > 3:
        return f"{title[0]}-{title[1]}-{title[2]}-{title[3]}.md"
    return f"{title[0]}-{title[1]}-{title[2]}.md"

In [122]:
def create_markdown_file(content, metadata, filename):
    """
    Create a Markdown file with YAML front matter at the top.
    """
    file_path = os.path.join(base_dir, filename)
    with open(file_path, "w", encoding="utf-8") as file:
        # Write YAML front matter
        file.write(f"---\n")
        file.write(f"title: \"{metadata['title']}\"\n")
        file.write(f"date: \"{metadata['date']}\"\n")
        file.write(f"url: \"{metadata['url']}\"\n")
        file.write(f"---\n\n")
        
        # Write content
        file.write(content)

In [123]:
def one_file_content(content, filename = "data.md"):
    file_path = os.path.join(base_dir, filename)
    with open(file_path, "w", encoding="utf-8") as file:
        for idx, info in enumerate(content):
            file.write(f"### Section {idx+1}\n")
            file.write(f"{info}")

In [124]:
def save_faq_to_json(faq_content, filename="faq_test_data.json"):
    """
    Save the extracted FAQ content to a JSON file for testing.
    """
    file_path = os.path.join("../data/faq_test_data", filename)
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    
    with open(file_path, "w", encoding="utf-8") as json_file:
        json.dump(faq_content, json_file, indent=4)
    
    print(f"FAQ content saved to {file_path}")

In [125]:
def create_readme(full_content, urls, filename="data.md"):
    """
    Create the markdown files with content and metadata.
    """
    for idx, content_dict in enumerate(full_content):
        # Check if the content is for an article (dict) or FAQ content (list)
        if isinstance(content_dict, dict) and not content_dict.get("error"):
            # Create a Markdown file for articles
            filename = get_article_path(idx, urls[idx])
            if os.path.exists(filename):
                continue
            print("Creating:", filename)
            create_markdown_file(content_dict['content'], content_dict, filename)
        elif isinstance(content_dict, list):
            # Save FAQ content to a JSON file for testing
            save_faq_to_json(content_dict)


Extract information form the URLs

In [139]:
all_content = [extract_information(url) for url in urls]
faq_content = faq_main_page(dir_url)  # Fetch FAQ content
all_content.append(faq_content)
urls.append(dir_url) 
# Create the markdown files and  the list of urls to be used in the next step.
create_readme(all_content, urls)

Creating: how-do-i-attach.md
Creating: how-do-i-attach.md
Creating: attaching-a-source-to.md
Creating: how-do-i-attach.md
Creating: detaching-sources-that-should.md
Creating: understanding-the-focus-person.md
Creating: how-do-i-add.md
Creating: how-do-i-create.md
Creating: using-source-linker-to.md
Creating: how-do-i-correct.md
Creating: how-do-i-print.md
Creating: new-look-and-feel.md
Creating: what-to-do-when.md
Creating: viewing-the-record-or.md
Creating: how-do-i-handle.md
Creating: entering-a-reason-statement.md
Creating: creating-new-people-in.md
Creating: unfinished-attachments-how-to.md
Creating: view-records-and-tree.md
Creating: how-do-i-create.md
Creating: attaching-a-source-to.md
Creating: what-are-the-benefits.md
Creating: not-your-family-find.md
Creating: how-do-i-use.md
Creating: what-are-record-hints.md
Creating: understanding-the-source-linker.md
Creating: from-my-source-box.md
Creating: how-do-i-copy.md
Creating: using-drag-and-drop.md
Creating: how-do-i-add.md
Creati