In [1]:
from bs4 import BeautifulSoup
import re
import os
import requests
import time


In [None]:
# Speaker slugs, exactly as they appear in the speaker-page URL
# (<BASE_URL>/study/general-conference/speakers/<slug>).
NAMES = [
    "russell-m-nelson",
    "dallin-h-oaks",
    "henry-b-eyring",
    "jeffrey-r-holland",
    "dieter-f-uchtdorf",
    "david-a-bednar",
    "quentin-l-cook",
    "d-todd-christofferson",
    "neil-l-andersen",
    "ronald-a-rasband",
    "gary-e-stevenson",
    "dale-g-renlund",
    "gerrit-w-gong",
    "ulisses-soares",
    "patrick-kearon",
]

# Local output directory; one sub-folder per speaker is created beneath it.
BASE_FOLDER = "speakers"

# Site root, prepended to the relative hrefs found on the speaker pages.
BASE_URL = "https://www.churchofjesuschrist.org"

In [10]:
class SpeakerScraper:
    """Builds an index of talks for a list of speakers.

    For each speaker slug the speaker's page is downloaded and two lists are
    derived from it: the talk URLs and the ``.txt`` filenames the talks will
    be stored under. Results are kept in ``self.info`` as
    ``{name: {"filenames": [...], "links": [...]}}``.

    NOTE(review): links are taken from every ``<a>`` tag while filenames are
    built from paired ``<h6>``/``<h4>`` tags, so the two lists are only
    aligned when the page has exactly one anchor per talk — confirm against
    the live markup.
    """

    # Seconds to wait on the server before a request is abandoned; without a
    # timeout a stalled connection would hang the notebook cell indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, names):
        """Stores the speaker slugs; ``info`` is filled in by ``scrape()``."""
        self.names = names
        self.info = {}

    def scrape(self):
        """Main function: fetches every speaker page and records links/filenames."""

        for name in self.names:
            print(f"Scraping {name}...")
            # Fetch the pertinent content from the speaker's page
            links, filenames = self._extract_html_content(name)
            # Add the speaker's info to the dictionary
            self.info[name] = {
                "filenames": filenames,
                "links": links
            }
            time.sleep(1) # Respect scraping speed

    def prep(self):
        """Prepares the directory structure and empty text files for each speaker.

        Speakers that have not been scraped yet are skipped with a message
        instead of raising ``KeyError``.
        """

        for name in self.names:
            if name not in self.info:
                print(f"Skipping {name}: no scraped data. Run scrape() first.")
                continue
            self._prepare_empty_txts(name, self.info[name]["filenames"])

    def _fetch_html(self, url):
        """Fetches a page and returns it parsed with BeautifulSoup.

        Returns the parsed talk-list ``<div>`` when it is present, the parsed
        full page when it is not, and ``None`` when the request fails (network
        error, timeout, or a non-200 status).
        """

        headers = {"User-Agent": "Mozilla/5.0"}  # Mimic a real browser

        try:
            response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as error:
            # A dropped connection or timeout must not abort the whole run
            print(f"Failed to fetch {url}: {error}")
            return None

        if response.status_code != 200:
            print(f"Failed to fetch {url}: HTTP {response.status_code}")
            return None

        # Decode with the auto-detected encoding; detection can yield None
        # (e.g. for an empty body), so fall back to UTF-8 in that case
        encoding = response.apparent_encoding or "utf-8"
        html_content = response.content.decode(encoding, errors="replace")

        soup = BeautifulSoup(html_content, "lxml")

        # This is the specific div expected to contain the list of the speaker's talks
        main_section = soup.find("div", class_="libraryGridLayout-AAbjO")

        if main_section is not None:
            # Re-parse just that section so later searches are limited to it
            soup = BeautifulSoup(str(main_section), "lxml")
        else:
            print("Warning: <div class='libraryGridLayout-AAbjO'> not found. Using full page instead.")

        return soup

    def _clean_text(self, text):
        """Collapses every run of whitespace (tabs, newlines, spaces) to one space and trims."""

        # \s+ already covers \t, \n and \r, so a single substitution suffices
        return re.sub(r"\s+", " ", text).strip()

    def _format_conference_session(self, text):
        """Converts a "Month YYYY" session label to a compact code.

        "April 2000" -> "2000ac" and "October 2000" -> "2000oc" (the trailing
        "c" is for conference). Text without an April/October + 4-digit year
        is returned unchanged.
        """

        match = re.search(r"(April|October) (\d{4})", text)
        if match is None:
            return text

        month, year = match.groups()
        letter = "a" if month == "April" else "o"
        return f"{year}{letter}c"

    def _format_title(self, text):
        """Turns a talk title into a lowercase, hyphen-separated slug.

        Example: "Be Ye Therefore Perfect—Eventually" -> "be-ye-therefore-perfect-eventually"
        """

        text = text.lower()
        text = re.sub(r"[—–]", "-", text) # Convert em and en dashes to hyphens
        text = re.sub(r"[^\w\s-]", "", text) # Remove punctuation (keep words, hyphens, and spaces)
        return text.replace(" ", "-")

    def _extract_links(self, soup):
        """Returns the href of every <a> tag in ``soup``, in reversed document order.

        Hrefs starting with "/" are made absolute by prefixing ``BASE_URL``;
        a tag without an href contributes an empty string.
        """

        links = []
        for tag in soup.find_all("a"):
            href = tag.get("href", "")
            links.append(BASE_URL + href if href.startswith("/") else href)

        # Reverse so the order matches the filenames list (oldest talk to newest)
        links.reverse()
        return links

    def _extract_filenames(self, soup):
        """Builds "<session>_<title>.txt" filenames from the <h6>/<h4> tags of ``soup``.

        The i-th <h6> (conference session) is paired with the i-th <h4> (talk
        title); surplus tags on either side are ignored. The result is
        returned in reversed document order (oldest talks to newest).
        """

        sessions = [
            self._format_conference_session(self._clean_text(tag.get_text(strip=True)))
            for tag in soup.find_all("h6")
        ]
        titles = [
            self._format_title(self._clean_text(tag.get_text(strip=True)))
            for tag in soup.find_all("h4")
        ]

        filenames = [f"{session}_{title}.txt" for session, title in zip(sessions, titles)]
        filenames.reverse()
        return filenames

    def _extract_html_content(self, name):
        """Fetches a speaker's page and returns ``(links, filenames)``.

        Both lists are empty when the page could not be fetched.
        """

        # Construct the URL for the speaker's page
        url = f'{BASE_URL}/study/general-conference/speakers/{name}?lang=eng'

        soup = self._fetch_html(url)
        if soup is None:
            print(f"Failed to parse HTML for {name}.")
            return [], []

        filenames = self._extract_filenames(soup)
        links = self._extract_links(soup)
        return links, filenames

    def _prepare_empty_txts(self, name, filenames):
        """Creates ``BASE_FOLDER/<name>/`` and an empty file for each filename.

        Files are opened in "w" mode, so any existing file of the same name
        is truncated to zero length.
        """

        named_folder = os.path.join(BASE_FOLDER, name)
        # Create the folder if it doesn't exist
        os.makedirs(named_folder, exist_ok=True)

        # Create empty .txt files
        for filename in filenames:
            file_path = os.path.join(named_folder, filename)
            with open(file_path, "w", encoding="utf-8"):
                pass  # Opening for writing is enough to create the file

        # Print results
        print(f"Created {len(filenames)} text files in '{name}' folder.")

In [23]:
class TalkScraper:
    """Downloads every talk of one speaker and writes each to its text file.

    ``speaker_dict`` must hold two equally long lists under the keys
    ``"links"`` (talk URLs) and ``"filenames"`` (target file names); the
    i-th link is saved under the i-th filename in ``BASE_FOLDER/<speaker_name>/``.
    """

    # Seconds to wait on the server before a request is abandoned; without a
    # timeout a stalled connection would hang the notebook cell indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, speaker_name, speaker_dict):
        """Stores the speaker slug and its links/filenames dictionary."""
        self.name = speaker_name
        self.info = speaker_dict

    def scrape(self):
        """Main function to scrape and fill text files for each talk.

        Does nothing (after printing a warning) when the links and filenames
        lists differ in length. A talk whose page cannot be fetched is
        skipped; the remaining talks are still processed.
        """

        filenames = self.info["filenames"]
        links = self.info["links"]

        # Links and filenames are paired by position, so they must line up
        if len(links) != len(filenames):
            print(f"Warning: Number of links ({len(links)}) does not match number of filenames ({len(filenames)}).")
            return

        for link, filename in zip(links, filenames):
            print(f"Scraping {link}...")
            soup = self._fetch_html(link)

            if soup is not None:
                # Parse the talk content and save it to the corresponding text file
                self._save_talk_content(filename, self._parse_talk(soup))

            time.sleep(1) # Respect scraping speed

    def _fetch_html(self, url):
        """Fetches a talk page and returns its narrowest available content as soup.

        Preference order: ``<div class="body">`` inside
        ``<section id="content">``, then the section itself, then the full
        page. Returns ``None`` when the request fails (network error,
        timeout, or a non-200 status).
        """

        headers = {"User-Agent": "Mozilla/5.0"}  # Mimic a real browser

        try:
            response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as error:
            # One unreachable talk must not abort the rest of the run
            print(f"Failed to fetch {url}: {error}")
            return None

        if response.status_code != 200:
            print(f"Failed to fetch {url}: HTTP {response.status_code}")
            return None

        # Decode with the auto-detected encoding; detection can yield None
        # (e.g. for an empty body), so fall back to UTF-8 in that case
        encoding = response.apparent_encoding or "utf-8"
        html_content = response.content.decode(encoding, errors="replace")
        soup = BeautifulSoup(html_content, "lxml")

        # <section id="content"> is expected to be the main section of the page
        content = soup.find("section", id="content")
        if content is None:
            print("Warning: <section id='content'> not found. Using full page instead.")
            return soup

        soup = BeautifulSoup(str(content), "lxml")

        # <div class="body"> inside the section is expected to hold the talk text
        body = soup.find("div", class_="body")
        if body is None:
            print("Warning: <div class='body'> not found. Using <section id='content'> instead.")
            return soup

        return BeautifulSoup(str(body), "lxml")

    def _parse_talk(self, soup):
        """Returns the text of all <p> tags in ``soup``, one paragraph per line.

        The first <footer> (which holds the notes rather than spoken content)
        is removed beforehand. Paragraphs nested inside an <li> are prefixed
        with "- ".
        """

        footer = soup.find("footer")
        if footer:
            footer.extract()

        paragraphs = []
        for p in soup.find_all("p"):
            text = p.get_text()
            if p.find_parent("li") is not None:
                text = f"- {text}" # Mark list items
            paragraphs.append(text)

        return "\n".join(paragraphs)

    def _save_talk_content(self, filename, content):
        """Writes ``content`` to ``BASE_FOLDER/<speaker>/<filename>`` as UTF-8.

        The speaker folder is created on demand so saving also works when the
        placeholder files were never prepared.
        """

        folder = os.path.join(BASE_FOLDER, self.name)
        os.makedirs(folder, exist_ok=True)

        with open(os.path.join(folder, filename), "w", encoding="utf-8") as file:
            file.write(content)
        

In [None]:
# Build the speaker index: for every slug in NAMES, fetch the speaker's page
# and store the talk links and target filenames in scraper.info.
scraper = SpeakerScraper(NAMES)
scraper.scrape()
print("\nSpeaker scraping completed.")

Scraping russell-m-nelson...
Scraping dallin-h-oaks...
Scraping henry-b-eyring...
Scraping jeffrey-r-holland...
Scraping dieter-f-uchtdorf...
Scraping david-a-bednar...
Scraping quentin-l-cook...
Scraping d-todd-christofferson...
Scraping neil-l-andersen...
Scraping ronald-a-rasband...
Scraping gary-e-stevenson...
Scraping dale-g-renlund...
Scraping gerrit-w-gong...
Scraping ulisses-soares...
Scraping patrick-kearon...
Speaker scraping completed.


In [6]:
# Create speakers/<name>/ and one empty .txt per talk (needs scraper.info from scraper.scrape()).
scraper.prep()

Created 113 text files in 'russell-m-nelson' folder.
Created 98 text files in 'dallin-h-oaks' folder.
Created 112 text files in 'henry-b-eyring' folder.
Created 62 text files in 'jeffrey-r-holland' folder.
Created 73 text files in 'dieter-f-uchtdorf' folder.
Created 41 text files in 'david-a-bednar' folder.
Created 37 text files in 'quentin-l-cook' folder.
Created 40 text files in 'd-todd-christofferson' folder.
Created 37 text files in 'neil-l-andersen' folder.
Created 25 text files in 'ronald-a-rasband' folder.
Created 22 text files in 'gary-e-stevenson' folder.
Created 21 text files in 'dale-g-renlund' folder.
Created 16 text files in 'gerrit-w-gong' folder.
Created 19 text files in 'ulisses-soares' folder.
Created 5 text files in 'patrick-kearon' folder.


In [7]:
# Alias for the scraped index: {speaker slug: {"filenames": [...], "links": [...]}}.
speakers = scraper.info

In [None]:
# Download the talks of every speaker, one speaker at a time.
# Requires `speakers` (the index built by SpeakerScraper) to contain every slug in NAMES.
for name in NAMES:
    print(f"Scraping talks for {name}...")
    talk_scraper = TalkScraper(name, speakers[name])
    talk_scraper.scrape()
    print(f"Talk scraping for {name} completed.\n")
    time.sleep(1)  # Extra pause between speakers

Scraping talks for russell-m-nelson...
Scraping https://www.churchofjesuschrist.org/study/general-conference/1984/04/call-to-the-holy-apostleship...
Scraping https://www.churchofjesuschrist.org/study/general-conference/1984/10/protect-the-spiritual-power-line...
Scraping https://www.churchofjesuschrist.org/study/general-conference/1985/04/reverence-for-life...
Scraping https://www.churchofjesuschrist.org/study/general-conference/1985/10/self-mastery...
Scraping https://www.churchofjesuschrist.org/study/general-conference/1986/04/in-the-lords-own-way...
Scraping https://www.churchofjesuschrist.org/study/general-conference/1986/10/joy-cometh-in-the-morning...
Scraping https://www.churchofjesuschrist.org/study/general-conference/1987/04/life-after-life...
Scraping https://www.churchofjesuschrist.org/study/general-conference/1987/10/keys-of-the-priesthood...
Scraping https://www.churchofjesuschrist.org/study/general-conference/1987/10/lessons-from-eve...
Scraping https://www.churchofjesusc

In [None]:
def combine(name, base_folder=None):
    """Merges all text files in a speaker's folder into a single file.

    Every ``.txt`` file in ``<base_folder>/<name>/`` is read in sorted
    filename order, its first two lines are dropped, and the remainders are
    joined with a blank line and written to ``<base_folder>/<name>.txt``.

    Args:
        name: Speaker slug; also the name of the folder holding the talks.
        base_folder: Directory containing the speaker folders. Defaults to
            the module-level ``BASE_FOLDER``.

    Files with fewer than two lines (for example empty placeholders) are
    skipped with a warning rather than raising ``StopIteration``.
    """

    if base_folder is None:
        base_folder = BASE_FOLDER

    # Mark the folder with files to be merged
    folder = os.path.join(base_folder, name)

    merged_content = []

    # os.listdir() returns entries in arbitrary order; sort so the merged file
    # is deterministic (and chronological for "<year><a|o>c_<title>.txt" names)
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".txt"):
            continue

        with open(os.path.join(folder, filename), "r", encoding="utf-8") as file:
            # The first two lines are the speaker's name and their calling.
            # Since they are repeated each time, skip them.
            first_line = file.readline()
            second_line = file.readline()

            # readline() returns "" at end of file, i.e. the header is incomplete
            if not first_line or not second_line:
                print(f"Warning: skipping '{filename}' (fewer than two lines).")
                continue

            # Read the rest of the file and append it to the merged content list
            merged_content.append(file.read())

    # Write the merged content to a new file
    with open(os.path.join(base_folder, f"{name}.txt"), "w", encoding="utf-8") as merged_file:
        merged_file.write("\n\n".join(merged_content)) # Separate each talk with two newlines

In [22]:
# Produce one merged <BASE_FOLDER>/<name>.txt per speaker from the per-talk files.
for name in NAMES:
    combine(name)