In [7]:
!pip install requests beautifulsoup4 pandas




In [18]:
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Step 1: The talks to scrape -- one full page URL per entry, in the order
# the rows should appear in the final table.
talk_urls = [
    "https://www.churchofjesuschrist.org/study/general-conference/2025/04/41bednar?lang=eng",
    "https://www.churchofjesuschrist.org/study/general-conference/2025/04/47oaks?lang=eng",
    "https://www.churchofjesuschrist.org/study/general-conference/2025/04/57nelson?lang=eng",
    "https://www.churchofjesuschrist.org/study/general-conference/2025/04/13holland?lang=eng",
    "https://www.churchofjesuschrist.org/study/general-conference/2025/04/18eyring?lang=eng",
    "https://www.churchofjesuschrist.org/study/general-conference/2025/04/27uchtdorf?lang=eng",
]

# Step 2: Parallel accumulators -- index i of every list describes talk i.
# The generator yields a fresh list each time, so the five names never alias.
titles, speakers, roles, contents, sources = ([] for _ in range(5))

# Step 3: Fetch and parse every talk, appending exactly one entry per URL to
# each result list so that the lists stay index-aligned.

# Seconds to wait on the server before giving up. Without a timeout,
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30

# Placeholder stored for any field that could not be scraped.
MISSING = "N/A"


def _tag_text(tag, default=MISSING):
    """Return the whitespace-normalised text of ``tag``, or ``default`` if ``tag`` is None.

    ``get_text(strip=True)`` strips every text fragment on its own and joins
    the fragments with an empty separator, which glues together the words on
    either side of inline tags (links, ``<em>``, ...). Reading the raw text
    and collapsing runs of whitespace keeps the original word spacing.
    """
    if tag is None:
        return default
    return ' '.join(tag.get_text().split())


def _scrape_talk(url):
    """Download one talk page and return ``(title, speaker, role, content)``.

    Fields whose element is absent from the page come back as ``MISSING``;
    ``content`` comes back as an empty string when no body paragraphs are found.

    Raises:
        requests.RequestException: on connection problems, timeouts, or a
            4xx/5xx response (via ``raise_for_status``).
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were a talk.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Title comes from the Open Graph metadata; tolerate a tag without `content`.
    meta_title = soup.find('meta', property='og:title')
    title = meta_title.get('content', MISSING) if meta_title is not None else MISSING

    speaker = _tag_text(soup.find('p', class_='author-name'))
    role = _tag_text(soup.find('p', class_='author-role'))

    # Body text: every <p> inside the first `body-block` div, one per line.
    content_div = soup.find('div', class_='body-block')
    paragraphs = content_div.find_all('p') if content_div is not None else []
    content = '\n'.join(_tag_text(p) for p in paragraphs)

    return title, speaker, role, content


for url in talk_urls:
    print("Scraping:", url)
    try:
        title, speaker, role, content = _scrape_talk(url)
    except Exception as e:
        # Deliberately broad: one bad page must not abort the whole run.
        # Record placeholders so index alignment across the lists stays correct.
        print(f"❌ Error scraping {url}: {e}")
        title = speaker = role = content = MISSING
    else:
        if not content:
            print("⚠️ Warning: Content not found for", url)
            # Mark missing content the same way as every other missing field.
            content = MISSING

    titles.append(title)
    speakers.append(speaker)
    roles.append(role)
    contents.append(content)
    sources.append(url)

# Step 4: Assemble the parallel lists into a single table, one row per URL.
# Column order follows the order of `column_names`.
column_names = ('title', 'speaker', 'speaker_role', 'content', 'source')
column_values = (titles, speakers, roles, contents, sources)
df = pd.DataFrame(dict(zip(column_names, column_values)))

# Step 5: Report how many rows were collected and preview up to the first ten
print("\n✅ Total talks scraped:", len(df))
print(df.iloc[:10])


Scraping: https://www.churchofjesuschrist.org/study/general-conference/2025/04/41bednar?lang=eng
Scraping: https://www.churchofjesuschrist.org/study/general-conference/2025/04/47oaks?lang=eng
Scraping: https://www.churchofjesuschrist.org/study/general-conference/2025/04/57nelson?lang=eng
Scraping: https://www.churchofjesuschrist.org/study/general-conference/2025/04/13holland?lang=eng
Scraping: https://www.churchofjesuschrist.org/study/general-conference/2025/04/18eyring?lang=eng
Scraping: https://www.churchofjesuschrist.org/study/general-conference/2025/04/27uchtdorf?lang=eng

✅ Total talks scraped: 6
                                               title  \
0             The Times of Restitution of All Things   
1                         Divine Helps for Mortality   
2                  Confidence in the Presence of God   
3                                  As a Little Child   
4                                “Draw Near unto Me”   
5  “By This All Will Know That You Are My Disciples”   

In [24]:
# Save scraped data to CSV
output_csv = "Data/general_conference_talks.csv"
# DataFrame.to_csv does not create missing folders and raises OSError if the
# target directory is absent, so make sure it exists before writing.
Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_csv, index=False)
print(f"✅ File saved successfully to {output_csv}")


✅ File saved successfully to Data/general_conference_talks.csv
