In [1]:
import requests
from bs4 import BeautifulSoup, Comment
import csv

# Function to scrape story URLs from a page
def scrape_story_urls(url, timeout=30):
    """Collect the absolute URLs of every story linked from a listing page.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the whole
            crawl indefinitely.

    Returns:
        A list of story URLs, in the order their links appear on the page.
        A link qualifies when its ``href`` contains both ``/cbes/`` and
        ``?Route=stories``.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            times out.
    """
    # Local import keeps this function self-contained within the file.
    from urllib.parse import urljoin

    site_root = "https://www.duchas.ie"
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    # urljoin leaves already-absolute hrefs untouched and resolves
    # root-relative ones against the site root; plain string concatenation
    # would produce a malformed URL for an absolute href.
    return [
        urljoin(site_root, anchor["href"])
        for anchor in soup.find_all("a", href=True)
        if "/cbes/" in anchor["href"] and "?Route=stories" in anchor["href"]
    ]

# Function to scrape story content from a page
def scrape_story_content(url, timeout=30):
    """Download a story page and return its longest transcription text.

    The page is searched for HTML comments containing ``HTML_TAG_START``;
    the text of each such comment's parent element is treated as a
    candidate transcription.

    Args:
        url: Address of the story page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The longest candidate text, or ``None`` when the page contains no
        matching comment.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    def is_start_marker(node):
        # Only comment nodes count; ordinary text mentioning the marker does not.
        return isinstance(node, Comment) and "HTML_TAG_START" in node

    markers = soup.find_all(string=is_start_marker)

    # A space separator keeps words from adjacent text nodes (e.g. either
    # side of a <br> or in neighbouring <p> tags) from being fused together,
    # which is what get_text(strip=True) does when no separator is given.
    candidates = [
        marker.parent.get_text(separator=" ", strip=True)
        for marker in markers
        if marker.parent is not None
    ]

    # Several elements may carry the marker; the longest text is taken to be
    # the full transcription.
    return max(candidates, key=len, default=None)

# Function to scrape story title from a page
def scrape_story_title(url, timeout=30):
    """Download a page and return the text of its ``<title>`` element.

    Args:
        url: Address of the story page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The title text with surrounding whitespace removed, or ``None`` when
        the page has no ``<title>`` element.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    title_element = soup.find("title")
    if title_element is None:
        return None
    return title_element.text.strip()

# Scrape story URLs from all listing pages
all_story_urls = []
for page in range(1, 170):
    base_url = f"https://www.duchas.ie/en/cbes/stories?TopicID=4427742&Page={page}&PerPage=20&LanguageID=ga&Transcribed=true"
    print(f"Scraping story URLs from page {page}: {base_url}")
    try:
        story_urls = scrape_story_urls(base_url)
    except requests.RequestException as exc:
        # One unreachable listing page must not throw away the URLs that
        # were already collected from earlier pages.
        print(f"Failed to scrape story URLs from page {page}: {exc}")
        continue
    all_story_urls.extend(story_urls)

# Drop repeated URLs while keeping first-seen order, so a story linked more
# than once is downloaded and written only once.
all_story_urls = list(dict.fromkeys(all_story_urls))

# Prepare CSV file
csv_file = 'duchas_stories_irish.csv'
csv_columns = ['Title', 'Transcription']

# Open CSV file for writing
with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
    writer.writeheader()

    # Scrape transcription and title for each URL and write to CSV
    for url in all_story_urls:
        print("Scraping transcription and title for:", url)
        try:
            title = scrape_story_title(url)
            transcription = scrape_story_content(url)
        except requests.RequestException as exc:
            # Skip this story but keep going: rows already written stay
            # valid and the remaining stories are still attempted.
            print(f"Failed to scrape {url}: {exc}")
            continue
        # Rows are only written when both fields were found.
        if title and transcription:
            writer.writerow({'Title': title, 'Transcription': transcription})

print(f"Data has been written to {csv_file}")


Scraping story URLs from page 1: https://www.duchas.ie/en/cbes/stories?TopicID=4427742&Page=1&PerPage=20&LanguageID=ga&Transcribed=true
Scraping story URLs from page 2: https://www.duchas.ie/en/cbes/stories?TopicID=4427742&Page=2&PerPage=20&LanguageID=ga&Transcribed=true
Scraping story URLs from page 3: https://www.duchas.ie/en/cbes/stories?TopicID=4427742&Page=3&PerPage=20&LanguageID=ga&Transcribed=true
Scraping story URLs from page 4: https://www.duchas.ie/en/cbes/stories?TopicID=4427742&Page=4&PerPage=20&LanguageID=ga&Transcribed=true
Scraping story URLs from page 5: https://www.duchas.ie/en/cbes/stories?TopicID=4427742&Page=5&PerPage=20&LanguageID=ga&Transcribed=true
Scraping story URLs from page 6: https://www.duchas.ie/en/cbes/stories?TopicID=4427742&Page=6&PerPage=20&LanguageID=ga&Transcribed=true
Scraping story URLs from page 7: https://www.duchas.ie/en/cbes/stories?TopicID=4427742&Page=7&PerPage=20&LanguageID=ga&Transcribed=true
Scraping story URLs from page 8: https://www.duc