In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

def get_page(page_num, timeout=30):
    """ gets the html of a story library webpage
    
    Args:
        page_num (int): the page number of the library to get
        timeout (float): seconds to wait for the server before giving up
            (defaults to 30)
        
    Returns:
        html_str (string): the html content of the story library page, with
            every '&nbsp;' entity replaced by a plain space
    
    Raises:
        requests.exceptions.RequestException: if the request times out, the
            connection fails, or the server answers with an HTTP error status
    
    """
    
    # create url of story library page
    url = 'https://themoth.org/story-library/stories/p' + str(page_num)
    
    # requests waits indefinitely unless a timeout is given, so a stalled
    # connection would otherwise hang the whole scrape
    response = requests.get(url, timeout=timeout)
    
    # fail here on 4xx/5xx rather than handing an error page to the parsers
    response.raise_for_status()
    
    # get html text with replaced whitespace entity
    return response.text.replace('&nbsp;', ' ')

def get_page_count(html_str):
    """ gets the number of pages in the story library
    
    Args:
        html_str (string): the html content of a moth story library page
        
    Returns:
        page_count (int): the number of pages
    
    """
    
    # build soup object from story page html text; the parser is named
    # explicitly so the result does not depend on which optional parsers
    # happen to be installed (and bs4 does not warn about guessing one)
    soup = BeautifulSoup(html_str, 'html.parser')
    
    # get pagination string ("Showing page x of page_count")
    page_indicator = soup.find(class_='paginator').find(class_='center').text
    
    # extract page_count: the text that follows 'of'
    page_count_str = page_indicator.split('of')[1].strip()
    return int(page_count_str)

def clean_stories(html_str):
    """ web scrapes the moth stories
    
    Args:
        html_str (string): the html content of a moth story library page
        
    Returns:
        stories_df (pd.DataFrame): the dataframe of stories listed in the story library page,
            indexed by 'play_id' with columns 'title', 'speaker', 'length',
            'description' and 'play_link'; empty (but with the same index name
            and columns) when the page lists no stories
    
    """
    
    # column layout of the result (play_id becomes the index below)
    columns = ['title', 'speaker', 'length', 'description', 'play_id', 'play_link']
    
    # build soup object from story page html text (explicit parser keeps the
    # result independent of which optional parsers are installed)
    soup = BeautifulSoup(html_str, 'html.parser')

    # get all story detail items
    stories = soup.find_all(class_='story-content')

    # collect one record per story; the frame is built once at the end because
    # DataFrame.append was removed in pandas 2.0 and row-wise growth is quadratic
    records = []
    for story in stories:
        # get story details
        details = story.find(class_='details')
        
        # get desired details
        title = details.find('h6').text.strip()
        speaker = details.find(class_='serif').find(class_='pill').text.strip()
        length = details.find('h5').text.strip()
        
        # description is the first non-empty paragraph (excluding .serif ones),
        # or '' when there is none
        paragraph_texts = (p.text.strip() for p in details.select('p:not(.serif)'))
        description = next((text for text in paragraph_texts if text != ''), '')
                    
        # get link to play
        play_id = story.find(class_='actions').find('span').get('data-player-play')
        play_link = 'https://player.themoth.org/#/?actionType=ADD_AND_PLAY&storyId=' + play_id
        
        records.append({
            'title': title,
            'speaker': speaker,
            'length': length,
            'description': description,
            'play_id': play_id,
            'play_link': play_link
        })
        
    # passing the columns explicitly keeps 'play_id' available even when the
    # page has no stories, so set_index cannot fail on an empty frame
    stories_df = pd.DataFrame(records, columns=columns)
    stories_df = stories_df.set_index('play_id')
    return stories_df

In [2]:
def get_all_stories():
    """ web scrapes all moth stories
        
    Returns:
        library_df (pd.DataFrame): the dataframe of stories listed in the moth story library,
            with duplicate index entries dropped (first occurrence kept)
    
    """

    # get first page of stories
    first_page = get_page(1)

    # get number of pages in library
    page_count = get_page_count(first_page)

    # initialize list of stories_df (per library page) with the first page
    page_stories = [clean_stories(first_page)]

    # get stories of remaining pages: page 1 is already scraped, and pages are
    # numbered from 1, so the last page is page_count itself
    for page_num in range(2, page_count + 1):
        page = get_page(page_num)
        page_stories.append(clean_stories(page))

    # combine into one df
    library_df = pd.concat(page_stories)

    # remove duplicates - stories are sometimes listed twice at moth website
    library_df = library_df[~library_df.index.duplicated(keep='first')]
    
    return library_df

In [3]:
# scrape every page of the story library into one dataframe
library_df = get_all_stories()

# persist the scraped library next to the notebook
library_df.to_csv(path_or_buf='library.csv')

# show the first five rows as the cell output
library_df.head(n=5)

Unnamed: 0_level_0,title,speaker,length,description,play_link
play_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
32552,Keys to the Castle,Tracey Croisier,09:10,Tracey Crosier interviews for a radio job and ...,https://player.themoth.org/#/?actionType=ADD_A...
32544,The Great Qureshi Car Chase,Omar Qureshi,12:31,Omar Qureshi tries to show his cousin a good t...,https://player.themoth.org/#/?actionType=ADD_A...
32536,My Father's Green Chevrolet,Juliette Holmes,12:19,Juliette Holmes experiences segregation firsth...,https://player.themoth.org/#/?actionType=ADD_A...
32533,"See One, Do One",Jennifer Leahy,06:47,Jennifer Leahy deals with the death of a patie...,https://player.themoth.org/#/?actionType=ADD_A...
32452,Fun-raiser,Michael Corso,15:41,Michael Corso enters a stock car race for blin...,https://player.themoth.org/#/?actionType=ADD_A...
