In [1]:
import time
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [2]:
def get_urls():
    """ Historic list of urls we're scraping from as well as the current one. Needs to be changed when a new thread
    is made"""

    url_list = [
        #'https://www.mumsnet.com/talk/am_i_being_unreasonable/4676538-if-you-like-wordle-plusword-is-even-better-thread-4?page=',
        #'https://www.mumsnet.com/talk/_chat/4714295-plusword-new-thread-1?page=',
        'https://www.mumsnet.com/talk/_chat/4765702-plusword-new-thread-2?page=']

    return url_list

In [None]:
def post_to_text_converter(post, whole_post_list):
    
    # converts to list
    meta_data = post[0].getText().split()

    # removes fullstops in position 1
    meta_data.pop(1)

    # converts text to list and then joins items together
    post_text = original_post_paragraphs[1].getText().split()
    post_text = ' '.join(post_text)

    # Adds OP metadata and text together and adds together for OP on every page
    meta_data.append(post_text)
    whole_post = meta_data
    whole_post_list.append(whole_post)
    
    return whole_post_list

In [3]:
def original_scraper(url_list):
    """ Loops over all the different types of posts on the mumsnet website, accessing the text values. Then appends
    them to a list which is finally converted to a dataframe and returned"""

    whole_post_list = []

    # maximum number of pages in thread
    max_pages = 41

    # html class of original post from the thread
    first_post_class = 'p-4 pb-1 pt-2.5 lg:py-2.5 mt-2.5 lg:mt-1.5 border-t border-b sm:border sm:rounded ' \
                       'border-mumsnet-forest-border bg-mumsnet-forest dark:bg-mumsnet-forest-dark'

    # html class of a normal post from the thread
    normal_reply_class = 'lg:py-2.5 pt-2.5 pb-1 p-4 border-t border-b sm:border sm:rounded mt-1.5 overflow-x-hidden ' \
                         'bg-white dark:bg-gray-800 border-gray-200'

    # html class of a post from the thread creator
    original_poster_reply_class = 'lg:py-2.5 pt-2.5 pb-1 p-4 border-t border-b sm:border sm:rounded mt-1.5 ' \
                                  'overflow-x-hidden bg-mumsnet-forest dark:bg-mumsnet-forest-dark ' \
                                  'border-mumsnet-forest-border'

    for url in url_list:

        # Increments through every page on website until it runs out for hits max_pages
        for page_number in range(max_pages):

            try:

                # gets request via bs4
                r = requests.get(url + str(page_number))
                soup = BeautifulSoup(r.content, features="html5lib")
                
                # Finds original post on first page and splits it into metadata and post text
                original_post = soup.find_all('div', class_=first_post_class)
                original_post_paragraphs = original_post[0].find_all('p')

                # converts to list
                meta_data = original_post_paragraphs[0].getText().split()

                # removes fullstops in position 1
                meta_data.pop(1)

                # converts text to list and then joins items together
                post_text = original_post_paragraphs[1].getText().split()
                post_text = ' '.join(post_text)

                # Adds OP metadata and text together and adds together for OP on every page
                meta_data.append(post_text)
                whole_post = meta_data
                whole_post_list.append(whole_post)

                # finds all non-OP post on page and gets data
                posts = soup.find_all('div', class_=[normal_reply_class, original_poster_reply_class])

                for post in posts:
                    post_info = post.getText().split()

                    # first 4 items are meta data
                    meta_data = post_info[:4]

                    # removes unneeded full stop
                    meta_data.pop(1)

                    # joins post text together
                    post_text = post_info[4:]
                    post_text = ' '.join(post_text)

                    # appends metadata and text together and adds to list
                    meta_data.append(post_text)
                    whole_post = meta_data
                    whole_post_list.append(whole_post)

            except Exception as e:
                print(e)
            pass

    df = pd.DataFrame(whole_post_list, columns=['user', 'date', 'time', 'text'])

    return df

In [4]:
def modified_scraper(url_list):
    """ Loops over all the different types of posts on the mumsnet website, accessing the text values. Then appends
    them to a list which is finally converted to a dataframe and returned"""

    whole_post_list = []

    # maximum number of pages in thread
    max_pages = 41

    # html class of original post from the thread
    first_post_class = 'p-4 pb-1 pt-2.5 lg:py-2.5 mt-2.5 lg:mt-1.5 border-t border-b sm:border sm:rounded ' \
                       'border-mumsnet-forest-border bg-mumsnet-forest dark:bg-mumsnet-forest-dark'

    # html class of a normal post from the thread
    normal_reply_class = 'lg:py-2.5 pt-2.5 pb-1 p-4 border-t border-b sm:border sm:rounded mt-1.5 overflow-x-hidden ' \
                         'bg-white dark:bg-gray-800 border-gray-200'

    # html class of a post from the thread creator
    original_poster_reply_class = 'mt-2.5 leading-6 prose prose-bullets-dark w-full break-words dark:text-white'

    for url in url_list:

        # Increments through every page on website until it runs out for hits max_pages
        for page_number in range(max_pages):

            try:

                # gets request via bs4
                r = requests.get(url + str(page_number))
                soup = BeautifulSoup(r.content, features="html5lib")
                
                if page_number == 1:

                    # Finds original post on first page and splits it into metadata and post text
                    original_post = soup.find_all('div', class_=first_post_class)
                    original_post = original_post[0].find_all('p')
                    
                    whole_post_list = post_to_text_converter(original_post, whole_post_list)

                # finds all non-OP post on page and gets data
                posts = soup.find_all('div', class_=[normal_reply_class, original_poster_reply_class])

                for post in posts:
                    post_info = post.getText().split()

                    # first 4 items are meta data
                    meta_data = post_info[:4]

                    # removes unneeded full stop
                    meta_data.pop(1)

                    # joins post text together
                    post_text = post_info[4:]
                    post_text = ' '.join(post_text)

                    # appends metadata and text together and adds to list
                    meta_data.append(post_text)
                    whole_post = meta_data
                    whole_post_list.append(whole_post)

            except Exception as e:
                print(e)
            pass

    df = pd.DataFrame(whole_post_list, columns=['user', 'date', 'time', 'text'])

    return df

In [5]:
url_list = get_urls()
original_start = time.time()
original_df = original_scraper(url_list)
original_end = time.time()
original_time = original_end - original_start

In [6]:
original_time

34.69722604751587

In [7]:
url_list = get_urls()
modified_start = time.time()
modified_df = modified_scraper(url_list)
modified_end = time.time()
modified_time = modified_end - modified_start

In [8]:
modified_time

34.45286321640015

In [9]:
original_df

Unnamed: 0,user,date,time,text
0,Sunbird24,18/03/2023,07:29,Previous thread: www.mumsnet.com/talk/_chat/47...
1,bruffin,19/03/2023,19:16,marking my spot Add message Save Share Report ...
2,MarmiteWine,19/03/2023,20:38,00:45 today Add message Save Share Report Book...
3,Drywhitefruitycidergin,20/03/2023,00:54,⏱️ I just completed PlusWord in 02:47 www.tele...
4,Drywhitefruitycidergin,20/03/2023,00:55,*thread ffs - that's why I'm so slow at pw too...
...,...,...,...,...
1059,sanityisamyth,21/03/2023,05:44,⏱️ I just completed PlusWord in 01:03 www.tele...
1060,Drywhitefruitycidergin,21/03/2023,06:24,⏱️ I just completed PlusWord in 04:04 www.tele...
1061,DadDadDad,21/03/2023,07:04,1:27 for me today. Add message Save Share Repo...
1062,Madcats,21/03/2023,09:40,It took me a while to understand the answer to...


In [10]:
modified_df

Unnamed: 0,user,date,time,text
0,bruffin,19/03/2023,19:16,marking my spot Add message Save Share Report ...
1,MarmiteWine,19/03/2023,20:38,00:45 today Add message Save Share Report Book...
2,Drywhitefruitycidergin,20/03/2023,00:54,⏱️ I just completed PlusWord in 02:47 www.tele...
3,Drywhitefruitycidergin,20/03/2023,00:55,*thread ffs - that's why I'm so slow at pw too...
4,sanityisamyth,20/03/2023,06:43,⏱️ I just completed PlusWord in 06:33 www.tele...
...,...,...,...,...
1019,sanityisamyth,21/03/2023,05:44,⏱️ I just completed PlusWord in 01:03 www.tele...
1020,Drywhitefruitycidergin,21/03/2023,06:24,⏱️ I just completed PlusWord in 04:04 www.tele...
1021,DadDadDad,21/03/2023,07:04,1:27 for me today. Add message Save Share Repo...
1022,Madcats,21/03/2023,09:40,It took me a while to understand the answer to...
