In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import datetime as dt
import cache_magic

%cache magic is now registered in ipython


### List of Mumsnet thread URLs

In [3]:
# One entry per PlusWord thread; every URL ends in '?page=' so that a page
# number can be appended directly to it.
url_list = [
    'https://www.mumsnet.com/talk/am_i_being_unreasonable/4676538-if-you-like-wordle-plusword-is-even-better-thread-4?page=',
    'https://www.mumsnet.com/talk/_chat/4714295-plusword-new-thread-1?page=',
    'https://www.mumsnet.com/talk/_chat/4765702-plusword-new-thread-2?page=',
]

## Scraper

In [4]:
def scraper(url, max_pages, whole_post_list):
    """Scrape the posts of one forum thread, page by page.

    For every page the original post (OP) block is parsed first, then every
    reply block. Each post becomes one list: its metadata tokens (with the
    token at index 1 removed) followed by the post text joined into a single
    space-separated string.

    Parameters
    ----------
    url : str
        Thread URL to which the page number is appended (the notebook passes
        URLs ending in ``?page=``).
    max_pages : int
        Number of page requests to make; page numbers run from ``0`` to
        ``max_pages - 1``.
    whole_post_list : list
        Accumulator; it is extended in place with one list per scraped post.

    Returns
    -------
    list
        The same ``whole_post_list`` object that was passed in.

    Notes
    -----
    The scrape is best-effort: a page whose request fails, or whose markup
    does not contain the expected elements, is skipped from the point of
    failure onwards. Any other exception (including ``KeyboardInterrupt``)
    propagates instead of being silently swallowed.
    """
    # CSS class strings identifying the OP container and the reply containers.
    op_class = 'p-4 pb-1 pt-2.5 lg:py-2.5 mt-2.5 lg:mt-1.5 border-t border-b sm:border sm:rounded border-mumsnet-forest-border bg-mumsnet-forest dark:bg-mumsnet-forest-dark'
    reply_class = 'lg:py-2.5 pt-2.5 pb-1 p-4 border-t border-b sm:border sm:rounded mt-1.5 overflow-x-hidden bg-white dark:bg-gray-800 border-gray-200'

    # NOTE(review): numbering starts at 0, as in the original loop - confirm
    # how the site treats page=0 (it may duplicate or precede page 1).
    for page_number in range(max_pages):

        try:
            # A timeout stops one stalled connection from hanging the whole run.
            r = requests.get(url + str(page_number), timeout=30)
            soup = BeautifulSoup(r.content)

            # The OP block is looked up on every page; its first paragraph
            # holds the metadata and its second the post text.
            original_post = soup.find_all('div', class_=op_class)
            original_post_paragraphs = original_post[0].find_all('p')

            meta_data = original_post_paragraphs[0].getText().split()
            # Drop the token at index 1 (a stray full stop in the page markup).
            meta_data.pop(1)

            # Collapse all whitespace in the post text to single spaces.
            post_text = ' '.join(original_post_paragraphs[1].getText().split())
            meta_data.append(post_text)
            whole_post_list.append(meta_data)

            # Replies: the first four whitespace-separated tokens are metadata,
            # everything after them is the post text.
            for post in soup.find_all('div', class_=reply_class):
                post_info = post.getText().split()

                meta_data = post_info[:4]
                # Same stray full stop as in the OP metadata.
                meta_data.pop(1)

                meta_data.append(' '.join(post_info[4:]))
                whole_post_list.append(meta_data)

        except (requests.RequestException, IndexError):
            # Expected failure modes only: network/HTTP errors, or a page
            # lacking the expected elements (e.g. a page number beyond the end
            # of the thread). Skip the rest of this page and carry on.
            continue

    return whole_post_list

## Scraper initialization and df generation

In [5]:
# maximum number of pages in thread
max_posts = 41

# scraper() appends to the list it is given and returns that same list, so a
# single accumulator is threaded through every thread URL in turn.
whole_post_list = []
for url in url_list:
    whole_post_list = scraper(url, max_posts, whole_post_list)

# One row per scraped post.
df = pd.DataFrame(data=whole_post_list, columns=['user', 'date', 'time', 'text'])

### Converts 'Today' and 'Yesterday' to date values, then creates and sorts by timestamp

In [6]:
# Swap the forum's relative day labels for concrete dd/mm/YYYY dates, resolved
# against the moment this cell is executed.
df['date'] = (
    df['date']
    .str.replace('Yesterday', (dt.datetime.today() - dt.timedelta(days=1)).strftime('%d/%m/%Y'))
    .str.replace('Today', dt.datetime.today().strftime('%d/%m/%Y'))
)

# Posting time is HH:MM; append ':00' seconds so it parses with the full format.
df['timestamp'] = pd.to_datetime(
    df['date'] + ' ' + df['time'] + ':00',
    format='%d/%m/%Y %H:%M:%S',
)

# Oldest post first.
df = df.sort_values(by='timestamp')

Unnamed: 0,user,date,time,text,timestamp
0,ILoveAllRainbowsx,13/11/2022,14:18,Previous thread: www.mumsnet.com/talk/am_i_bei...,2022-11-13 14:18:00
436,ILoveAllRainbowsx,13/11/2022,14:18,Previous thread: www.mumsnet.com/talk/am_i_bei...,2022-11-13 14:18:00
412,ILoveAllRainbowsx,13/11/2022,14:18,Previous thread: www.mumsnet.com/talk/am_i_bei...,2022-11-13 14:18:00
847,ILoveAllRainbowsx,13/11/2022,14:18,Previous thread: www.mumsnet.com/talk/am_i_bei...,2022-11-13 14:18:00
389,ILoveAllRainbowsx,13/11/2022,14:18,Previous thread: www.mumsnet.com/talk/am_i_bei...,2022-11-13 14:18:00
...,...,...,...,...,...
2279,Floralnomad,17/04/2023,01:30,00:54 nice one today Add message Save Share Re...,2023-04-17 01:30:00
2281,DadDadDad,17/04/2023,07:08,1:27 for me. Took a while to get started on th...,2023-04-17 07:08:00
2282,Albaniarocks,17/04/2023,07:21,⏱️ I just completed PlusWord in 00:52 www.tele...,2023-04-17 07:21:00
2283,JoyDivisionOvenGlovesx,17/04/2023,07:41,⏱️ I just completed PlusWord in 01:12 Another ...,2023-04-17 07:41:00


### Extracts the first solve time (m:ss or mm:ss) from each post's text and prefixes `00:` so it reads as hh:mm:ss

In [7]:
# Keep only the first m:ss / mm:ss token found in each post; posts without one
# become NaN and are dropped.
df = (
    df.assign(text=df['text'].str.extract(r'(\d*\d:\d\d)', expand=False))
      .dropna(subset='text')
      .copy()
)

# Zero-pad single-digit minutes (m:ss -> 0m:ss), then add an hours field.
df['text'] = '00:' + df['text'].str.replace(r'(^\d:\d\d)', r'0\1', regex=True)

### Drops duplicate entries for users on same date, drops columns and renames

In [10]:
# One row per user per date (the first occurrence is kept), then swap the
# posting date/time columns for the extracted solve time, exposed as 'time'.
df = (
    df.drop_duplicates(subset=['user', 'date'])
      .drop(columns=['date', 'time'])
      .rename(columns={'text': 'time'})
      .loc[:, ['timestamp', 'user', 'time']]
)

In [11]:
# Display the cleaned frame (columns: timestamp, user, time) as the cell output
df

Unnamed: 0,timestamp,user,time
949,2022-11-13 17:05:00,BrilliantGreenFlamingo,00:04:03
925,2022-11-13 17:20:00,MarmiteWine,00:01:41
39,2022-11-14 00:09:00,Drywhitefruitycidergin,00:02:19
956,2022-11-14 00:51:00,Floralnomad,00:01:21
16,2022-11-14 02:48:00,Sunbird24,00:00:53
...,...,...,...
2279,2023-04-17 01:30:00,Floralnomad,00:00:54
2281,2023-04-17 07:08:00,DadDadDad,00:01:27
2282,2023-04-17 07:21:00,Albaniarocks,00:00:52
2283,2023-04-17 07:41:00,JoyDivisionOvenGlovesx,00:01:12


### Writes the data to a timestamped CSV file

In [23]:
# Stamp the filename with an explicit, filesystem-safe format: str(datetime.now())
# contains spaces and colons, and colons are not valid in Windows file names.
df.to_csv('data/historical_mumsnet_data_' + dt.datetime.now().strftime('%Y-%m-%d_%H%M%S') + '.csv', index=False)