In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import datetime as dt
import json
import pymongo
from plotting_streamlit import data_import, scraper, get_db_client, data_export

### List of mumsnet urls

In [2]:
# Thread URLs to scrape. Each one ends in '?page=' so that the scraper can
# append a page number. Commented-out entries are other PlusWord threads that
# are currently disabled.
url_list = [
    # 'https://www.mumsnet.com/talk/am_i_being_unreasonable/4676538-if-you-like-wordle-plusword-is-even-better-thread-4?page=',
    # 'https://www.mumsnet.com/talk/_chat/4714295-plusword-new-thread-1?page=',
    'https://www.mumsnet.com/talk/_chat/4765702-plusword-new-thread-2?page=',
]
            

In [4]:
def scraper(url, max_pages, whole_post_list):
    """Scrape every post from the pages of a Mumsnet thread.

    NOTE: this definition shadows the ``scraper`` name imported from
    ``plotting_streamlit`` in the notebook's import cell.

    Parameters
    ----------
    url : str
        Thread URL ending in ``?page=``; the page number is appended to it.
    max_pages : int
        Number of pages to request. Pages ``0 .. max_pages - 1`` are fetched.
        NOTE(review): if the site numbers pages from 1, the last page of a
        thread with exactly ``max_pages`` pages is never requested - confirm.
    whole_post_list : list
        Accumulator that the scraped posts are appended to (mutated in place).

    Returns
    -------
    list
        The same ``whole_post_list`` object, with one
        ``[user, date, time, text]`` list appended per scraped post.

    A page that cannot be fetched or parsed is reported and skipped so that one
    bad page does not abort the whole scrape. Posts appended before the failure
    on that page are kept.
    """
    # Exact class attribute of the thread's original post (OP) container.
    op_class = 'p-4 pb-1 pt-2.5 lg:py-2.5 mt-2.5 lg:mt-1.5 border-t border-b sm:border sm:rounded border-mumsnet-forest-border bg-mumsnet-forest dark:bg-mumsnet-forest-dark'
    # Exact class attributes of the non-OP post containers.
    reply_classes = ['lg:py-2.5 pt-2.5 pb-1 p-4 border-t border-b sm:border sm:rounded mt-1.5 overflow-x-hidden bg-white dark:bg-gray-800 border-gray-200',
                     'lg:py-2.5 pt-2.5 pb-1 p-4 border-t border-b sm:border sm:rounded mt-1.5 overflow-x-hidden bg-mumsnet-forest dark:bg-mumsnet-forest-dark border-mumsnet-forest-border']

    # Increments through every page on the website until it runs out or hits max_pages
    for page_number in range(max_pages):

        try:
            # A timeout stops a stalled connection from hanging the notebook.
            r = requests.get(url + str(page_number), timeout=30)

            # An explicit parser keeps the result independent of which optional
            # parsers happen to be installed.
            soup = BeautifulSoup(r.content, 'html.parser')

            # Finds the original post on the page and splits it into metadata and text
            original_post = soup.find_all('div', class_=op_class)
            original_post_paragraphs = original_post[0].find_all('p')

            # First paragraph holds the metadata tokens; the token at position 1
            # is a full stop and is dropped.
            meta_data = original_post_paragraphs[0].getText().split()
            meta_data.pop(1)

            # Second paragraph holds the post text; whitespace is collapsed.
            post_text = ' '.join(original_post_paragraphs[1].getText().split())

            # The OP is recorded once for every page it appears on.
            meta_data.append(post_text)
            whole_post_list.append(meta_data)

            # Finds all non-OP posts on the page
            for post in soup.find_all('div', class_=reply_classes):
                post_info = post.getText().split()

                # First 4 tokens are metadata; the token at position 1 is an
                # unneeded full stop.
                meta_data = post_info[:4]
                meta_data.pop(1)

                # Everything after the metadata is the post text.
                meta_data.append(' '.join(post_info[4:]))
                whole_post_list.append(meta_data)

        except Exception as exc:
            # Best effort: skip this page but say why. Unlike a bare ``except``
            # this lets KeyboardInterrupt / SystemExit stop the scrape.
            print(f'Skipping page {page_number} of {url}: {exc!r}')

    return whole_post_list

## Scraper initialization and df generation

In [6]:
# maximum number of pages in thread
max_pages = 41

# Accumulate the posts of every thread into a single list; scraper() appends
# to the list it is given and hands it back.
whole_post_list = []
for url in url_list:
    whole_post_list = scraper(url, max_pages, whole_post_list)

# One row per scraped post.
df = pd.DataFrame(data=whole_post_list, columns=['user', 'date', 'time', 'text'])
df

Unnamed: 0,user,date,time,text
0,Sunbird24,18/03/2023,07:29,Previous thread: www.mumsnet.com/talk/_chat/47...
1,bruffin,19/03/2023,19:16,marking my spot Add message Save Share Report ...
2,MarmiteWine,19/03/2023,20:38,00:45 today Add message Save Share Report Book...
3,Drywhitefruitycidergin,20/03/2023,00:54,⏱️ I just completed PlusWord in 02:47 www.tele...
4,Drywhitefruitycidergin,20/03/2023,00:55,*thread ffs - that's why I'm so slow at pw too...
...,...,...,...,...
1053,sanityisamyth,21/03/2023,05:44,⏱️ I just completed PlusWord in 01:03 www.tele...
1054,Drywhitefruitycidergin,21/03/2023,06:24,⏱️ I just completed PlusWord in 04:04 www.tele...
1055,DadDadDad,21/03/2023,07:04,1:27 for me today. Add message Save Share Repo...
1056,Madcats,21/03/2023,09:40,It took me a while to understand the answer to...


### Converts 'Today' and 'Yesterday' to date values, creates and sorts by timestamp

In [7]:
# Relative day labels are replaced by concrete dd/mm/YYYY strings, computed
# from the clock at the moment this cell runs.
date_format = '%d/%m/%Y'
yesterday_str = (dt.datetime.today() - dt.timedelta(days=1)).strftime(date_format)
today_str = dt.datetime.today().strftime(date_format)
df['date'] = df['date'].str.replace('Yesterday', yesterday_str).str.replace('Today', today_str)

# Build a 'dd/mm/YYYY HH:MM:00.000' string from the date and time columns,
# parse it into a datetime column, then order the posts chronologically.
load_ts_text = df['date'] + ' ' + df['time'] + ':00' + '.000'
df['load_ts'] = pd.to_datetime(load_ts_text, format='%d/%m/%Y %H:%M:%S.%f')
df = df.sort_values(by=['load_ts'])

### Extracts times from text and adds 00: to allow it to handle hours

In [8]:
# Keep only the first m:ss / mm:ss style token found in each post; posts
# without one become NaN and are dropped.
df['text'] = df['text'].str.extract(r'(\d*\d:\d\d)')
df = df.dropna(subset='text').copy()

# Left-pad a single-digit leading field with a zero, then prefix '00:' so the
# value carries an hours field.
padded_time = df['text'].str.replace(r'(^\d:\d\d)', r'0\1', regex=True)
df['text'] = '00:' + padded_time

### Drops duplicate entries for users on same date, drops columns and renames

In [9]:
# Keep the first row per (user, date) - rows are already sorted by load_ts -
# then drop the raw date/time columns, rename the extracted value to 'time'
# and fix the column order.
df = (
    df.drop_duplicates(subset=['user', 'date'])
    .drop(columns=['date', 'time'])
    .rename(columns={'text': 'time'})
    .loc[:, ['load_ts', 'time', 'user']]
)
df

### Loads in db data

In [None]:
# Rows already stored under 'Mumsnet_Times', with load_ts parsed into datetimes
# so it can be compared against the freshly scraped frame.
df_mums = data_import('Mumsnet_Times')
df_mums = df_mums.assign(
    load_ts=pd.to_datetime(df_mums['load_ts'], format='%Y-%m-%d %H:%M:%S.%f')
)
df_mums

### Filters out rows that are already in db

In [12]:
# A row counts as already stored when its (load_ts, user) pair is in the db frame.
# NOTE: df_mums stays indexed by these columns afterwards, so re-running this
# cell without reloading df_mums fails in set_index.
key_columns = ['load_ts', 'user']
df = df.set_index(key_columns)
df_mums = df_mums.set_index(key_columns)

already_stored = df.index.isin(df_mums.index)
df = df.loc[~already_stored].reset_index()

### Formats df

In [15]:
# Render load_ts as text and append a '.000' suffix.
# NOTE: re-running this cell appends the suffix again.
df['load_ts'] = df['load_ts'].astype('str') + '.000'
df = df.loc[:, ['load_ts', 'time', 'user']]
df

In [None]:
# Only export when there are new rows. The original guard tested an undefined
# name (`dataframe`), which raises NameError on a fresh kernel; the frame being
# exported is `df`.
if not df.empty:
    data_export(df)

Unnamed: 0,load_ts,time,user
