In [1]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import spacy
from spacy.matcher import Matcher
import json
import os
import re
import warnings
warnings.filterwarnings("ignore")


In [2]:
nlp = spacy.load("en_core_web_sm") # nlp language model
matcher = Matcher(nlp.vocab) # nlp language matcher

In [3]:
# primary url
url = "https://www.theguardian.com/business/2022/jun/06/thousands-workers-worlds-biggest-trial-four-day-week?"

In [5]:
# define header
header = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN) AppleWebKit/533+ (KHTML, like Gecko)'}

In [6]:
# Collect primary article html and relevant url links.
r = requests.get(url, headers=header)
r.raise_for_status()
r

<Response [200]>

In [7]:
# make soup from the html
soup = BeautifulSoup(r.content, features='html')

In [8]:
# find all tags whose name starts (^) and ends ($) with letter "p".
tags_w_p_list = soup.find_all(re.compile("^p$"))
len(tags_w_p_list)

15

In [9]:
# create a single string from the article, and create found source metadata

article_str = ""
urls_found = [] # list of dicts
for i,p in enumerate(tags_w_p_list):
    if p.a:
        source_dict = {'line': i, 'text': p.text.strip(), 'url': p.a.get('href')}
        # TODO: include metadata (original article url, date accessed, etc.)
        urls_found.append(source_dict)
    article_str += p.text.strip()
    print(f"{len(article_str)=}", end='\r')

len(article_str)=3478

In [10]:
# Analyse text using Spacy by making a doc
doc = nlp(article_str)

In [15]:
entity_explanations = []
for ent in doc.ents:
    d = {'entity': ent, 'label': ent.label_, 'info': spacy.explain(ent.label_)}
    entity_explanations.append(d)

In [16]:
entity_explanations[:2]

[{'entity': life’More than 3,300,
  'label': 'CARDINAL',
  'info': 'Numerals that do not fall under another type'},
 {'entity': 70,
  'label': 'CARDINAL',
  'info': 'Numerals that do not fall under another type'}]

In [18]:
# %%timeit
entity_explanations = {}
for ent in doc.ents:
    if str(ent.label_) not in entity_explanations:
        entity_explanations[str(ent.label_)] = {
            'explanation': spacy.explain(ent.label_), 
            'entities': [str(ent)]
        }
    else:
        entity_explanations[str(ent.label_)]['entities'].append(str(ent))
print(len(entity_explanations))

9


In [23]:
with open('spacy_entities.json', 'w') as op_file:
    json.dump(json.load(entity_explanations), op_file, indent=2)
op_file.close()

AttributeError: 'dict' object has no attribute 'read'

In [24]:
entity_explanations

{'CARDINAL': {'explanation': 'Numerals that do not fall under another type',
  'entities': [life’More than 3,300, 70, 100:80:100]},
 'GPE': {'explanation': 'Countries, cities, states',
  'entities': [UK, Sheffield, London, Tonbridge, UK, Spain, Scotland, UK]},
 'DATE': {'explanation': 'Absolute or relative dates or periods',
  'entities': [four-day week,
   Monday,
   six months,
   4 Day Week,
   4 Day Week,
   four-day week,
   four-day week,
   later this year,
   an extra day,
   four-day week,
   25,
   four-day week,
   four-day week,
   20th-century,
   five-day,
   21st-century,
   four-day week]},
 'ORG': {'explanation': 'Companies, agencies, institutions, etc.',
  'entities': [Cambridge University,
   Oxford University and Boston College,
   Wells-next-the-Sea,
   Norfolk,
   Stellar Asset Management,
   Charity Bank,
   Kent,
   Boston College,
   Platten’s Fish and Chips,
   Charity Bank,
   Charity Bank]},
 'PERCENT': {'explanation': 'Percentage, including "%"',
  'entitie

In [14]:
tokens = [w.text for w in doc]
print(f"{len(tokens)=}")
print(f"{len(set(tokens))=}")

len(tokens)=707
len(set(tokens))=324


In [15]:
# without punctuation
tokens = [w for w in doc if not w.is_punct]
print(f"{len(tokens)=}")
print(f"{len(set(tokens))=}")

len(tokens)=602
len(set(tokens))=602


In [16]:
# remove stopwords, (optional) [Note: using tokens from above, not doc]
tokens = [w.text for w in tokens if not w.is_stop]
print(f"{len(tokens)=}")
print(f"{len(set(tokens))=}")

len(tokens)=320
len(set(tokens))=231


In [138]:
# Lemmitize
lemma = [w.lemma_ for w in doc]
print(f"{len(lemma)=}")
print(f"{len(set(lemma))=}")

len(lemma)=707
len(set(lemma))=299


In [164]:
def clean_text(document, text=True):
    "Returns generator of text (default: True) or spacy.tokens.token.Token that is not a punctuation, quote or stopword."
    if not isinstance(document, spacy.tokens.doc.Doc):
        document = nlp(document)

    for w in document:
        if not (w.is_stop |  w.is_punct | w.is_quote):
            yield w.text if text == True else w

In [162]:
clean_tokens = list(clean_text(article_str, False))
len(clean_tokens)

320

In [156]:
lemmas = [w.lemma_ for w in clean_tokens]
len(lemmas)

320

In [165]:
# Show visually named entities
spacy.displacy.render(doc, style='ent', jupyter=True)

### Find all organisations

In [198]:
# for ent in doc.ents:
#     d = {'entity': ent, 'label': ent.label_, 'explanation': spacy.explain(ent.label_)}
#     print(d)

In [17]:
organisations = set([ent.text.strip() for ent in doc.ents if ent.label_ == 'ORG'])
print(len(organisations))
print(organisations)

9
{'Oxford University and Boston College', 'Cambridge University', 'Stellar Asset Management', 'Boston College', 'Norfolk', 'Charity Bank', 'Platten’s Fish and Chips', 'Kent', 'Wells-next-the-Sea'}


In [213]:
ent_target = ['ORG','PERSON']

results = {}
for ent in doc.ents:
    if ent.label_ in ent_target:
        if ent.label_ not in results:
            results[ent.label_] = [ent]
        else:
            results[ent.label_].append(ent)

In [217]:
for k in ent_target:
    results[k] = list(set(results[k]))

In [223]:
",".join(results['ORG'])

TypeError: sequence item 0: expected str instance, spacy.tokens.span.Span found

In [207]:
doc.text.find('Norfolk')

804

In [183]:
matcher = Matcher(nlp.vocab) # initialize the matcher
# make pattern
pattern = [{'POS': 'PROPN', 'OP': '+'},
        {'POS': 'CCONJ', 'OP': '?'},
        {'POS': 'PROPN', 'OP': '*'},
        {'ORTH': '\'', 'OP': '?'},
        {'ORTH': '\'s', 'OP': '?'},
        {'ORTH': '(', 'OP': '+'}]  
# add pattern
matcher.add("ORG", [pattern]) # add pattern
matches = matcher(doc) # find for pattern
# show output
for match_id, start, end in matches:
    span = doc[start:end]
    print(start, end, span.text)