In [15]:
import spacy
from spacy import displacy
from collections import Counter

import en_core_web_sm
# Load the spaCy pipeline shipped in the `en_core_web_sm` package once;
# the cells below reuse this `nlp` object for every parse.
nlp = en_core_web_sm.load()

In [16]:
from bs4 import BeautifulSoup
import requests
import re

In [17]:
def url_to_string(url, timeout=10):
    """Download a web page and return its visible text as one string.

    The page is fetched with ``requests`` and parsed with BeautifulSoup's
    ``html5lib`` parser. ``<script>``, ``<style>`` and ``<aside>`` elements
    are removed before the text is extracted, and every run of newlines
    and/or tabs in the extracted text is replaced by a single space.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host, which would hang the notebook cell.

    Returns:
        The page text with newline/tab runs collapsed to single spaces.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status code.
        requests.exceptions.RequestException: For connection problems or
            timeouts reported by ``requests``.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of silently running the text
    # extraction (and later the NER pipeline) on an error page.
    res.raise_for_status()

    soup = BeautifulSoup(res.text, 'html5lib')
    # Drop elements whose content is not part of the readable article body.
    for tag in soup(["script", "style", "aside"]):
        tag.extract()

    # Equivalent to splitting on newline/tab runs and joining with spaces.
    return re.sub(r'[\n\t]+', ' ', soup.get_text())

In [29]:
# Fetch the article text, run it through the spaCy pipeline, and show how
# many entities ended up in `article.ents`.
BBC_ARTICLE_URL = 'https://www.bbc.com/news/technology-53191007'
bbc_news = url_to_string(BBC_ARTICLE_URL)
article = nlp(bbc_news)
len(article.ents)

143

In [30]:
# Tally how many of the article's entities carry each entity label.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'ORG': 57,
         'LOC': 8,
         'GPE': 20,
         'PERSON': 18,
         'DATE': 21,
         'WORK_OF_ART': 2,
         'CARDINAL': 7,
         'LAW': 1,
         'PRODUCT': 5,
         'NORP': 2,
         'FAC': 1,
         'ORDINAL': 1})

In [31]:
# The three entity texts that occur most often in the article.
items = [ent.text for ent in article.ents]
Counter(items).most_common(3)

[('Apple', 13), ('BBC', 5), ('Mac', 5)]

In [32]:
# NOTE: despite its name, `sentences` is a list of the entity spans in
# `article.ents`, not of sentences. The name is kept because the following
# cells index into it.
sentences = list(article.ents)
print(sentences[70])

Carolina Milanesi of Creative Strategies


In [33]:
# Re-parse the text of the selected span on its own and render the entities
# found in it inline in the notebook.
displacy.render(
    nlp(str(sentences[70])),
    style='ent',
    jupyter=True,
)

In [34]:
# Render the dependency parse of the same span; `distance` is passed through
# to displaCy's rendering options.
displacy.render(
    nlp(str(sentences[70])),
    jupyter=True,
    style='dep',
    options={'distance': 120},
)

In [35]:
# (surface form, part-of-speech tag, lemma) for every token of the re-parsed
# span that is neither a stop word nor tagged as punctuation.
[
    (tok.orth_, tok.pos_, tok.lemma_)
    for tok in nlp(str(sentences[70]))
    if not tok.is_stop and tok.pos_ != 'PUNCT'
]

[('Carolina', 'PROPN', 'Carolina'),
 ('Milanesi', 'PROPN', 'Milanesi'),
 ('Creative', 'PROPN', 'Creative'),
 ('Strategies', 'NOUN', 'strategy')]

In [36]:
# For each token of the selected span, print the token together with its
# `ent_iob_` and `ent_type_` attributes as assigned in the full-article parse.
print([
    (tok, tok.ent_iob_, tok.ent_type_)
    for tok in sentences[70]
])

[(Carolina, 'B', 'ORG'), (Milanesi, 'I', 'ORG'), (of, 'I', 'ORG'), (Creative, 'I', 'ORG'), (Strategies, 'I', 'ORG')]


In [37]:
# Render the whole parsed article with its entities highlighted inline.
displacy.render(
    article,
    style='ent',
    jupyter=True,
)