# 1. NLTK

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [2]:
ex = "A giant industry with annual revenues of nearly $3trn has grown to provide transport to the masses. Over 1bn cars heave passengers along the world’s roads. There were many pioneers beside the Germans. The French provided words like coupé, chauffeur and cabriolet."


In [3]:
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

In [4]:
sent = preprocess(ex)
sent

[('A', 'DT'),
 ('giant', 'JJ'),
 ('industry', 'NN'),
 ('with', 'IN'),
 ('annual', 'JJ'),
 ('revenues', 'NNS'),
 ('of', 'IN'),
 ('nearly', 'RB'),
 ('$', '$'),
 ('3trn', 'CD'),
 ('has', 'VBZ'),
 ('grown', 'VBN'),
 ('to', 'TO'),
 ('provide', 'VB'),
 ('transport', 'NN'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('masses', 'NNS'),
 ('.', '.'),
 ('Over', 'IN'),
 ('1bn', 'CD'),
 ('cars', 'NNS'),
 ('heave', 'VBP'),
 ('passengers', 'NNS'),
 ('along', 'IN'),
 ('the', 'DT'),
 ('world', 'NN'),
 ('’', 'NNP'),
 ('s', 'NN'),
 ('roads', 'NNS'),
 ('.', '.'),
 ('There', 'EX'),
 ('were', 'VBD'),
 ('many', 'JJ'),
 ('pioneers', 'NNS'),
 ('beside', 'VBP'),
 ('the', 'DT'),
 ('Germans', 'NNPS'),
 ('.', '.'),
 ('The', 'DT'),
 ('French', 'JJ'),
 ('provided', 'VBD'),
 ('words', 'NNS'),
 ('like', 'IN'),
 ('coupé', 'NN'),
 (',', ','),
 ('chauffeur', 'NN'),
 ('and', 'CC'),
 ('cabriolet', 'NN'),
 ('.', '.')]

CC: It is the conjunction of coordinating

CD: It is a digit of cardinal

DT: It is the determiner

EX: Existential

FW: It is a foreign word

IN: Preposition and conjunction

JJ: Adjective

JJR and JJS: Adjective and superlative

LS: List marker

MD: Modal

NN: Singular noun

NNS, NNP, NNPS: Proper and plural noun

PDT: Predeterminer

WRB: Adverb of wh

WP$: Possessive wh

WP: Pronoun of wh

WDT: Determiner of wp

VBZ: Verb

VBP, VBN, VBG, VBD, VB: Forms of verbs

UH: Interjection

TO: To go

RP: Particle

RBS, RB, RBR: Adverb

PRP, PRP$: Pronoun personal and professional

In [9]:
# to "chunk" we come up with a pattern.  
# In this case, every noun phrase NP should be formed when an optional determiner DT 
# followed by any adjectivesJJ then a noun NN
pattern = 'NP: {<DT>?<JJ>*<NN|NNS>}'

In [11]:
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)
# some punctuation issues but overall good

(S
  (NP A/DT giant/JJ industry/NN)
  with/IN
  (NP annual/JJ revenues/NNS)
  of/IN
  nearly/RB
  $/$
  3trn/CD
  has/VBZ
  grown/VBN
  to/TO
  provide/VB
  (NP transport/NN)
  to/TO
  (NP the/DT masses/NNS)
  ./.
  Over/IN
  1bn/CD
  (NP cars/NNS)
  heave/VBP
  (NP passengers/NNS)
  along/IN
  (NP the/DT world/NN)
  ’/NNP
  (NP s/NN)
  (NP roads/NNS)
  ./.
  There/EX
  were/VBD
  (NP many/JJ pioneers/NNS)
  beside/VBP
  the/DT
  Germans/NNPS
  ./.
  The/DT
  French/JJ
  provided/VBD
  (NP words/NNS)
  like/IN
  (NP coupé/NN)
  ,/,
  (NP chauffeur/NN)
  and/CC
  (NP cabriolet/NN)
  ./.)


In [12]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('A', 'DT', 'B-NP'),
 ('giant', 'JJ', 'I-NP'),
 ('industry', 'NN', 'I-NP'),
 ('with', 'IN', 'O'),
 ('annual', 'JJ', 'B-NP'),
 ('revenues', 'NNS', 'I-NP'),
 ('of', 'IN', 'O'),
 ('nearly', 'RB', 'O'),
 ('$', '$', 'O'),
 ('3trn', 'CD', 'O'),
 ('has', 'VBZ', 'O'),
 ('grown', 'VBN', 'O'),
 ('to', 'TO', 'O'),
 ('provide', 'VB', 'O'),
 ('transport', 'NN', 'B-NP'),
 ('to', 'TO', 'O'),
 ('the', 'DT', 'B-NP'),
 ('masses', 'NNS', 'I-NP'),
 ('.', '.', 'O'),
 ('Over', 'IN', 'O'),
 ('1bn', 'CD', 'O'),
 ('cars', 'NNS', 'B-NP'),
 ('heave', 'VBP', 'O'),
 ('passengers', 'NNS', 'B-NP'),
 ('along', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('world', 'NN', 'I-NP'),
 ('’', 'NNP', 'O'),
 ('s', 'NN', 'B-NP'),
 ('roads', 'NNS', 'B-NP'),
 ('.', '.', 'O'),
 ('There', 'EX', 'O'),
 ('were', 'VBD', 'O'),
 ('many', 'JJ', 'B-NP'),
 ('pioneers', 'NNS', 'I-NP'),
 ('beside', 'VBP', 'O'),
 ('the', 'DT', 'O'),
 ('Germans', 'NNPS', 'O'),
 ('.', '.', 'O'),
 ('The', 'DT', 'O'),
 ('French', 'JJ', 'O'),
 ('provided', 'VBD', 'O'),


# SpaCy

In [15]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [17]:
doc = nlp(ex)
pprint([(x.text, x.label_) for x in doc.ents])

[('annual', 'DATE'),
 ('nearly $3trn', 'MONEY'),
 ('1bn', 'ORDINAL'),
 ('Germans', 'NORP'),
 ('French', 'NORP')]


In [18]:
pprint([(x, x.ent_iob_, x.ent_type_) for x in doc])

[(A, 'O', ''),
 (giant, 'O', ''),
 (industry, 'O', ''),
 (with, 'O', ''),
 (annual, 'B', 'DATE'),
 (revenues, 'O', ''),
 (of, 'O', ''),
 (nearly, 'B', 'MONEY'),
 ($, 'I', 'MONEY'),
 (3trn, 'I', 'MONEY'),
 (has, 'O', ''),
 (grown, 'O', ''),
 (to, 'O', ''),
 (provide, 'O', ''),
 (transport, 'O', ''),
 (to, 'O', ''),
 (the, 'O', ''),
 (masses, 'O', ''),
 (., 'O', ''),
 (Over, 'O', ''),
 (1bn, 'B', 'ORDINAL'),
 (cars, 'O', ''),
 (heave, 'O', ''),
 (passengers, 'O', ''),
 (along, 'O', ''),
 (the, 'O', ''),
 (world, 'O', ''),
 (’s, 'O', ''),
 (roads, 'O', ''),
 (., 'O', ''),
 (There, 'O', ''),
 (were, 'O', ''),
 (many, 'O', ''),
 (pioneers, 'O', ''),
 (beside, 'O', ''),
 (the, 'O', ''),
 (Germans, 'B', 'NORP'),
 (., 'O', ''),
 (The, 'O', ''),
 (French, 'B', 'NORP'),
 (provided, 'O', ''),
 (words, 'O', ''),
 (like, 'O', ''),
 (coupé, 'O', ''),
 (,, 'O', ''),
 (chauffeur, 'O', ''),
 (and, 'O', ''),
 (cabriolet, 'O', ''),
 (., 'O', '')]


In [21]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
ny_bb = url_to_string('https://www.economist.com/special-report/2023/04/14/all-change')
article = nlp(ny_bb)
len(article.ents)

129

In [24]:
labels = [x.label_ for x in article.ents]
Counter(labels).most_common()

[('ORG', 28),
 ('DATE', 28),
 ('GPE', 23),
 ('NORP', 11),
 ('PERSON', 9),
 ('CARDINAL', 8),
 ('LOC', 6),
 ('ORDINAL', 5),
 ('PERCENT', 4),
 ('WORK_OF_ART', 3),
 ('MONEY', 2),
 ('PRODUCT', 1),
 ('QUANTITY', 1)]

In [26]:
items = [x.text for x in article.ents]
Counter(items).most_common()
# I've got some work to do on the webscraping

[('China', 12),
 ('Chinese', 6),
 ('America', 5),
 ('Tesla', 5),
 ('2022', 3),
 ('EV', 3),
 ('annual', 2),
 ('1bn', 2),
 ('Japan', 2),
 ('Europe', 2),
 ('second', 2),
 ('BYD', 2),
 ('Xpeng', 2),
 ('decades', 2),
 ('The EconomistThe EconomistThe EconomistSkip', 1),
 ('UkraineRecession watchUS politicsClimate', 1),
 ('QuarterlyThe World', 1),
 ('SpecialsMoreNewslettersPodcastsFilmsSubscriber', 1),
 ('eventsThe Economist appOnline', 1),
 ('InvitationCurrent', 1),
 ('UkraineClimate', 1),
 ('QuarterlyThe World AheadBriefingEssaySchools briefBusiness & economicsFinance &',
  1),
 ('Mac indexA-Z', 1),
 ('economicsEconomic & financial indicatorsCulture & society1843', 1),
 ('magazineCultureObituaryThe Economist', 1),
 ('EconomistSaved', 1),
 ('Simon Wright Apr', 1),
 ('14th 2023ShareGoing', 1),
 ('first', 1),
 ('The Benz Patent Motorwagen', 1),
 ('German', 1),
 ('1886', 1),
 ('10-15km', 1),
 ('nearly $3trn', 1),
 ('Germans', 1),
 ('French', 1),
 ('Ford', 1),
 ('1908', 1),
 ('the 1950s', 1),
 (

In [28]:
sentences = [x for x in article.sents]
print(sentences[20])

The next phase of the industry’s history will be one in which tech-centric firms and the Chinese come to the fore.


In [33]:
displacy.render(nlp(str(sentences[17])), jupyter = True, style = 'ent')

In [36]:
displacy.render(nlp(str(sentences[17])), style='dep', jupyter = True, options = {'distance': 90})


In [38]:
displacy.render(nlp(str(article)), jupyter = True, style = 'ent')

In [39]:
# It missed quite a lot.  