In [None]:
import spacy

# Load spaCy's small English pipeline; its entity recogniser is used below.
nlp = spacy.load("en_core_web_sm")

# Sample news text to run entity recognition on.
content = "Eight Indian Navy veterans, who initially faced death sentences in Qatar, were released by Doha on 12th Feb 2024. The capital punishment was later converted to prolonged prison terms through diplomatic interventions by New Delhi."

# Processing the text returns a Doc; its `.ents` holds the recognised entity spans.
doc = nlp(content)

# One line per entity: surface text followed by its label.
for ent in doc.ents:
    print(ent.text, ent.label_)

Eight CARDINAL
Indian Navy ORG
Qatar GPE
Doha GPE
12th Feb 2024 DATE
New Delhi GPE


In [None]:
from spacy import displacy
# Visualise the named entities of `doc` using displaCy's entity ("ent") style.
displacy.render(doc, style="ent")

In [None]:
import nltk
# Download NLTK's 'punkt' tokenizer data.
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
# Same sample text as used in the spaCy cell above.
content = "Eight Indian Navy veterans, who initially faced death sentences in Qatar, were released by Doha on 12th Feb 2024. The capital punishment was later converted to prolonged prison terms through diplomatic interventions by New Delhi."




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Split the sample text into word and punctuation tokens.
sent = nltk.word_tokenize(content)

# Bare expression so the notebook displays the token list.
sent

['Eight',
 'Indian',
 'Navy',
 'veterans',
 ',',
 'who',
 'initially',
 'faced',
 'death',
 'sentences',
 'in',
 'Qatar',
 ',',
 'were',
 'released',
 'by',
 'Doha',
 'on',
 '12th',
 'Feb',
 '2024',
 '.',
 'The',
 'capital',
 'punishment',
 'was',
 'later',
 'converted',
 'to',
 'prolonged',
 'prison',
 'terms',
 'through',
 'diplomatic',
 'interventions',
 'by',
 'New',
 'Delhi',
 '.']

In [None]:
import nltk

# Fetch the tagger data before calling nltk.pos_tag.
nltk.download('averaged_perceptron_tagger')

# Pair every token with a part-of-speech tag: [(token, tag), ...].
sent1 = nltk.pos_tag(sent)
sent1

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('Eight', 'CD'),
 ('Indian', 'JJ'),
 ('Navy', 'NNP'),
 ('veterans', 'NNS'),
 (',', ','),
 ('who', 'WP'),
 ('initially', 'RB'),
 ('faced', 'VBD'),
 ('death', 'NN'),
 ('sentences', 'NNS'),
 ('in', 'IN'),
 ('Qatar', 'NNP'),
 (',', ','),
 ('were', 'VBD'),
 ('released', 'VBN'),
 ('by', 'IN'),
 ('Doha', 'NNP'),
 ('on', 'IN'),
 ('12th', 'CD'),
 ('Feb', 'JJ'),
 ('2024', 'CD'),
 ('.', '.'),
 ('The', 'DT'),
 ('capital', 'NN'),
 ('punishment', 'NN'),
 ('was', 'VBD'),
 ('later', 'RB'),
 ('converted', 'VBN'),
 ('to', 'TO'),
 ('prolonged', 'JJ'),
 ('prison', 'NN'),
 ('terms', 'NNS'),
 ('through', 'IN'),
 ('diplomatic', 'JJ'),
 ('interventions', 'NNS'),
 ('by', 'IN'),
 ('New', 'NNP'),
 ('Delhi', 'NNP'),
 ('.', '.')]

In [None]:
# Chunk grammar: a noun phrase (NP) is an optional determiner (DT),
# any number of adjectives (JJ), then exactly one singular noun (NN).
# Plural (NNS) and proper (NNP) nouns are not matched by this rule.
pattern = 'NP: {<DT>?<JJ>*<NN>}'
cp = nltk.RegexpParser(pattern)
# Group the POS-tagged tokens into NP chunks; the result is a tree rooted at S.
cs = cp.parse(sent1)
print(cs)

(S
  Eight/CD
  Indian/JJ
  Navy/NNP
  veterans/NNS
  ,/,
  who/WP
  initially/RB
  faced/VBD
  (NP death/NN)
  sentences/NNS
  in/IN
  Qatar/NNP
  ,/,
  were/VBD
  released/VBN
  by/IN
  Doha/NNP
  on/IN
  12th/CD
  Feb/JJ
  2024/CD
  ./.
  (NP The/DT capital/NN)
  (NP punishment/NN)
  was/VBD
  later/RB
  converted/VBN
  to/TO
  (NP prolonged/JJ prison/NN)
  terms/NNS
  through/IN
  diplomatic/JJ
  interventions/NNS
  by/IN
  New/NNP
  Delhi/NNP
  ./.)


In [None]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
# Flatten the chunk tree into (word, POS tag, IOB chunk tag) triples.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('Eight', 'CD', 'O'),
 ('Indian', 'JJ', 'O'),
 ('Navy', 'NNP', 'O'),
 ('veterans', 'NNS', 'O'),
 (',', ',', 'O'),
 ('who', 'WP', 'O'),
 ('initially', 'RB', 'O'),
 ('faced', 'VBD', 'O'),
 ('death', 'NN', 'B-NP'),
 ('sentences', 'NNS', 'O'),
 ('in', 'IN', 'O'),
 ('Qatar', 'NNP', 'O'),
 (',', ',', 'O'),
 ('were', 'VBD', 'O'),
 ('released', 'VBN', 'O'),
 ('by', 'IN', 'O'),
 ('Doha', 'NNP', 'O'),
 ('on', 'IN', 'O'),
 ('12th', 'CD', 'O'),
 ('Feb', 'JJ', 'O'),
 ('2024', 'CD', 'O'),
 ('.', '.', 'O'),
 ('The', 'DT', 'B-NP'),
 ('capital', 'NN', 'I-NP'),
 ('punishment', 'NN', 'B-NP'),
 ('was', 'VBD', 'O'),
 ('later', 'RB', 'O'),
 ('converted', 'VBN', 'O'),
 ('to', 'TO', 'O'),
 ('prolonged', 'JJ', 'B-NP'),
 ('prison', 'NN', 'I-NP'),
 ('terms', 'NNS', 'O'),
 ('through', 'IN', 'O'),
 ('diplomatic', 'JJ', 'O'),
 ('interventions', 'NNS', 'O'),
 ('by', 'IN', 'O'),
 ('New', 'NNP', 'O'),
 ('Delhi', 'NNP', 'O'),
 ('.', '.', 'O')]


In [None]:
import nltk

# Data packages fetched before running NLTK's named-entity chunker.
nltk.download('words')
nltk.download('maxent_ne_chunker')

# Tokenize -> POS-tag -> named-entity chunk the sample text, then show the tree.
ne_tree = nltk.ne_chunk(
    pos_tag(word_tokenize(content))
)
print(ne_tree)

(S
  Eight/CD
  (GPE Indian/JJ)
  Navy/NNP
  veterans/NNS
  ,/,
  who/WP
  initially/RB
  faced/VBD
  death/NN
  sentences/NNS
  in/IN
  (GPE Qatar/NNP)
  ,/,
  were/VBD
  released/VBN
  by/IN
  (PERSON Doha/NNP)
  on/IN
  12th/CD
  Feb/JJ
  2024/CD
  ./.
  The/DT
  capital/NN
  punishment/NN
  was/VBD
  later/RB
  converted/VBN
  to/TO
  prolonged/JJ
  prison/NN
  terms/NNS
  through/IN
  diplomatic/JJ
  interventions/NNS
  by/IN
  (ORGANIZATION New/NNP Delhi/NNP)
  ./.)


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


In [None]:
# Second example sentence, used by the cells that follow.
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [None]:
def preprocess(sent):
    """Tokenize a sentence and tag each token with its part of speech.

    Parameters
    ----------
    sent : str
        Raw sentence text.

    Returns
    -------
    list of tuple
        ``(token, POS tag)`` pairs as produced by ``nltk.pos_tag``.
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

In [None]:
# Tokenize and POS-tag the example sentence. Note: this rebinds `sent`,
# which earlier held the plain token list of `content`.
sent = preprocess(ex)
sent

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

In [None]:
# NP chunk rule: optional determiner (DT), any adjectives (JJ), then one singular noun (NN).
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [None]:
# Build a chunk parser from the grammar and apply it to the tagged sentence.
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [None]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
# Flatten the chunk tree into (word, POS tag, IOB chunk tag) triples.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]


In [None]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the en_core_web_sm model through its installed package
# (rebinds `nlp`, first created earlier with spacy.load("en_core_web_sm")).
nlp = en_core_web_sm.load()

In [None]:
# Run the spaCy pipeline on the example sentence.
doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')
# Show every recognised entity together with its label.
pprint([(entity.text, entity.label_) for entity in doc.ents])

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('$5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]


In [None]:
# Token-level view: for each token, its IOB entity position and entity type
# (the type is an empty string for tokens outside any entity).
pprint([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'O', ''),
 (record, 'O', ''),
 ($, 'B', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


In [None]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=10):
    """Download a web page and return its text content as a single string.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Defaults to 10 so a stalled connection cannot hang the notebook.

    Returns
    -------
    str
        Page text with ``<script>``, ``<style>`` and ``<aside>`` elements
        removed and every run of newlines/tabs replaced by one space.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status code.
    requests.RequestException
        On connection failure or timeout.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of silently handing an
    # error page to the NLP pipeline.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Drop elements whose text is not part of the readable page content.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
# Fetch the article page as plain text and run the spaCy pipeline over it.
ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')
article = nlp(ny_bb)
# Number of named entities found in the article.
# NOTE(review): the recorded output is 0, which suggests the fetched text was
# not the article body — confirm what the request actually returns.
len(article.ents)

0

In [None]:
# Count how often each entity label occurs in the article.
labels = [entity.label_ for entity in article.ents]
Counter(labels)

Counter()

In [None]:
# The three most frequently occurring entity strings in the article.
items = [entity.text for entity in article.ents]
Counter(items).most_common(3)

[]