# Named Entity Recognition

## Import SpaCy in English

In [1]:
!python -m spacy download en_core_web_lg
import spacy
nlp = spacy.load("en_core_web_lg")

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     ---------------------------------------- 0.1/587.7 MB 3.2 MB/s eta 0:03:04
     ---------------------------------------- 0.1/587.7 MB 3.2 MB/s eta 0:03:04
     ---------------------------------------- 0.2/587.7 MB 1.5 MB/s eta 0:06:35
     ---------------------------------------- 0.2/587.7 MB 1.5 MB/s eta 0:06:39
     ---------------------------------------- 0.3/587.7 MB 1.5 MB/s eta 0:06:42
     ---------------------------------------- 0.4/587.7 MB 1.5 MB/s eta 0:06:37
     ---------------------------------------- 0.5/587.7 MB 1.5 MB/s eta 0:06:37
     ---------------------------------------- 0.5/587.7 MB 1.5 MB/s eta 0:06:27
     ---------------------------------------- 0.5/587.7 MB 1.5 MB/s eta 0:06:27
     -------------------------

## Let’s Try It on Real Dataset 1

In [26]:
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url, timeout=30):
    """Download a web page and return its visible text as a single string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The page text with ``<script>``, ``<style>`` and ``<aside>`` elements
        removed and every run of newlines/tabs replaced by one space.

    Raises:
        requests.RequestException: On connection problems or when the
            timeout expires.
    """
    import warnings

    res = requests.get(url, timeout=timeout)
    if not res.ok:
        # Keep returning the body (as before), but make it visible that the
        # text is probably an error / anti-bot page rather than the article.
        warnings.warn(
            f"{url} answered with HTTP {res.status_code}; "
            "the extracted text may not be the requested page.",
            stacklevel=2,
        )
    soup = BeautifulSoup(res.text, 'html.parser')
    # Drop elements whose content is not part of the readable page text.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# Download the Reuters page, run the spaCy pipeline over its text and show how
# many named entities were recognised.
# NOTE(review): the saved outputs of this section (a single entity, first
# sentence "Please enable JS and disable any ad blocker") suggest the site
# served an anti-bot page instead of the article -- confirm before relying on
# these results.
ny_bb = url_to_string(
    'https://www.reuters.com/markets/how-companies-are-responding-attacks-ships-red-sea-2023-12-19/'
)
article = nlp(ny_bb)
len(article.ents)

1

## Have a Look at the Named Entities

In [27]:
# Render the whole article inline with every recognised entity highlighted.
from spacy import displacy

displacy.render(
    article,
    style='ent',
    jupyter=True,
)

## Popular NER Types

In [28]:
# Tally how many entities of each label the pipeline found in the article.
from collections import Counter

labels = [entity.label_ for entity in article.ents]
Counter(labels)

Counter({'GPE': 1})

## Most Common Named Entities

In [29]:
# Collect the surface text of every entity and list the five most frequent ones.
items = [entity.text for entity in article.ents]
Counter(items).most_common(5)

[('JS', 1)]

## Let’s Pick One Sentence to Analyze

In [30]:
# Materialise the article's sentence spans as a list and show the first one.
sentences = list(article.sents)
print(sentences[0])

reuters.comPlease enable JS and disable any ad blocker


## NER Tags

In [31]:
# Run the pipeline on the first sentence alone and highlight its entities.
displacy.render(
    nlp(str(sentences[0])),
    style='ent',
    jupyter=True,
)

## Types of Words in Sentence

In [32]:
# For the first sentence, list (surface form, part-of-speech tag, lemma) of
# every token that is neither a stop word nor punctuation. A single filtered
# comprehension is enough; no intermediate token list is needed.
[(token.orth_, token.pos_, token.lemma_)
 for token in nlp(str(sentences[0]))
 if not token.is_stop and token.pos_ != 'PUNCT']

[('reuters.comPlease', 'INTJ', 'reuters.complease'),
 ('enable', 'VERB', 'enable'),
 ('JS', 'PROPN', 'JS'),
 ('disable', 'VERB', 'disable'),
 ('ad', 'NOUN', 'ad'),
 ('blocker', 'NOUN', 'blocker')]

## Sentence Dependency Tree

In [33]:
# Draw the dependency parse of the first sentence; the 'distance' option
# customises the spacing between words in the rendered tree.
displacy.render(
    nlp(str(sentences[0])),
    style='dep',
    jupyter=True,
    options={'distance': 120},
)

## Let’s Try It on Real Dataset 2

In [10]:
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url, timeout=30):
    """Download a web page and return its visible text as a single string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The page text with ``<script>``, ``<style>`` and ``<aside>`` elements
        removed and every run of newlines/tabs replaced by one space.

    Raises:
        requests.RequestException: On connection problems or when the
            timeout expires.
    """
    import warnings

    res = requests.get(url, timeout=timeout)
    if not res.ok:
        # Keep returning the body (as before), but make it visible that the
        # text is probably an error / anti-bot page rather than the article.
        warnings.warn(
            f"{url} answered with HTTP {res.status_code}; "
            "the extracted text may not be the requested page.",
            stacklevel=2,
        )
    soup = BeautifulSoup(res.text, 'html.parser')
    # Drop elements whose content is not part of the readable page text.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# Download the Times of India scorecard page, run the spaCy pipeline over its
# text and show how many named entities were recognised.
ny_bb = url_to_string(
    'https://timesofindia.indiatimes.com/sports/cricket/match-center-scorecard/Gujarat%20Titans-vs-Mumbai%20Indians-live-score-update-indian-premier-league-2024/ahmmi03242024237775'
)
article = nlp(ny_bb)
len(article.ents)

87

## Have a Look at the Named Entities

In [11]:
# Render the whole page text inline with every recognised entity highlighted.
from spacy import displacy

displacy.render(
    article,
    style='ent',
    jupyter=True,
)

## Popular NER Types

In [12]:
# Tally how many entities of each label the pipeline found in the page text.
from collections import Counter

labels = [entity.label_ for entity in article.ents]
Counter(labels)

Counter({'ORG': 38,
         'GPE': 14,
         'NORP': 6,
         'PRODUCT': 2,
         'DATE': 4,
         'CARDINAL': 7,
         'PERSON': 12,
         'WORK_OF_ART': 2,
         'EVENT': 1,
         'MONEY': 1})

## Most Common Named Entities

In [13]:
# Collect the surface text of every entity and list the five most frequent ones.
items = [entity.text for entity in article.ents]
Counter(items).most_common(5)

[('Mumbai', 3), ('Indians', 3), ('2024', 3), ('Watan', 2), ('NewsTimes', 2)]

## Let’s Pick One Sentence to Analyze

In [14]:
# Materialise the sentence spans of the parsed text as a list and show the first one.
sentences = list(article.sents)
print(sentences[0])

Gujarat Titans vs Mumbai Indians Live Ball by Ball Commentary, Scorecard, News, Venue, City and Squads and moreEditionININUSRead ePaperSign InTOIcricketIPL


## NER Tags

In [15]:
# Run the pipeline on the first sentence alone and highlight its entities.
displacy.render(
    nlp(str(sentences[0])),
    style='ent',
    jupyter=True,
)

## Types of Words in Sentence

In [16]:
# For the first sentence, list (surface form, part-of-speech tag, lemma) of
# every token that is neither a stop word nor punctuation. A single filtered
# comprehension is enough; no intermediate token list is needed.
[(token.orth_, token.pos_, token.lemma_)
 for token in nlp(str(sentences[0]))
 if not token.is_stop and token.pos_ != 'PUNCT']

[('Gujarat', 'PROPN', 'Gujarat'),
 ('Titans', 'PROPN', 'Titans'),
 ('vs', 'ADP', 'vs'),
 ('Mumbai', 'PROPN', 'Mumbai'),
 ('Indians', 'PROPN', 'Indians'),
 ('Live', 'VERB', 'live'),
 ('Ball', 'PROPN', 'Ball'),
 ('Ball', 'PROPN', 'Ball'),
 ('Commentary', 'PROPN', 'Commentary'),
 ('Scorecard', 'PROPN', 'Scorecard'),
 ('News', 'PROPN', 'News'),
 ('Venue', 'PROPN', 'Venue'),
 ('City', 'PROPN', 'City'),
 ('Squads', 'PROPN', 'Squads'),
 ('moreEditionININUSRead', 'PROPN', 'moreEditionININUSRead'),
 ('ePaperSign', 'VERB', 'epapersign'),
 ('InTOIcricketIPL', 'X', 'intoicricketipl')]

## Sentence Dependency Tree

In [17]:
# Draw the dependency parse of the first sentence; the 'distance' option
# customises the spacing between words in the rendered tree.
displacy.render(
    nlp(str(sentences[0])),
    style='dep',
    jupyter=True,
    options={'distance': 120},
)

## Importing spaCy in German (Deutsch)

In [None]:
!python -m spacy download de_core_news_lg
import spacy
# Load the German language model
nlp = spacy.load("de_core_news_lg")

## Let’s Try It on Real Dataset 3

In [18]:
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url, timeout=30):
    """Download a web page and return its visible text as a single string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The page text with ``<script>``, ``<style>`` and ``<aside>`` elements
        removed and every run of newlines/tabs replaced by one space.

    Raises:
        requests.RequestException: On connection problems or when the
            timeout expires.
    """
    import warnings

    res = requests.get(url, timeout=timeout)
    if not res.ok:
        # Keep returning the body (as before), but make it visible that the
        # text is probably an error / anti-bot page rather than the article.
        warnings.warn(
            f"{url} answered with HTTP {res.status_code}; "
            "the extracted text may not be the requested page.",
            stacklevel=2,
        )
    soup = BeautifulSoup(res.text, 'html.parser')
    # Drop elements whose content is not part of the readable page text.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# Download the sueddeutsche.de article, run the current `nlp` pipeline over its
# text and show how many named entities were recognised.
# NOTE(review): `nlp` is whatever pipeline was loaded last; for this German
# article it is presumably meant to be de_core_news_lg -- verify that the
# German model cell ran first.
ny_bb = url_to_string(
    'https://www.sueddeutsche.de/sport/kroos-dfb-frankreich-nagelsmann-deutschland-1.6484717?reduced=true'
)
article = nlp(ny_bb)
len(article.ents)

138

## Have a Look at the Named Entities

In [19]:
# Render the whole article inline with every recognised entity highlighted.
from spacy import displacy

displacy.render(
    article,
    style='ent',
    jupyter=True,
)

## Popular NER Types

In [20]:
# Tally how many entities of each label the pipeline found in the article.
from collections import Counter

labels = [entity.label_ for entity in article.ents]
Counter(labels)

Counter({'GPE': 6,
         'CARDINAL': 10,
         'ORG': 39,
         'PERSON': 60,
         'DATE': 2,
         'PRODUCT': 3,
         'NORP': 1,
         'MONEY': 1,
         'FAC': 6,
         'PERCENT': 9,
         'LOC': 1})

## Most Common Named Entities

In [21]:
# Collect the surface text of every entity and list the five most frequent ones.
items = [entity.text for entity in article.ents]
Counter(items).most_common(5)

[('Gutschein', 3),
 ('2:0', 2),
 ('Toni Kroos', 2),
 ('gegen', 2),
 ('bei eBay', 2)]

## Let’s Pick One Sentence to Analyze

In [22]:
# Materialise the article's sentence spans as a list and show the first one.
sentences = list(article.sents)
print(sentences[0])

Deutschland und das 2:0


## NER Tags

In [23]:
# Run the pipeline on the first sentence alone and highlight its entities.
displacy.render(
    nlp(str(sentences[0])),
    style='ent',
    jupyter=True,
)

## Types of Words in Sentence

In [24]:
# For the first sentence, list (surface form, part-of-speech tag, lemma) of
# every token that is neither a stop word nor punctuation. A single filtered
# comprehension is enough; no intermediate token list is needed.
[(token.orth_, token.pos_, token.lemma_)
 for token in nlp(str(sentences[0]))
 if not token.is_stop and token.pos_ != 'PUNCT']

[('Deutschland', 'PROPN', 'Deutschland'),
 ('und', 'VERB', 'und'),
 ('das', 'VERB', 'da'),
 ('2:0', 'NUM', '2:0')]

## Sentence Dependency Tree

In [25]:
# Draw the dependency parse of the first sentence; the 'distance' option
# customises the spacing between words in the rendered tree.
displacy.render(
    nlp(str(sentences[0])),
    style='dep',
    jupyter=True,
    options={'distance': 120},
)