# Named Entity Recognition

## Import SpaCy in English

In [1]:
!python -m spacy download en_core_web_lg
import spacy
nlp = spacy.load("en_core_web_lg")

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     ---------------------------------------- 0.1/587.7 MB 1.1 MB/s eta 0:09:00
     ---------------------------------------- 0.1/587.7 MB 1.4 MB/s eta 0:06:55
     ---------------------------------------- 0.6/587.7 MB 3.7 MB/s eta 0:02:41
     --------------------------------------- 2.0/587.7 MB 10.5 MB/s eta 0:00:57
     --------------------------------------- 3.2/587.7 MB 14.7 MB/s eta 0:00:40
     --------------------------------------- 5.1/587.7 MB 18.1 MB/s eta 0:00:33
     --------------------------------------- 5.1/587.7 MB 18.1 MB/s eta 0:00:33
     --------------------------------------- 5.1/587.7 MB 18.1 MB/s eta 0:00:33
     --------------------------------------- 6.7/587.7 MB 15.3 MB/s eta 0:00:38
      ------------------------

## Let’s Try on Real Dataset1

In [2]:
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html.parser')  # Change 'html5lib' to 'html.parser'
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

ny_bb = url_to_string('https://www.reuters.com/site-search/?query=movies&offset=0')
article = nlp(ny_bb)
len(article.ents)

1

## Have a Look At The NERS

In [3]:
# Visualize named entities in the article using displacy
from spacy import displacy
displacy.render(article, style='ent', jupyter=True)

## Popular NER Types

In [4]:
# Count the frequency of each named entity label in the article
from collections import Counter
labels = [x.label_ for x in article.ents]
Counter(labels)

Counter({'GPE': 1})

## Most Popular NER

In [5]:
# Extract the text of each named entity in the article and count the most common 5 entities
items = [x.text for x in article.ents]
Counter(items).most_common(5)

[('JS', 1)]

## Let’s Pick One Sentence to Analyze

In [6]:
# Extract sentences from the article and print the first sentence
sentences = [x for x in article.sents]
print(sentences[0])

reuters.comPlease enable JS and disable any ad blocker


## NER Tags

In [7]:
# Visualize named entities in the first sentence of the article
displacy.render(nlp(str(sentences[0])), jupyter=True, style='ent')

## Types of Words in Sentence

In [8]:
# Extract orthographic form, part-of-speech, and lemma of non-stopword, non-punctuation tokens in the first sentence
[(x.orth_,x.pos_, x.lemma_) for x in [y
                                      for y
                                      in nlp(str(sentences[0]))
                                      if not y.is_stop and y.pos_ != 'PUNCT']]

[('reuters.comPlease', 'INTJ', 'reuters.complease'),
 ('enable', 'VERB', 'enable'),
 ('JS', 'PROPN', 'JS'),
 ('disable', 'VERB', 'disable'),
 ('ad', 'NOUN', 'ad'),
 ('blocker', 'NOUN', 'blocker')]

## Sentence Dependency Tree

In [9]:
# Visualize dependency parse of the first sentence of the article with custom distance between words
displacy.render(nlp(str(sentences[0])), style='dep', jupyter = True,
                options = {'distance': 120})

## Let’s Try on Real Dataset2

In [10]:
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html.parser')  # Change 'html5lib' to 'html.parser'
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

ny_bb = url_to_string('https://timesofindia.indiatimes.com/sports/cricket/match-center-scorecard/Gujarat%20Titans-vs-Mumbai%20Indians-live-score-update-indian-premier-league-2024/ahmmi03242024237775')
article = nlp(ny_bb)
len(article.ents)

110

## Have a Look At The NERS

In [11]:
# Visualize named entities in the article using displacy
from spacy import displacy
displacy.render(article, style='ent', jupyter=True)

## Popular NER Types

In [12]:
# Count the frequency of each named entity label in the article
from collections import Counter
labels = [x.label_ for x in article.ents]
Counter(labels)

Counter({'ORG': 57,
         'PERSON': 17,
         'NORP': 9,
         'CARDINAL': 9,
         'GPE': 6,
         'PRODUCT': 4,
         'FAC': 2,
         'WORK_OF_ART': 2,
         'ORDINAL': 2,
         'DATE': 1,
         'MONEY': 1})

## Most Popular NER

In [13]:
# Extract the text of each named entity in the article and count the most common 5 entities
items = [x.text for x in article.ents]
Counter(items).most_common(5)

[('Mumbai', 3), ('Indians', 3), ('6', 2), ('Virat Kohli', 2), ('NewsTimes', 2)]

## Let’s Pick One Sentence to Analyze

In [14]:
# Extract sentences from the article and print the first sentence
sentences = [x for x in article.sents]
print(sentences[0])

Gujarat Titans vs Mumbai Indians Live Ball by Ball Commentary, Scorecard, News, Venue, City and Squads and moreEditionININUSRead ePaperSign InTOIcricketIPL


## NER Tags

In [15]:
# Visualize named entities in the first sentence of the article
displacy.render(nlp(str(sentences[0])), jupyter=True, style='ent')

## Types of Words in Sentence

In [16]:
# Extract orthographic form, part-of-speech, and lemma of non-stopword, non-punctuation tokens in the first sentence
[(x.orth_,x.pos_, x.lemma_) for x in [y
                                      for y
                                      in nlp(str(sentences[0]))
                                      if not y.is_stop and y.pos_ != 'PUNCT']]

[('Gujarat', 'PROPN', 'Gujarat'),
 ('Titans', 'PROPN', 'Titans'),
 ('vs', 'ADP', 'vs'),
 ('Mumbai', 'PROPN', 'Mumbai'),
 ('Indians', 'PROPN', 'Indians'),
 ('Live', 'VERB', 'live'),
 ('Ball', 'PROPN', 'Ball'),
 ('Ball', 'PROPN', 'Ball'),
 ('Commentary', 'PROPN', 'Commentary'),
 ('Scorecard', 'PROPN', 'Scorecard'),
 ('News', 'PROPN', 'News'),
 ('Venue', 'PROPN', 'Venue'),
 ('City', 'PROPN', 'City'),
 ('Squads', 'PROPN', 'Squads'),
 ('moreEditionININUSRead', 'PROPN', 'moreEditionININUSRead'),
 ('ePaperSign', 'VERB', 'epapersign'),
 ('InTOIcricketIPL', 'X', 'intoicricketipl')]

## Sentence Dependency Tree

In [17]:
# Visualize dependency parse of the first sentence of the article with custom distance between words
displacy.render(nlp(str(sentences[0])), style='dep', jupyter = True,
                options = {'distance': 120})

## Importing SpaCy in Italian

In [27]:
!python -m spacy download it_core_news_lg
import spacy

# Load the French language model
nlp = spacy.load("it_core_news_lg")


Collecting it-core-news-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_lg-3.7.0/it_core_news_lg-3.7.0-py3-none-any.whl (567.9 MB)
     ---------------------------------------- 0.0/567.9 MB ? eta -:--:--
     -------------------------------------- 0.0/567.9 MB 991.0 kB/s eta 0:09:34
     -------------------------------------- 0.1/567.9 MB 980.4 kB/s eta 0:09:40
     ---------------------------------------- 0.4/567.9 MB 2.6 MB/s eta 0:03:38
     ---------------------------------------- 1.4/567.9 MB 7.6 MB/s eta 0:01:15
     --------------------------------------- 3.6/567.9 MB 15.4 MB/s eta 0:00:37
     --------------------------------------- 4.1/567.9 MB 14.5 MB/s eta 0:00:39
     --------------------------------------- 6.9/567.9 MB 21.0 MB/s eta 0:00:27
      -------------------------------------- 8.9/567.9 MB 24.8 MB/s eta 0:00:23
      ------------------------------------- 10.5/567.9 MB 34.4 MB/s eta 0:00:17
      ---------------------

## Let’s Try on Real Dataset3

In [28]:
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html.parser')  # Change 'html5lib' to 'html.parser'
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

ny_bb = url_to_string('https://it.euronews.com/')
article = nlp(ny_bb)
len(article.ents)

221

## Have a Look At The NERS

In [29]:
# Visualize named entities in the article using displacy
from spacy import displacy
displacy.render(article, style='ent', jupyter=True)

## Popular NER Types

In [30]:
# Count the frequency of each named entity label in the article
from collections import Counter
labels = [x.label_ for x in article.ents]
Counter(labels)

Counter({'MISC': 153, 'LOC': 41, 'ORG': 18, 'PER': 9})

## Most Popular NER

In [31]:
# Extract the text of each named entity in the article and count the most common 5 entities
items = [x.text for x in article.ents]
Counter(items).most_common(5)

[('Prossima                                                                                 ',
  7),
 ('COLLABORAZIONE', 6),
 ('Mosca', 4),
 ('Kiev', 4),
 ('Europa', 4)]

## Let’s Pick One Sentence to Analyze

In [32]:
# Extract sentences from the article and print the first sentence
sentences = [x for x in article.sents]
print(sentences[0])

 


## NER Tags

In [33]:
# Visualize named entities in the first sentence of the article
displacy.render(nlp(str(sentences[0])), jupyter=True, style='ent')



## Types of Words in Sentence

In [34]:
# Extract orthographic form, part-of-speech, and lemma of non-stopword, non-punctuation tokens in the first sentence
[(x.orth_,x.pos_, x.lemma_) for x in [y
                                      for y
                                      in nlp(str(sentences[0]))
                                      if not y.is_stop and y.pos_ != 'PUNCT']]

[(' ', 'SPACE', ' ')]

## Sentence Dependency Tree

In [35]:
# Visualize dependency parse of the first sentence of the article with custom distance between words
displacy.render(nlp(str(sentences[0])), style='dep', jupyter = True,
                options = {'distance': 120})