# Name Entity Recognition

## Install Spacy package

In [13]:
!pip install spacy
!pip install html5lib

Collecting html5lib
  Obtaining dependency information for html5lib from https://files.pythonhosted.org/packages/6c/dd/a834df6482147d48e225a49515aabc28974ad5a4ca3215c18a882565b028/html5lib-1.1-py2.py3-none-any.whl.metadata
  Downloading html5lib-1.1-py2.py3-none-any.whl.metadata (16 kB)
Downloading html5lib-1.1-py2.py3-none-any.whl (112 kB)
   ---------------------------------------- 0.0/112.2 kB ? eta -:--:--
   --- ------------------------------------ 10.2/112.2 kB ? eta -:--:--
   ----------------------------- ---------- 81.9/112.2 kB 1.5 MB/s eta 0:00:01
   ---------------------------------------- 112.2/112.2 kB 1.3 MB/s eta 0:00:00
Installing collected packages: html5lib
Successfully installed html5lib-1.1


## Import SpaCy in English


In [14]:
# run the next line only once if needed
!python -m spacy download en_core_web_lg
import spacy
nlp = spacy.load("en_core_web_lg")

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     -------------------------------------- 0.0/587.7 MB 991.0 kB/s eta 0:09:54
     ---------------------------------------- 0.3/587.7 MB 3.4 MB/s eta 0:02:53
     ---------------------------------------- 1.2/587.7 MB 9.4 MB/s eta 0:01:03
     --------------------------------------- 2.2/587.7 MB 12.9 MB/s eta 0:00:46
     --------------------------------------- 3.4/587.7 MB 15.4 MB/s eta 0:00:38
     --------------------------------------- 4.8/587.7 MB 17.1 MB/s eta 0:00:35
     --------------------------------------- 6.1/587.7 MB 18.7 MB/s eta 0:00:32
      -------------------------------------- 7.7/587.7 MB 20.4 MB/s eta 0:00:29
      -------------------------------------- 9.0/587.7 MB 21.4 MB/s eta 0:00:28
      ------------------------

## Let’s try it out on a small text

In [15]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
for token in doc:
    print(token, end=" | ")

My | best | friend | Ryan | Peters | likes | fancy | adventure | games | . | 

## attributes that SpaCy adds

In [16]:
import pandas as pd
def display_nlp(doc, include_punct=False):
    """Generate data frame for visualization of spaCy tokens."""
    rows = []
    for i, t in enumerate(doc):
        if not t.is_punct or include_punct:
            row = {'token': i, 'text': t.text, 'lemma_': t.lemma_,
                   'is_stop': t.is_stop, 'is_alpha': t.is_alpha,
                   'pos_': t.pos_, 'dep_': t.dep_,'ent_type_': t.ent_type_, 
                   'ent_iob_': t.ent_iob_}
            rows.append(row)
            df = pd.DataFrame(rows).set_index('token')
            df.index.name = None
            return df


In [17]:
        display_nlp(doc)

Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_
0,My,my,True,True,PRON,poss,,O


## Removing Stop words using Spacy

In [18]:
text = "Dear Ryan, we need to sit down and talk. Regards, Pete"
doc = nlp(text)
non_stop = [t for t in doc if not t.is_stop and not t.is_punct]
print(non_stop)

[Dear, Ryan, need, sit, talk, Regards, Pete]


## Find all nouns using Spacy

In [19]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
nouns = [t for t in doc if t.pos_ in ['NOUN', 'PROPN']]
print(nouns)

[friend, Ryan, Peters, adventure, games]


## Named Entity Recognition

In [20]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")

(Ryan Peters, PERSON) 

In [21]:
text = "James O'Neill, chairman of World Cargo Inc, lives in SanFrancisco."
doc = nlp(text)
for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")

(James O'Neill, PERSON) (World Cargo Inc, ORG) (SanFrancisco, ORG) 

## Visualize NERS

In [22]:
from spacy import displacy
displacy.render(doc, style='ent', jupyter=True)

# Using on a Dataset

In [28]:
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html.parser')  # Change 'html5lib' to 'html.parser'
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

ny_bb = url_to_string('https://www.reuters.com/world/europe/ukrainian-infrastructure-pounded-again-saturday-2022-10-22/')
article = nlp(ny_bb)
len(article.ents)

1

## looking at the NERS

In [29]:
displacy.render(article, style='ent', jupyter=True)

## Popular NER types

In [32]:
from collections import Counter
labels = [x.label_ for x in article.ents]
Counter(labels)

Counter({'GPE': 1})

In [33]:
items = [x.text for x in article.ents]
Counter(items).most_common(5)

[('JS', 1)]

In [36]:
sentences = [x for x in article.sents]
print(sentences[0])

reuters.comPlease enable JS and disable any ad blocker


## NER Tags

In [38]:
displacy.render(nlp(str(sentences[0])), jupyter=True, style='ent')

## Types of words in the sentence

In [39]:
[(x.orth_,x.pos_, x.lemma_) for x in [y
                                      for y
                                      in nlp(str(sentences[0]))
                                      if not y.is_stop and y.pos_ != 'PUNCT']]

[('reuters.comPlease', 'INTJ', 'reuters.complease'),
 ('enable', 'VERB', 'enable'),
 ('JS', 'PROPN', 'JS'),
 ('disable', 'VERB', 'disable'),
 ('ad', 'NOUN', 'ad'),
 ('blocker', 'NOUN', 'blocker')]

## Sentence dependency tree

In [41]:
displacy.render(nlp(str(sentences[0])), style='dep', jupyter = True,
                options = {'distance': 120})