# Named Entity Recognition

## Installation Of Spacy Package

In [2]:
!pip install SpaCy



## Import SpaCy in English

In [2]:
!python -m spacy download en_core_web_lg
import spacy
nlp = spacy.load("en_core_web_lg")

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     -------------------------------------- 0.0/587.7 MB 187.9 kB/s eta 0:52:08
     -------------------------------------- 0.0/587.7 MB 187.9 kB/s eta 0:52:08
     -------------------------------------- 0.1/587.7 MB 252.2 kB/s eta 0:38:50
     -------------------------------------- 0.1/587.7 MB 280.5 kB/s eta 0:34:55
     -------------------------------------- 0.1/587.7 MB 280.5 kB/s eta 0:34:55
     -------------------------------------- 0.2/587.7 MB 4

## Let’s Try It Out on a Small Text

In [4]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
for token in doc:
    print(token, end= '|')

My|best|friend|Ryan|Peters|likes|fancy|adventure|games|.|

## What Are The Attributes that SpaCyadds?

In [6]:
import pandas as pd
def display_nlp(doc, include_punct=False):
    """Generate data frame for visualization of spaCy tokens."""
    rows = []
    for i, t in enumerate(doc):
        if not t.is_punct or include_punct:
            row = {'token': i, 'text': t.text, 'lemma_': t.lemma_,'is_stop': t.is_stop, 'is_alpha': t.is_alpha,'pos_': t.pos_, 'dep_': t.dep_,'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_}
            rows.append(row)
            df = pd.DataFrame(rows).set_index('token')
            df.index.name = None
            return df

## Displaying Output

In [7]:
display_nlp(doc)

Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_
0,My,my,True,True,PRON,poss,,O


## Removing Stop Words Using Spacy

In [8]:
text = "Dear Ryan, we need to sit down and talk. Regards, Pete"
doc = nlp(text)
non_stop = [t for t in doc if not t.is_stop and not t.is_punct]
print(non_stop)

[Dear, Ryan, need, sit, talk, Regards, Pete]


## Finding All Nouns Using Spacy

In [9]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
nouns = [t for t in doc if t.pos_ in ['NOUN', 'PROPN']]
print(nouns)

[friend, Ryan, Peters, adventure, games]


## Named Entity Recognition

In [10]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")

(Ryan Peters, PERSON) 

## Trying Out on Harder One

In [11]:
text = "James O'Neill, chairman of World Cargo Inc, lives in SanFrancisco."
doc = nlp(text)
for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")

(James O'Neill, PERSON) (World Cargo Inc, ORG) (SanFrancisco, ORG) 

## Visualize NERS

In [12]:
from spacy import displacy
displacy.render(doc, style='ent', jupyter=True)

In [16]:
!pip install html5lib

Collecting html5lib
  Obtaining dependency information for html5lib from https://files.pythonhosted.org/packages/6c/dd/a834df6482147d48e225a49515aabc28974ad5a4ca3215c18a882565b028/html5lib-1.1-py2.py3-none-any.whl.metadata
  Downloading html5lib-1.1-py2.py3-none-any.whl.metadata (16 kB)
Downloading html5lib-1.1-py2.py3-none-any.whl (112 kB)
   ---------------------------------------- 0.0/112.2 kB ? eta -:--:--
   ---------------------------------------- 0.0/112.2 kB ? eta -:--:--
   --- ------------------------------------ 10.2/112.2 kB ? eta -:--:--
   --- ------------------------------------ 10.2/112.2 kB ? eta -:--:--
   -------------- ------------------------ 41.0/112.2 kB 326.8 kB/s eta 0:00:01
   ---------------------------------- --- 102.4/112.2 kB 587.0 kB/s eta 0:00:01
   -------------------------------------- 112.2/112.2 kB 592.6 kB/s eta 0:00:00
Installing collected packages: html5lib
Successfully installed html5lib-1.1


## Let’s Try on Real Dataset

In [25]:
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html.parser')  # Change 'html5lib' to 'html.parser'
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

ny_bb = url_to_string('https://www.reuters.com/world/europe/ukrainian-infrastructure-pounded-again-saturday-2022-10-22/')
article = nlp(ny_bb)
len(article.ents)

1

## Have a Look At The NERS

In [26]:
displacy.render(article, style='ent', jupyter=True)

## Popular NER Types

In [27]:
from collections import Counter
labels = [x.label_ for x in article.ents]
Counter(labels)

Counter({'ORG': 1})

## Most Popular NER

In [28]:
items = [x.text for x in article.ents]
Counter(items).most_common(5)

[('JS', 1)]

## Let’s Pick One Sentence to Analyze

In [30]:
sentences = [x for x in article.sents]
print(sentences[0])

reuters.comPlease enable JS and disable any ad blocker


## NER Tags

In [31]:
displacy.render(nlp(str(sentences[0])), jupyter=True, style='ent')

## Types of words in the sentence

In [32]:
[(x.orth_,x.pos_, x.lemma_) for x in [y
                                      for y
                                      in nlp(str(sentences[0]))
                                      if not y.is_stop and y.pos_ != 'PUNCT']]

[('reuters.comPlease', 'NOUN', 'reuters.complease'),
 ('enable', 'VERB', 'enable'),
 ('JS', 'NOUN', 'js'),
 ('disable', 'VERB', 'disable'),
 ('ad', 'NOUN', 'ad'),
 ('blocker', 'NOUN', 'blocker')]

## Sentence Dependency Tree

In [33]:
displacy.render(nlp(str(sentences[0])), style='dep', jupyter = True,
                options = {'distance': 120})