# Information Extraction and NER (Named Entity Recognition)

In [1]:
import spacy

spacy.__version__

'3.7.5'

> [Spacy models available](https://spacy.io/models/en)

In [2]:
import en_core_web_sm

nlp = en_core_web_sm.load() # Spacy model, creates a pipeline with a number components

# Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.

In [4]:
doc = nlp(
    'Nintendo Co Ltd 7974.T said on Thursday third-quarter operating profit rose 6%,driven by Switch console sales in the year-end shopping season, but the earnings fell below market expectations.Profit for the October-December quarter was 168.7 billion yen ($1.54 billion) versus 158.6 billion yen a year earlier.That compared with an average forecast of 175 billion yen from 10 analyst estimates compiled by Refinitiv.'
)

## NER

In [5]:
ner = [(X.text, X.label_) for X in doc.ents] # traversing all the entities

ner

[('Nintendo Co Ltd', 'ORG'),
 ('Thursday third-quarter', 'DATE'),
 ('6%,driven', 'CARDINAL'),
 ('Switch', 'ORG'),
 ('the year-end shopping season', 'DATE'),
 ('October-December quarter', 'DATE'),
 ('168.7 billion yen', 'MONEY'),
 ('$1.54 billion', 'MONEY'),
 ('158.6 billion yen', 'MONEY'),
 ('a year earlier', 'DATE'),
 ('175 billion yen', 'MONEY'),
 ('10', 'CARDINAL'),
 ('Refinitiv', 'ORG')]

> Extracting IOB of an Entity

> The IOB format (Inside-Outside-Beginning) is a standard format used for representing named entities in a sentence.

    B-: Beginning of an entity (e.g., B-PER for "Beginning of a Person entity").

    I-: Inside of an entity (e.g., I-PER for "Inside of a Person entity").

    O: Outside of an entity (not part of any named entity).

In [6]:
tag = [(X, X.ent_iob_, X.ent_type_) for X in doc]

tag

[(Nintendo, 'B', 'ORG'),
 (Co, 'I', 'ORG'),
 (Ltd, 'I', 'ORG'),
 (7974.T, 'O', ''),
 (said, 'O', ''),
 (on, 'O', ''),
 (Thursday, 'B', 'DATE'),
 (third, 'I', 'DATE'),
 (-, 'I', 'DATE'),
 (quarter, 'I', 'DATE'),
 (operating, 'O', ''),
 (profit, 'O', ''),
 (rose, 'O', ''),
 (6%,driven, 'B', 'CARDINAL'),
 (by, 'O', ''),
 (Switch, 'B', 'ORG'),
 (console, 'O', ''),
 (sales, 'O', ''),
 (in, 'O', ''),
 (the, 'B', 'DATE'),
 (year, 'I', 'DATE'),
 (-, 'I', 'DATE'),
 (end, 'I', 'DATE'),
 (shopping, 'I', 'DATE'),
 (season, 'I', 'DATE'),
 (,, 'O', ''),
 (but, 'O', ''),
 (the, 'O', ''),
 (earnings, 'O', ''),
 (fell, 'O', ''),
 (below, 'O', ''),
 (market, 'O', ''),
 (expectations, 'O', ''),
 (., 'O', ''),
 (Profit, 'O', ''),
 (for, 'O', ''),
 (the, 'O', ''),
 (October, 'B', 'DATE'),
 (-, 'I', 'DATE'),
 (December, 'I', 'DATE'),
 (quarter, 'I', 'DATE'),
 (was, 'O', ''),
 (168.7, 'B', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (yen, 'I', 'MONEY'),
 ((, 'O', ''),
 ($, 'B', 'MONEY'),
 (1.54, 'I', 'MONEY'),
 (bi

In [7]:
# Some more examples

doc1 = nlp('Labour is a centre-left political party in the United Kingdom that has been described as an alliance of social democrats, democratic socialists and trade unionists. In all general elections since 1922, Labour has been either the governing party or the Official Opposition.')

ner = [(X.text, X.label_) for X in doc1.ents]

ner

[('Labour', 'ORG'),
 ('the United Kingdom', 'GPE'),
 ('democrats', 'NORP'),
 ('democratic', 'NORP'),
 ('socialists', 'NORP'),
 ('1922', 'DATE'),
 ('Labour', 'ORG'),
 ('the Official Opposition', 'ORG')]

### Fetching text from Internet (say a news article)

In [8]:
import requests

def url_to_string(url):
    try:
        # Send a GET request to the URL
        response = requests.get(url)

        # Raise an error if the request was unsuccessful
        response.raise_for_status()

        # Return the raw HTML content as a string
        return response.text

    except requests.exceptions.RequestException as e:
        # Handle any exceptions (like connection errors)
        print(f"Error fetching the URL: {e}")
        return None

In [9]:
text = url_to_string('https://www.bbc.com/news/articles/cx24gze60yzo')

text



In [11]:
from bs4 import BeautifulSoup
import requests
import re

def html_text(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')

    for script in soup(["script", "style", 'aside']):
        script.extract()

    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

html_text('https://www.bbc.com/news/articles/cx24gze60yzo')

'Here\'s who is in Trump cabinet and other top staff positionsSkip to contentBritish Broadcasting CorporationWatch LiveHomeNewsSportBusinessInnovationCultureArtsTravelEarthAudioVideoLiveHomeNewsIsrael-Gaza WarWar in UkraineUS & CanadaUKUK PoliticsEnglandN. IrelandN. Ireland PoliticsScotlandScotland PoliticsWalesWales PoliticsAfricaAsiaChinaIndiaAustraliaEuropeLatin AmericaMiddle EastIn PicturesBBC InDepthBBC VerifySportBusinessExecutive LoungeTechnology of BusinessFuture of BusinessInnovationTechnologyScience & HealthArtificial IntelligenceAI v the MindCultureFilm & TVMusicArt & DesignStyleBooksEntertainment NewsArtsArts in MotionTravelDestinationsAfricaAntarcticaAsiaAustralia and PacificCaribbean & BermudaCentral AmericaEuropeMiddle EastNorth AmericaSouth AmericaWorld’s TableCulture & ExperiencesAdventuresThe SpeciaListEarthNatural WondersWeather & ScienceClimate SolutionsSustainable BusinessGreen LivingAudioPodcastsRadioAudio FAQsVideoLiveLive NewsLive SportHomeNewsSportBusinessInnov

In [12]:
text = html_text('https://www.bbc.com/news/articles/cx24gze60yzo')

text

'Here\'s who is in Trump cabinet and other top staff positionsSkip to contentBritish Broadcasting CorporationWatch LiveHomeNewsSportBusinessInnovationCultureArtsTravelEarthAudioVideoLiveHomeNewsIsrael-Gaza WarWar in UkraineUS & CanadaUKUK PoliticsEnglandN. IrelandN. Ireland PoliticsScotlandScotland PoliticsWalesWales PoliticsAfricaAsiaChinaIndiaAustraliaEuropeLatin AmericaMiddle EastIn PicturesBBC InDepthBBC VerifySportBusinessExecutive LoungeTechnology of BusinessFuture of BusinessInnovationTechnologyScience & HealthArtificial IntelligenceAI v the MindCultureFilm & TVMusicArt & DesignStyleBooksEntertainment NewsArtsArts in MotionTravelDestinationsAfricaAntarcticaAsiaAustralia and PacificCaribbean & BermudaCentral AmericaEuropeMiddle EastNorth AmericaSouth AmericaWorld’s TableCulture & ExperiencesAdventuresThe SpeciaListEarthNatural WondersWeather & ScienceClimate SolutionsSustainable BusinessGreen LivingAudioPodcastsRadioAudio FAQsVideoLiveLive NewsLive SportHomeNewsSportBusinessInnov

## Using spacy pipeline for processing

In [13]:
article = nlp(text)

In [14]:
len(article.ents)

240

In [17]:
labels = [x.label_ for x in article.ents]

set(labels)

{'CARDINAL',
 'DATE',
 'GPE',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERSON',
 'TIME'}

Let's count the labels in the article

In [18]:
from collections import Counter

Counter(labels)

Counter({'ORG': 106,
         'GPE': 31,
         'PERSON': 56,
         'DATE': 22,
         'TIME': 2,
         'ORDINAL': 5,
         'NORP': 7,
         'LOC': 3,
         'MONEY': 5,
         'CARDINAL': 3})

Most common words

In [20]:
items = [x.text for x in article.ents]

Counter(items).most_common(3)

[('Trump', 27), ('US', 9), ('first', 5)]

In [24]:
sentences = [x for x in article.sents]

print(sentences[0], sentences[20], sep="\n\n")

Here's who is in Trump cabinet and other top staff positionsSkip to contentBritish Broadcasting CorporationWatch LiveHomeNewsSportBusinessInnovationCultureArtsTravelEarthAudioVideoLiveHomeNewsIsrael-Gaza WarWar in UkraineUS & CanadaUKUK PoliticsEnglandN. IrelandN. Ireland PoliticsScotlandScotland PoliticsWalesWales PoliticsAfricaAsiaChinaIndiaAustraliaEuropeLatin AmericaMiddle EastIn PicturesBBC InDepthBBC VerifySportBusinessExecutive

"Pam was a prosecutor for nearly 20 years, tough on violent criminals, and made Florida's streets safe," he wrote.


Let's visualize

In [27]:
from spacy import displacy # Visualize spaCy’s guess at the syntactic structure of a sentence. Arrows point from children to heads, and are labelled by their relation type.

displacy.render(doc, style="ent", jupyter=True)

In [28]:
displacy.render(nlp(str(sentences[20])), jupyter=True, style='ent')

In [29]:
displacy.render(doc1, jupyter=True, style='ent')

In [33]:
displacy.render(doc1, style='dep', jupyter = True, options = {'distance': 120})