In [1]:
import spacy

In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.1/12.8 MB 1.7 MB/s eta 0:00:08
     ---------------------------------------- 0.1/12.8 MB 1.2 MB/s eta 0:00:11
      --------------------------------------- 0.2/12.8 MB 1.2 MB/s eta 0:00:11
      --------------------------------------- 0.2/12.8 MB 1.2 MB/s eta 0:00:11
      --------------------------------------- 0.3/12.8 MB 1.1 MB/s eta 0:00:11
     - -------------------------------------- 0.4/12.8 MB 1.2 MB/s eta 0:00:11
     - -------------------------------------- 0.4/12.8 MB 1.2 MB/s eta 0:00:11
     - -------------------------------------- 0.5/12.8 MB 1.2 MB/s eta 0:00:11
     - -------------------------------------- 0.5/12.8 MB 1.2 MB/s eta 0:00:11
     - ----------------------------------

In [3]:
nlp = spacy.load('en_core_web_sm')

In [4]:
sent = nlp('''Mark Zuckerberg will meet Aditya Badave 
    on Monday 6th June 2024 10 am for $101 Trillion deal 
    at mumbai tomorrow''')

In [5]:
# Entity Retrieval
sent.ents

(Mark Zuckerberg,
 Aditya Badave,
 Monday 6th June 2024,
 10 am,
 $101 Trillion,
 mumbai,
 tomorrow)

In [6]:
for ent in sent.ents:
    print(ent.text,'--->', ent.label_)

Mark Zuckerberg ---> PERSON
Aditya Badave ---> WORK_OF_ART
Monday 6th June 2024 ---> DATE
10 am ---> TIME
$101 Trillion ---> MONEY
mumbai ---> GPE
tomorrow ---> DATE


In [7]:
raw_text = '''
Alaska is the largest U.S. state by area, comprising more total area than the next three largest states of Texas, California and Montana combined, and is the seventh-largest subnational division in the world. It is the third-least populous and most sparsely populated U.S. state, but is, with a population of 736,081 as of 2020, the continent's most populous territory located mostly north of the 60th parallel, with more than quadruple the combined populations of Northern Canada and Greenland.[6] The state contains the second-largest and largest cities in the United States by area: the state capital of Juneau, and its former capital, Sitka, respectively. The state's most populous city is Anchorage and approximately half of Alaska's residents live within its metropolitan area.
'''

In [8]:
raw_text

"\nAlaska is the largest U.S. state by area, comprising more total area than the next three largest states of Texas, California and Montana combined, and is the seventh-largest subnational division in the world. It is the third-least populous and most sparsely populated U.S. state, but is, with a population of 736,081 as of 2020, the continent's most populous territory located mostly north of the 60th parallel, with more than quadruple the combined populations of Northern Canada and Greenland.[6] The state contains the second-largest and largest cities in the United States by area: the state capital of Juneau, and its former capital, Sitka, respectively. The state's most populous city is Anchorage and approximately half of Alaska's residents live within its metropolitan area.\n"

In [9]:
sent1 = nlp(raw_text)

In [10]:
for ent in sent1.ents:
    print(ent.text,'--->', ent.label_)

Alaska ---> GPE
U.S. ---> GPE
three ---> CARDINAL
Texas ---> GPE
California ---> GPE
Montana ---> GPE
seventh ---> ORDINAL
third ---> ORDINAL
U.S. ---> GPE
736,081 ---> CARDINAL
2020 ---> DATE
60th ---> ORDINAL
Northern Canada ---> ORG
second ---> ORDINAL
the United States ---> GPE
Juneau ---> GPE
Sitka ---> PERSON
Anchorage ---> GPE
approximately half ---> CARDINAL
Alaska ---> GPE


In [11]:
spacy.explain('GPE')

'Countries, cities, states'

In [12]:
spacy.explain('CARDINAL')

'Numerals that do not fall under another type'

In [13]:
spacy.explain('ORDINAL')

'"first", "second", etc.'

In [14]:
spacy.explain('ORG')

'Companies, agencies, institutions, etc.'

In [15]:
from spacy import displacy
displacy.render(sent,style='ent',jupyter=True)

In [16]:
from spacy import displacy
displacy.render(sent1,style='ent',jupyter=True)

In [17]:
icc_text = '''
The Chairman heads the board of directors and on June 26, 2014, Narayanaswami Srinivasan, the former president of BCCI, was announced as the first chairman of the council.[6] The role of ICC president became a largely honorary position after the establishment of the chairman role and other changes made to the ICC constitution in 2014. It has been claimed that the 2014 changes have handed control to the 'Big Three' nations of England, India and Australia.[7] The last ICC president was Zaheer Abbas,[8] who was appointed in June 2015 following the resignation of Mustafa Kamal in April 2015. When the post of ICC president was abolished in April 2016, Shashank Manohar, who replaced Srinivasan in October 2015, became the first independent elected chairman of the ICC.[9]
'''

In [18]:
icc_text

"\nThe Chairman heads the board of directors and on June 26, 2014, Narayanaswami Srinivasan, the former president of BCCI, was announced as the first chairman of the council.[6] The role of ICC president became a largely honorary position after the establishment of the chairman role and other changes made to the ICC constitution in 2014. It has been claimed that the 2014 changes have handed control to the 'Big Three' nations of England, India and Australia.[7] The last ICC president was Zaheer Abbas,[8] who was appointed in June 2015 following the resignation of Mustafa Kamal in April 2015. When the post of ICC president was abolished in April 2016, Shashank Manohar, who replaced Srinivasan in October 2015, became the first independent elected chairman of the ICC.[9]\n"

In [19]:
sent2 = nlp(icc_text)

for ent in sent2.ents:
    print(ent.text,'--->', ent.label_)

June 26, 2014 ---> DATE
Narayanaswami Srinivasan ---> PERSON
BCCI ---> ORG
first ---> ORDINAL
ICC ---> ORG
ICC ---> ORG
2014 ---> DATE
2014 ---> DATE
England ---> GPE
India ---> GPE
ICC ---> ORG
Zaheer Abbas,[8 ---> PERSON
June 2015 ---> DATE
Mustafa Kamal ---> PERSON
April 2015 ---> DATE
ICC ---> ORG
April 2016 ---> DATE
Shashank Manohar ---> PERSON
Srinivasan ---> ORG
October 2015 ---> DATE
first ---> ORDINAL
ICC.[9 ---> ORG


In [20]:
from spacy import displacy
displacy.render(sent2,style='ent',jupyter=True)

In [21]:
for ent in sent2.ents:
    if(ent.label_=='PERSON'):
        print(ent.text)

Narayanaswami Srinivasan
Zaheer Abbas,[8
Mustafa Kamal
Shashank Manohar
