# POS-tagging and NER: examples

Here we walk through simple examples of part-of-speech (POS) tagging and named entity recognition (NER):

*   First we will perform POS tagging with NLTK, on English text only
*   Then we will perform the same task with the spaCy library, for both English and Italian
*   Finally we will see how to implement the Named Entity Recognition task with spaCy

# **NLTK**

## Part of Speech

In [None]:
import nltk

In [None]:
# Fetch the NLTK resources used by the following cells: the perceptron
# POS-tagger model, the Punkt tokenizer data, and the tag-set documentation.
nltk.download("averaged_perceptron_tagger")
nltk.download('punkt')
nltk.download('tagsets')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [None]:
# Sample English sentence used by the NLTK tokenization and tagging cells.
text = "Jazz is a music genre that originated in the African-American communities of New Orleans, Louisiana, in the late 19th and early 20th centuries, with its roots in blues and ragtime."

In [None]:
# Split the sentence into word and punctuation tokens, then display the list.
tokens = nltk.word_tokenize(text)
tokens

['Jazz',
 'is',
 'a',
 'music',
 'genre',
 'that',
 'originated',
 'in',
 'the',
 'African-American',
 'communities',
 'of',
 'New',
 'Orleans',
 ',',
 'Louisiana',
 ',',
 'in',
 'the',
 'late',
 '19th',
 'and',
 'early',
 '20th',
 'centuries',
 ',',
 'with',
 'its',
 'roots',
 'in',
 'blues',
 'and',
 'ragtime',
 '.']

In [None]:
# Tag every token with its part of speech; the result is a list of
# (token, tag) pairs, displayed as the cell output.
tags = nltk.pos_tag(tokens)
tags

[('Jazz', 'NNP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('music', 'NN'),
 ('genre', 'NN'),
 ('that', 'WDT'),
 ('originated', 'VBD'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('African-American', 'JJ'),
 ('communities', 'NNS'),
 ('of', 'IN'),
 ('New', 'NNP'),
 ('Orleans', 'NNP'),
 (',', ','),
 ('Louisiana', 'NNP'),
 (',', ','),
 ('in', 'IN'),
 ('the', 'DT'),
 ('late', 'JJ'),
 ('19th', 'CD'),
 ('and', 'CC'),
 ('early', 'RB'),
 ('20th', 'JJ'),
 ('centuries', 'NNS'),
 (',', ','),
 ('with', 'IN'),
 ('its', 'PRP$'),
 ('roots', 'NNS'),
 ('in', 'IN'),
 ('blues', 'NNS'),
 ('and', 'CC'),
 ('ragtime', 'NN'),
 ('.', '.')]

In [None]:
# Print the reference documentation of the tag set: each tag with its meaning
# and a few example words.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [None]:
# Render each (token, tag) pair as "token(tag)" and join all of them into one
# comma-separated string for a compact, single-line view of the tagging.
token_tags = ", ".join(f"{token}({tag})" for token, tag in tags)
token_tags

'Jazz(NNP), is(VBZ), a(DT), music(NN), genre(NN), that(WDT), originated(VBD), in(IN), the(DT), African-American(JJ), communities(NNS), of(IN), New(NNP), Orleans(NNP), ,(,), Louisiana(NNP), ,(,), in(IN), the(DT), late(JJ), 19th(CD), and(CC), early(RB), 20th(JJ), centuries(NNS), ,(,), with(IN), its(PRP$), roots(NNS), in(IN), blues(NNS), and(CC), ragtime(NN), .(.)'

# **spaCy**

## **English**

### **Part of Speech**

In [None]:
import spacy
from spacy.tokens import Span
from spacy import displacy

In [None]:
# Load the spaCy pipeline for English.
# NOTE: no download step for "en_core_web_sm" appears in this notebook, so the
# model is assumed to be installed already.
nlp_eng = spacy.load("en_core_web_sm")

In [None]:
# Same English sentence as in the NLTK section.
text = "Jazz is a music genre that originated in the African-American communities of New Orleans, Louisiana, in the late 19th and early 20th centuries, with its roots in blues and ragtime."

In [None]:
# Run the English pipeline on the sentence; the returned document holds the
# tokens together with their annotations.
doc = nlp_eng(text)

In [None]:
# Inspect the first token: its `tag_` and `pos_` attributes.
doc[0].tag_, doc[0].pos_

('NN', 'NOUN')

In [None]:
# Ask spaCy for a human-readable description of the first token's tag.
spacy.explain(doc[0].tag_)

'noun, singular or mass'

In [None]:
# Tab-separated table with one row per token: its text, its `tag_`, its `pos_`
# and spaCy's description of the tag.
print("TOKEN\t\tTAG\t\tPOS\t\t\tDESCRIPTION\n")
for token in doc:
    description = spacy.explain(token.tag_)
    print(f"{token.text} \t\t {token.tag_} \t\t {token.pos_} \t\t\t {description}")

TOKEN		TAG		POS			DESCRIPTION

Jazz 		 NN 		 NOUN 			 noun, singular or mass
is 		 VBZ 		 AUX 			 verb, 3rd person singular present
a 		 DT 		 DET 			 determiner
music 		 NN 		 NOUN 			 noun, singular or mass
genre 		 NN 		 NOUN 			 noun, singular or mass
that 		 WDT 		 PRON 			 wh-determiner
originated 		 VBD 		 VERB 			 verb, past tense
in 		 IN 		 ADP 			 conjunction, subordinating or preposition
the 		 DT 		 DET 			 determiner
African 		 JJ 		 ADJ 			 adjective (English), other noun-modifier (Chinese)
- 		 HYPH 		 PUNCT 			 punctuation mark, hyphen
American 		 JJ 		 ADJ 			 adjective (English), other noun-modifier (Chinese)
communities 		 NNS 		 NOUN 			 noun, plural
of 		 IN 		 ADP 			 conjunction, subordinating or preposition
New 		 NNP 		 PROPN 			 noun, proper singular
Orleans 		 NNP 		 PROPN 			 noun, proper singular
, 		 , 		 PUNCT 			 punctuation mark, comma
Louisiana 		 NNP 		 PROPN 			 noun, proper singular
, 		 , 		 PUNCT 			 punctuation mark, comma
in 		 IN 		 ADP 			 co

### **Named Entity Recognition**

In [None]:
# Entities detected in the English document.
doc.ents

(African-American, New Orleans, Louisiana, the late 19th, early 20th centuries)

In [None]:
# Label of the first entity together with spaCy's description of that label.
doc.ents[0].label_, spacy.explain(doc.ents[0].label_)

('NORP', 'Nationalities or religious or political groups')

In [None]:
# Tab-separated table with one row per entity: its text, its label and
# spaCy's description of the label.
print("ENTITIES\t\tLABEL\t\t\t\tDESCRIPTION\n")
for ent in doc.ents:
    meaning = spacy.explain(ent.label_)
    print(f"{ent.text} \t\t {ent.label_} \t\t\t {meaning}")

ENTITIES		LABEL				DESCRIPTION

African-American 		 NORP 			 Nationalities or religious or political groups
New Orleans 		 GPE 			 Countries, cities, states
Louisiana 		 GPE 			 Countries, cities, states
the late 19th 		 DATE 			 Absolute or relative dates or periods
early 20th centuries 		 DATE 			 Absolute or relative dates or periods


In [None]:
# Annotate every entity in place as "text(LABEL)".
#
# The annotated string is assembled from the entities' character offsets
# instead of str.replace(): replace() rewrites *every* occurrence of the entity
# text, so a repeated mention, or an entity whose text also appears inside an
# already annotated span, would be labelled more than once.
source = doc.text
chunks = []
cursor = 0  # end offset of the last entity copied so far
for ent in doc.ents:
    chunks.append(source[cursor:ent.start_char])  # plain text before the entity
    chunks.append(f"{ent.text}({ent.label_})")
    cursor = ent.end_char
chunks.append(source[cursor:])  # plain text after the last entity
text_ents = "".join(chunks)
text_ents

'Jazz is a music genre that originated in the African-American(NORP) communities of New Orleans(GPE), Louisiana(GPE), in the late 19th(DATE) and early 20th centuries(DATE), with its roots in blues and ragtime.'

## **Italian**

### **Part of Speech**

In [None]:
# Download the Italian spaCy model through the shell (IPython "!" escape).
# The command's own output warns that the runtime may need a restart before
# the package can be loaded.
!python -m spacy download it_core_news_sm

Collecting it-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-3.7.0/it_core_news_sm-3.7.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Load the Italian pipeline downloaded in the previous cell.
nlp = spacy.load("it_core_news_sm")

In [None]:
# Sample Italian text (two sentences about Sandro Pertini).
text = "Alessandro Giuseppe Antonio Pertini, detto Sandro (Stella, 25 settembre 1896 – Roma, 24 febbraio 1990), è stato un politico, giornalista e partigiano italiano. Fu il settimo presidente della Repubblica Italiana dal 1978 al 1985, primo socialista e unico esponente del PSI a ricoprire la carica."

In [None]:
# Run the Italian pipeline on the text. This rebinds `doc`, replacing the
# English document created earlier.
doc = nlp(text)

In [None]:
# Tab-separated table with one row per token: its text, its `tag_`, its `pos_`
# and spaCy's description. Here the description is looked up from `pos_`,
# not from `tag_`.
print("TOKEN\t\tTAG\t\tPOS\t\t\tDESCRIPTION\n")
for token in doc:
    description = spacy.explain(token.pos_)
    print(f"{token.text} \t\t {token.tag_} \t\t {token.pos_} \t\t\t {description}")

TOKEN		TAG		POS			DESCRIPTION

Alessandro 		 SP 		 PROPN 			 proper noun
Giuseppe 		 SP 		 PROPN 			 proper noun
Antonio 		 SP 		 PROPN 			 proper noun
Pertini 		 SP 		 PROPN 			 proper noun
, 		 FF 		 PUNCT 			 punctuation
detto 		 V 		 VERB 			 verb
Sandro 		 SP 		 PROPN 			 proper noun
( 		 FB 		 PUNCT 			 punctuation
Stella 		 SP 		 PROPN 			 proper noun
, 		 FF 		 PUNCT 			 punctuation
25 		 N 		 NUM 			 numeral
settembre 		 S 		 NOUN 			 noun
1896 		 N 		 NUM 			 numeral
– 		 FF 		 PUNCT 			 punctuation
Roma 		 SP 		 PROPN 			 proper noun
, 		 FF 		 PUNCT 			 punctuation
24 		 N 		 NUM 			 numeral
febbraio 		 S 		 NOUN 			 noun
1990 		 N 		 NUM 			 numeral
) 		 FB 		 PUNCT 			 punctuation
, 		 FF 		 PUNCT 			 punctuation
è 		 VA 		 AUX 			 auxiliary
stato 		 V 		 AUX 			 auxiliary
un 		 RI 		 DET 			 determiner
politico 		 S 		 NOUN 			 noun
, 		 FF 		 PUNCT 			 punctuation
giornalista 		 S 		 NOUN 			 noun
e 		 CC 		 CCONJ 			 coordinating conjunction
partigiano 		 V 		 VERB 			

### **Named Entity Recognition**

In [None]:
# Entities detected by the Italian pipeline.
doc.ents

(Alessandro Giuseppe Antonio Pertini,
 Sandro,
 Stella,
 Roma,
 Repubblica Italiana,
 PSI)

In [None]:
# Tab-separated table with one row per entity: its text, its label and
# spaCy's description of the label.
print("ENTITIES\t\tLABEL\t\t\t\tDESCRIPTION\n")
for ent in doc.ents:
    meaning = spacy.explain(ent.label_)
    print(f"{ent.text} \t\t {ent.label_} \t\t\t {meaning}")

ENTITIES		LABEL				DESCRIPTION

Alessandro Giuseppe Antonio Pertini 		 PER 			 Named person or family.
Sandro 		 LOC 			 Non-GPE locations, mountain ranges, bodies of water
Stella 		 LOC 			 Non-GPE locations, mountain ranges, bodies of water
Roma 		 LOC 			 Non-GPE locations, mountain ranges, bodies of water
Repubblica Italiana 		 LOC 			 Non-GPE locations, mountain ranges, bodies of water
PSI 		 ORG 			 Companies, agencies, institutions, etc.


In [None]:
# Annotate every entity in place as "text(LABEL)".
#
# The annotated string is assembled from the entities' character offsets
# instead of str.replace(): replace() rewrites *every* occurrence of the entity
# text, so a repeated mention, or an entity whose text also appears inside an
# already annotated span, would be labelled more than once.
source = doc.text
chunks = []
cursor = 0  # end offset of the last entity copied so far
for ent in doc.ents:
    chunks.append(source[cursor:ent.start_char])  # plain text before the entity
    chunks.append(f"{ent.text}({ent.label_})")
    cursor = ent.end_char
chunks.append(source[cursor:])  # plain text after the last entity
text_ents = "".join(chunks)
text_ents

'Alessandro Giuseppe Antonio Pertini(PER), detto Sandro(LOC) (Stella(LOC), 25 settembre 1896 – Roma(LOC), 24 febbraio 1990), è stato un politico, giornalista e partigiano italiano. Fu il settimo presidente della Repubblica Italiana(LOC) dal 1978 al 1985, primo socialista e unico esponente del PSI(ORG) a ricoprire la carica.'

In [None]:
# List every token next to its position in the document; these positions are
# the token offsets needed to build Span objects by hand.
for i, token in enumerate(doc):
    print(token, i, sep=" \t ")

Alessandro 	 0
Giuseppe 	 1
Antonio 	 2
Pertini 	 3
, 	 4
detto 	 5
Sandro 	 6
( 	 7
Stella 	 8
, 	 9
25 	 10
settembre 	 11
1896 	 12
– 	 13
Roma 	 14
, 	 15
24 	 16
febbraio 	 17
1990 	 18
) 	 19
, 	 20
è 	 21
stato 	 22
un 	 23
politico 	 24
, 	 25
giornalista 	 26
e 	 27
partigiano 	 28
italiano 	 29
. 	 30
Fu 	 31
il 	 32
settimo 	 33
presidente 	 34
della 	 35
Repubblica 	 36
Italiana 	 37
dal 	 38
1978 	 39
al 	 40
1985 	 41
, 	 42
primo 	 43
socialista 	 44
e 	 45
unico 	 46
esponente 	 47
del 	 48
PSI 	 49
a 	 50
ricoprire 	 51
la 	 52
carica 	 53
. 	 54


In [None]:
# Manually correct the entities predicted by the Italian pipeline.
#
# Each Span is addressed by token offsets (start inclusive, end exclusive)
# taken from the token/index listing of this document. Labels are passed as
# plain strings so that Span resolves them itself.
date_1 = Span(doc, 10, 13, label="DATE")  # "25 settembre 1896"
date_2 = Span(doc, 16, 19, label="DATE")  # "24 febbraio 1990"
date_3 = Span(doc, 39, 42, label="DATE")  # "1978 al 1985"
person = Span(doc, 6, 7, label="PER")     # "Sandro", predicted as LOC
gpe = Span(doc, 36, 38, label="GPE")      # "Repubblica Italiana", predicted as LOC
corrections = [date_1, date_2, date_3, person, gpe]

# Keep only the predicted entities that do not overlap a correction.
# Filtering by overlap, rather than deleting by list position, does not depend
# on the order or number of predictions, and lets the cell be re-run safely:
# spans added by a previous run are dropped before being added again.
ents = [
    ent
    for ent in doc.ents
    if not any(ent.start < fix.end and fix.start < ent.end for fix in corrections)
]

doc.ents = ents + corrections
doc.ents

(Alessandro Giuseppe Antonio Pertini,
 Sandro,
 Stella,
 25 settembre 1896,
 Roma,
 24 febbraio 1990,
 Repubblica Italiana,
 1978 al 1985,
 PSI)

In [None]:
# Tab-separated table with one row per entity: its text, its label and
# spaCy's description of the label.
print("ENTITIES\t\tLABEL\t\t\t\tDESCRIPTION\n")
for ent in doc.ents:
    meaning = spacy.explain(ent.label_)
    print(f"{ent.text} \t\t {ent.label_} \t\t\t {meaning}")

ENTITIES		LABEL				DESCRIPTION

Alessandro Giuseppe Antonio Pertini 		 PER 			 Named person or family.
Sandro 		 PER 			 Named person or family.
Stella 		 LOC 			 Non-GPE locations, mountain ranges, bodies of water
25 settembre 1896 		 DATE 			 Absolute or relative dates or periods
Roma 		 LOC 			 Non-GPE locations, mountain ranges, bodies of water
24 febbraio 1990 		 DATE 			 Absolute or relative dates or periods
Repubblica Italiana 		 GPE 			 Countries, cities, states
1978 al 1985 		 DATE 			 Absolute or relative dates or periods
PSI 		 ORG 			 Companies, agencies, institutions, etc.


In [None]:
# Visualize the document's entities with displaCy's "ent" style.
displacy.render(doc, style="ent")