In [1]:
import spacy

In [2]:
# Load the `en_core_web_sm` pipeline once; the cells below call this `nlp` object on each sample text.
nlp = spacy.load("en_core_web_sm")

In [5]:
# Show the names of the components in the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [10]:
# Coarse part-of-speech tag for every token, next to spaCy's plain-English explanation of it.
doc = nlp("I watched rain yesterday, I had tea and ice-cream.")
for token in doc:
    # `sep` supplies the "  |  " column divider between the three fields.
    print(token, token.pos_, spacy.explain(token.pos_), sep="  |  ")

I  |  PRON  |  pronoun
watched  |  VERB  |  verb
rain  |  NOUN  |  noun
yesterday  |  NOUN  |  noun
,  |  PUNCT  |  punctuation
I  |  PRON  |  pronoun
had  |  VERB  |  verb
tea  |  NOUN  |  noun
and  |  CCONJ  |  coordinating conjunction
ice  |  NOUN  |  noun
-  |  PUNCT  |  punctuation
cream  |  NOUN  |  noun
.  |  PUNCT  |  punctuation


In [9]:
# Compare the coarse tag (pos_) with the finer-grained tag (tag_) for each token.
doc = nlp("Wow! I won 50 million $ in a baking show.")
for token in doc:
    coarse, fine = token.pos_, token.tag_  # tag_ gives further categorization than pos_
    print(token, coarse, spacy.explain(coarse), fine, spacy.explain(fine), sep="  |  ")

Wow  |  INTJ  |  interjection  |  UH  |  interjection
!  |  PUNCT  |  punctuation  |  .  |  punctuation mark, sentence closer
I  |  PRON  |  pronoun  |  PRP  |  pronoun, personal
won  |  VERB  |  verb  |  VBD  |  verb, past tense
50  |  NUM  |  numeral  |  CD  |  cardinal number
million  |  NUM  |  numeral  |  CD  |  cardinal number
$  |  SYM  |  symbol  |  $  |  symbol, currency
in  |  ADP  |  adposition  |  IN  |  conjunction, subordinating or preposition
a  |  DET  |  determiner  |  DT  |  determiner
baking  |  NOUN  |  noun  |  NN  |  noun, singular or mass
show  |  NOUN  |  noun  |  NN  |  noun, singular or mass
.  |  PUNCT  |  punctuation  |  .  |  punctuation mark, sentence closer


In [13]:
# Fine-grained tag of the verb in a third-person sentence.
doc = nlp("He quits the job.")
# Bind the second token once instead of indexing the doc repeatedly; a bare `doc[1]`
# in the middle of a cell displays nothing, so it is not kept as a statement.
verb = doc[1]
print(verb.text, verb.tag_, spacy.explain(verb.tag_), sep="  |  ")

quits  |  VBZ  |  verb, 3rd person singular present


In [16]:
# Same check for "quit", whose surface form is identical in present and past tense;
# the recorded run of this cell tagged it VBD (past tense).
doc = nlp("I quit the job.")
# Bind the second token once instead of indexing the doc repeatedly; a bare `doc[1]`
# in the middle of a cell displays nothing, so it is not kept as a statement.
verb = doc[1]
print(verb.text, verb.tag_, spacy.explain(verb.tag_), sep="  |  ")

quit  |  VBD  |  verb, past tense


In [17]:
# Sample text for the cells below: an earnings announcement mixing dates, currency
# amounts, percentages and bullet characters.
earning_text = """
REDMOND, Wash. — July 25, 2023 — Microsoft Corp. today announced the following results for the quarter ended June 30, 2023, as compared to the corresponding period of last fiscal year:

·        Revenue was $56.2 billion and increased 8% (up 10% in constant currency)

·        Operating income was $24.3 billion and increased 18% (up 21% in constant currency)

·        Net income was $20.1 billion and increased 20% (up 23% in constant currency)

·        Diluted earnings per share was $2.69 and increased 21% (up 23% in constant currency),etc."""

In [20]:
# Run the pipeline over the earnings text; the filtering and counting cells below read this `doc`.
doc = nlp(earning_text)

In [24]:
# Drop tokens that carry no content: whitespace (SPACE), other/unclassified (X) and punctuation (PUNCT).
DROPPED_POS = ("SPACE", "X", "PUNCT")
filtered_tokens = [token for token in doc if token.pos_ not in DROPPED_POS]

In [25]:
# Preview only the first 20 kept tokens instead of displaying the whole list.
filtered_tokens[:20]

[REDMOND,
 Wash.,
 July,
 25,
 2023,
 Microsoft,
 Corp.,
 today,
 announced,
 the,
 following,
 results,
 for,
 the,
 quarter,
 ended,
 June,
 30,
 2023,
 as]

In [34]:
# spaCy provides a convenient API, Doc.count_by(), to tally tokens by an attribute; here, by POS.
# The resulting keys are resolved to readable labels through doc.vocab when the counts are printed.
count = doc.count_by(spacy.attrs.POS)

In [35]:
# Print each part-of-speech label alongside how many tokens received it.
for pos_id, n_tokens in count.items():
    # Keys of `count` are looked up in the vocab to recover their text label.
    print(doc.vocab[pos_id].text, n_tokens, sep="  |  ")

SPACE  |  9
PROPN  |  6
PUNCT  |  19
NUM  |  19
NOUN  |  22
VERB  |  10
DET  |  3
ADP  |  8
SCONJ  |  1
ADJ  |  8
AUX  |  4
SYM  |  4
CCONJ  |  4
ADV  |  4
