In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m73.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy
from spacy import displacy
from collections import Counter

In [None]:
# Load the installed English pipeline once; later cells reuse `nlp`.
MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(MODEL_NAME)

In [None]:
# 1. Create Doc object
# Read the whole story from disk as UTF-8 text.
with open("peterrabbit.txt", mode="r", encoding="utf-8") as story_file:
    text = story_file.read()

In [None]:
# Run the pipeline over the full text, then collect the sentence spans into a
# list so they can be indexed and counted.
doc = nlp(text)
sentences = [sent for sent in doc.sents]

In [None]:
# 2. Token details from 3rd sentence
# Index 2 is the third sentence; this raises IndexError if the text has fewer
# than three sentences.
third = sentences[2]
# Strip surrounding whitespace so trailing line breaks from the source file do
# not pad the output.
print(f"Third Sentence:\n{third.text.strip()}\n")

# Fixed-width columns keep the header and every row aligned.
row = "{:<12}{:<7}{:<6}{}"
print(row.format("TOKEN", "POS", "TAG", "DESCRIPTION"))
for token in third:
    # Whitespace tokens (e.g. a line break inside the sentence) are shown
    # escaped so they cannot split a table row across several lines.
    shown = repr(token.text) if token.is_space else token.text
    # spacy.explain returns None for labels it does not know; print nothing then.
    description = spacy.explain(token.tag_) or ""
    print(row.format(shown, token.pos_, token.tag_, description))

Third Sentence:
 They lived with their Mother in a sand-bank, underneath the root of a
very big fir-tree.

 

TOKEN  POS  TAG  DESCRIPTION
They PRON PRP pronoun, personal
lived VERB VBD verb, past tense
with ADP IN conjunction, subordinating or preposition
their PRON PRP$ pronoun, possessive
Mother PROPN NNP noun, proper singular
in ADP IN conjunction, subordinating or preposition
a DET DT determiner
sand NOUN NN noun, singular or mass
- PUNCT HYPH punctuation mark, hyphen
bank NOUN NN noun, singular or mass
, PUNCT , punctuation mark, comma
underneath ADP IN conjunction, subordinating or preposition
the DET DT determiner
root NOUN NN noun, singular or mass
of ADP IN conjunction, subordinating or preposition
a DET DT determiner

 SPACE _SP whitespace
very ADV RB adverb
big ADJ JJ adjective (English), other noun-modifier (Chinese)
fir NOUN NN noun, singular or mass
- PUNCT HYPH punctuation mark, hyphen
tree NOUN NN noun, singular or mass
. PUNCT . punctuation mark, sentence closer


 SP

In [None]:
# 3. Frequency list of POS tags
# Tally the `pos_` label of every token in the document (a generator avoids
# building a temporary list).
pos_freq = Counter(token.pos_ for token in doc)
print("\nPOS Frequency:\n", pos_freq)


POS Frequency:
 Counter({'NOUN': 173, 'PUNCT': 172, 'VERB': 131, 'ADP': 124, 'PRON': 108, 'SPACE': 99, 'DET': 90, 'PROPN': 75, 'ADV': 65, 'CCONJ': 61, 'ADJ': 54, 'AUX': 50, 'PART': 28, 'SCONJ': 20, 'NUM': 8})


In [None]:
# 4. % tokens that are nouns
# Count tokens labelled NOUN and express them as a share of all tokens in the
# document (the denominator is len(doc), i.e. every token the pipeline produced).
noun_count = len([tok for tok in doc if tok.pos_ == "NOUN"])
noun_fraction = noun_count / len(doc)
percentage = noun_fraction * 100
print("\nPercentage of nouns:", percentage)


Percentage of nouns: 13.751987281399048


In [None]:
# 5. Dependency parse for sentence 3
# Draw the dependency arcs of the third sentence inline in the notebook.
displacy.render(
    third,
    style="dep",
    jupyter=True,
)

In [None]:
# 6. First two named entities
# Print the heading and the first two entity spans on separate lines.
print("\nFirst two named entities:", doc.ents[:2], sep="\n")


First two named entities:
(The Tale of Peter Rabbit, Beatrix Potter)


In [None]:
# 7. How many sentences?
# The sentence spans were collected into a list earlier, so its length is the answer.
sentence_total = len(sentences)
print("\nTotal sentences:", sentence_total)


Total sentences: 57


In [None]:
# 8. How many sentences contain named entities?
# Each comparison yields True/False; summing the booleans counts the sentences
# that hold at least one entity.
count_ne = sum(len(sent.ents) > 0 for sent in sentences)
print("Sentences with named entities:", count_ne)

Sentences with named entities: 38


In [None]:
# 9. NER visualization for the first sentence (sentences[0])
# Highlight the named entities of the first sentence inline in the notebook.
first_sentence = sentences[0]
displacy.render(first_sentence, style="ent", jupyter=True)