1. Create a Doc object from the file peterrabbit.txt


In [None]:
import spacy

# Load the small English pipeline once; the cells below read POS tags,
# sentence boundaries and named entities from the Doc it produces.
nlp = spacy.load("en_core_web_sm")

# Read the whole story into a single string (decoded as UTF-8).
story_path = "/content/peterrabbit.txt"
with open(story_path, "r", encoding="utf-8") as file:
    text = file.read()

# Run the pipeline over the full text.
doc = nlp(text)

# Quick sanity check: show the span covering the first 10 tokens.
print(doc[:10])


The Tale of Peter Rabbit, by Beatrix Potter (


2. For every token in the third sentence, print the token text, the POS tag, the fine-grained TAG tag, and the description of the fine-grained tag.


In [None]:
# Collect the sentence spans so they can be indexed by position.
sentences = [sent for sent in doc.sents]

# Positions are zero-based, so the third sentence sits at index 2.
third_sentence = sentences[2]

# Header row and a 60-character divider, one per line.
print("Token Text\tPOS\t\tTAG\t\tDescription", "-" * 60, sep="\n")

# One row per token: text, coarse POS, fine-grained tag, and spaCy's
# plain-English explanation of that fine-grained tag.
for token in third_sentence:
    print(f"{token.text:<12}\t{token.pos_:<10}\t{token.tag_:<8}\t{spacy.explain(token.tag_)}")

Token Text	POS		TAG		Description
------------------------------------------------------------
They        	PRON      	PRP     	pronoun, personal
lived       	VERB      	VBD     	verb, past tense
with        	ADP       	IN      	conjunction, subordinating or preposition
their       	PRON      	PRP$    	pronoun, possessive
Mother      	PROPN     	NNP     	noun, proper singular
in          	ADP       	IN      	conjunction, subordinating or preposition
a           	DET       	DT      	determiner
sand        	NOUN      	NN      	noun, singular or mass
-           	PUNCT     	HYPH    	punctuation mark, hyphen
bank        	NOUN      	NN      	noun, singular or mass
,           	PUNCT     	,       	punctuation mark, comma
underneath  	ADP       	IN      	conjunction, subordinating or preposition
the         	DET       	DT      	determiner
root        	NOUN      	NN      	noun, singular or mass
of          	ADP       	IN      	conjunction, subordinating or preposition
a           	DET       	DT

3. Provide a frequency list of POS tags from the entire document

In [None]:
from collections import Counter

# Tally the coarse-grained POS label of every token in the document.
pos_counts = Counter()
pos_counts.update(token.pos_ for token in doc)

# Header and divider, then one row per tag from most to least frequent.
print("POS Tag\t\tCount", "-" * 25, sep="\n")
for pos, count in pos_counts.most_common():
    print(f"{pos:<10}\t{count}")



POS Tag		Count
-------------------------
NOUN      	173
PUNCT     	172
VERB      	131
ADP       	124
PRON      	108
SPACE     	99
DET       	90
PROPN     	75
ADV       	65
CCONJ     	61
ADJ       	54
AUX       	50
PART      	28
SCONJ     	20
NUM       	8


4. CHALLENGE: What percentage of tokens are nouns?

In [9]:
# Denominator: every token in the Doc (punctuation and whitespace tokens included).
total_tokens = len(doc)

# Numerator: tokens whose coarse POS label is exactly "NOUN"
# (tokens labelled "PROPN" are a separate label and are not counted here).
num_nouns = len([token for token in doc if token.pos_ == "NOUN"])

# Share of nouns expressed as a percentage.
percentage_nouns = num_nouns / total_tokens * 100

print(f"Percentage of tokens that are nouns: {percentage_nouns:.2f}%")

Percentage of tokens that are nouns: 13.75%


5. Display the Dependency Parse for the third sentence.

In [8]:
from spacy import displacy

# Render the dependency parse of the third sentence inline in the notebook.
displacy.render(third_sentence, jupyter=True, style="dep")


6. Show the first two named entities from Beatrix Potter's The Tale of Peter Rabbit

In [10]:
# Copy the document's entity spans into a list so it can be sliced.
named_entities = [*doc.ents]

# Report text and label for the leading two entities (fewer if the list is shorter).
print("First two named entities:")
leading_entities = named_entities[:2]
for entity in leading_entities:
    print(f"Text: {entity.text}, Label: {entity.label_}")

First two named entities:
Text: The Tale of Peter Rabbit, Label: WORK_OF_ART
Text: Beatrix Potter, Label: PERSON


7. How many sentences are contained in The Tale of Peter Rabbit?

In [11]:
# Count the sentence spans one by one instead of building a list first.
num_sentences = sum(1 for _ in doc.sents)
print(f"Number of sentences: {num_sentences}")

Number of sentences: 57


8. CHALLENGE: How many sentences contain named entities?

In [12]:
# For each sentence, any(sent.ents) is True when it holds at least one entity;
# summing those booleans gives the number of such sentences.
sentences_with_entities = sum(any(sent.ents) for sent in doc.sents)
print(f"Number of sentences containing named entities: {sentences_with_entities}")

Number of sentences containing named entities: 38


9. Display the named entity visualization for `list_of_sents[0]` from the previous problem

In [13]:
# Materialise the sentence spans and take the one at position 0.
list_of_sents = list(doc.sents)
first_sentence = list_of_sents[0]

# Highlight the named entities of that sentence inline in the notebook.
displacy.render(first_sentence, jupyter=True, style="ent")