In [1]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m87.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
import spacy
from spacy.matcher import Matcher

In [3]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

In [4]:
# 1. Create Doc object from owlcreek.txt
with open("owlcreek.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [5]:
doc = nlp(text)

In [6]:
# 2. Number of tokens
print("Total tokens:", len(doc))

Total tokens: 4835


In [7]:
# 3. Number of sentences
sents = list(doc.sents)
print("Total sentences:", len(sents))

Total sentences: 204


In [8]:
# 4. Print second sentence
second_sentence = sents[1]
print("\nSecond sentence:\n", second_sentence)


Second sentence:
 The man's hands were behind
his back, the wrists bound with a cord.  


In [9]:
# 5. Token details from second sentence
print("\nToken  POS  DEP  LEMMA")
for token in second_sentence:
    print(token.text, token.pos_, token.dep_, token.lemma_)


Token  POS  DEP  LEMMA
The DET det the
man NOUN poss man
's PART case 's
hands NOUN nsubj hand
were AUX ROOT be
behind ADP prep behind

 SPACE dep 

his PRON poss his
back NOUN pobj back
, PUNCT punct ,
the DET det the
wrists NOUN appos wrist
bound VERB acl bind
with ADP prep with
a DET det a
cord NOUN pobj cord
. PUNCT punct .
  SPACE dep  


In [10]:
# 6. Matcher: find "swimming vigorously"
matcher = Matcher(nlp.vocab)
pattern = [
    {"LOWER": "swimming"},
    {"LOWER": "vigorously"}
]
matcher.add("Swimming", [pattern])

matches = matcher(doc)

In [11]:
# 7. Print text around each match
print("\nMATCHES FOUND:")
for match_id, start, end in matches:
    span = doc[start-3 : end+3]   # 3 tokens before & after
    print(span.text)


MATCHES FOUND:
