In [1]:
import spacy

# Load the English NLP model
nlp = spacy.load("en_core_web_sm")

# Read the whole text file into memory. The encoding is passed explicitly so the
# decoded text does not depend on the platform's default locale encoding.
# NOTE(review): assumes owlcreek.txt is UTF-8 (or plain ASCII) — confirm.
with open('owlcreek.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Run the pipeline over the text; the resulting Doc is used by the cells below
doc = nlp(text)

In [2]:
# Token count: the length of a Doc is its number of tokens
num_tokens = len(doc)
print('Number of tokens:', num_tokens)

Number of tokens: 4835


In [3]:
# Sentence count: walk the sentence iterator once and tally the spans
num_sentences = sum(1 for _ in doc.sents)
print('Number of sentences:', num_sentences)

Number of sentences: 204


In [4]:
# Collect the sentence spans and keep the one at index 1 (the second sentence)
second_sentence = [*doc.sents][1]
print('Second sentence:', second_sentence.text)

Second sentence: The man's hands were behind
his back, the wrists bound with a cord.  


In [5]:
# For every token of the second sentence, show its text, coarse part-of-speech
# tag, dependency label and lemma on one comma-separated line
for token in second_sentence:
    print(
        f'Text: {token.text}',
        f'POS: {token.pos_}',
        f'Dep: {token.dep_}',
        f'Lemma: {token.lemma_}',
        sep=', ',
    )

Text: The, POS: DET, Dep: det, Lemma: the
Text: man, POS: NOUN, Dep: poss, Lemma: man
Text: 's, POS: PART, Dep: case, Lemma: 's
Text: hands, POS: NOUN, Dep: nsubj, Lemma: hand
Text: were, POS: AUX, Dep: ROOT, Lemma: be
Text: behind, POS: ADP, Dep: prep, Lemma: behind
Text: 
, POS: SPACE, Dep: dep, Lemma: 

Text: his, POS: PRON, Dep: poss, Lemma: his
Text: back, POS: NOUN, Dep: pobj, Lemma: back
Text: ,, POS: PUNCT, Dep: punct, Lemma: ,
Text: the, POS: DET, Dep: det, Lemma: the
Text: wrists, POS: NOUN, Dep: appos, Lemma: wrist
Text: bound, POS: VERB, Dep: acl, Lemma: bind
Text: with, POS: ADP, Dep: prep, Lemma: with
Text: a, POS: DET, Dep: det, Lemma: a
Text: cord, POS: NOUN, Dep: pobj, Lemma: cord
Text: ., POS: PUNCT, Dep: punct, Lemma: .
Text:  , POS: SPACE, Dep: dep, Lemma:  


In [6]:
from spacy.matcher import Matcher

# Initialize the matcher with the shared vocabulary
matcher = Matcher(nlp.vocab)

# Define the pattern for "swimming vigorously" (case-insensitive).
# The source text is hard-wrapped and each line break is tokenized as its own
# whitespace token, so zero or more whitespace tokens are allowed between the
# two words; without this, an occurrence split across two lines is missed.
pattern = [
    {"LOWER": "swimming"},
    {"IS_SPACE": True, "OP": "*"},
    {"LOWER": "vigorously"},
]
matcher.add("Swimming", [pattern])

# Find matches in the doc; each match is a (match_id, start, end) tuple whose
# start/end are token indices into doc
matches = matcher(doc)

In [10]:
# Number of tokens (not characters) of context to show on each side of a match
CONTEXT_TOKENS = 10

# Print surrounding text for each match found
for match_id, start, end in matches:
    matched_span = doc[start:end]

    # start/end are token indices, so the window is measured in tokens and is
    # clamped to the bounds of the doc
    start_context = max(start - CONTEXT_TOKENS, 0)
    end_context = min(end + CONTEXT_TOKENS, len(doc))

    surrounding_text = doc[start_context:end_context].text
    print(f'Match found: "{matched_span.text}"')
    print(f'Surrounding text: "{surrounding_text}"\n')