In [1]:
import spacy
# Load the 'en_core_web_sm' pipeline once; later cells call nlp(...) on raw
# text to build the documents they inspect.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Parse the example sentence; double quotes avoid escaping the apostrophe.
doc = nlp("The quick brown fox jumped over the lazy dog's back")

In [6]:
# Get the content of the document: .text gives back the string that was parsed.
doc.text

"The quick brown fox jumped over the lazy dog's back"

In [7]:
# Index into the document to grab a single token (position 4), then show its
# text, coarse part of speech and fine-grained tag separated by spaces.
print(doc[4].text, doc[4].pos_, doc[4].tag_)

jumped VERB VBD


In [13]:
# Print one row per token in the doc: text, coarse part of speech,
# fine-grained tag, and spaCy's human-readable description of that tag.
for token in doc:
    # spacy.explain() returns None for labels that are not in its glossary,
    # and formatting None with a width spec raises TypeError, so fall back
    # to an empty string to keep the table printing.
    tag_gloss = spacy.explain(token.tag_) or ""
    print(f"{token.text:10} {token.pos_:10} {token.tag_:10} {tag_gloss:10}")

The        DET        DT         determiner
quick      ADJ        JJ         adjective (English), other noun-modifier (Chinese)
brown      ADJ        JJ         adjective (English), other noun-modifier (Chinese)
fox        NOUN       NN         noun, singular or mass
jumped     VERB       VBD        verb, past tense
over       ADP        IN         conjunction, subordinating or preposition
the        DET        DT         determiner
lazy       ADJ        JJ         adjective (English), other noun-modifier (Chinese)
dog        NOUN       NN         noun, singular or mass
's         PART       POS        possessive ending
back       NOUN       NN         noun, singular or mass


In [17]:
# The same surface form can be tagged differently depending on context:
# compare the tag assigned to "read" (index 1) in two similar sentences.

doc = nlp("I read books on NLP.")
doc_2 = nlp("I read a book on NLP.")

token, token_2 = doc[1], doc_2[1]

# One row per sentence: text, coarse POS, fine-grained tag, tag description.
for tok in (token, token_2):
    print(f"{tok.text:10} {tok.pos_:10} {tok.tag_:10} {spacy.explain(tok.tag_):10}")

read       VERB       VBP        verb, non-3rd person singular present
read       VERB       VBD        verb, past tense


In [18]:
# Tally parts of speech for the fox sentence as a dictionary.
# The keys are integer attribute IDs rather than tag names.
doc = nlp("The quick brown fox jumped over the lazy dog's back")
POS_counts = doc.count_by(spacy.attrs.POS)
print(POS_counts)

{90: 2, 84: 3, 92: 3, 100: 1, 85: 1, 94: 1}


In [25]:
# Look up the text behind a numeric identifier via the vocab.
# 100 is one of the keys printed in POS_counts by the previous cell.
doc.vocab[100].text

'VERB'

In [26]:
# .pos (no trailing underscore) is the integer ID of the token's part of
# speech; .pos_ used in earlier cells is its string form.
doc[2].pos

84

In [31]:
# Frequency list of parts of speech, ordered by numeric ID.
# doc.vocab turns each ID back into its readable name.
for pos_id in sorted(POS_counts):
    print(f"{pos_id:5}. {doc.vocab[pos_id].text:5} {POS_counts[pos_id]:2}")

   84. ADJ    3
   85. ADP    1
   90. DET    2
   92. NOUN   3
   94. PART   1
  100. VERB   1


In [32]:
# Syntactic dependencies: the same tally as for parts of speech, but keyed
# by dependency-label ID. Print the raw dict first, then a sorted listing.

DEP_counts = doc.count_by(spacy.attrs.DEP)
print(DEP_counts)
for dep_id, count in sorted(DEP_counts.items()):
    label = doc.vocab[dep_id].text
    print(f"{dep_id}. {label:5} {count:2}")

{415: 2, 402: 3, 429: 1, 8206900633647566924: 1, 443: 1, 439: 1, 8110129090154140942: 1, 400: 1}
400. advmod  1
402. amod   3
415. det    2
429. nsubj  1
439. pobj   1
443. prep   1
8110129090154140942. case   1
8206900633647566924. ROOT   1
