In [1]:
# prepare for imports
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jordi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Imports
import spacy
from spacy import displacy
from spacy.tokens import Token

from scipy import spatial

from nltk.chunk import conlltags2tree
from nltk.corpus import wordnet as wn

In [7]:
# Define example texts
HELLO_WORLD_TEXT = "Hello world!"
APPLE_ORANGES_TEXT = "These are apples. These are oranges."
TEST_TEXT = "This is an example text."

In [8]:
# Load model
nlp = spacy.load('en')

In [9]:
# Load text in model
# Note that "TEST_TEXT" can be exchanged for any of the above example texts
doc = nlp(TEST_TEXT)

In [10]:
# defined method for debugging purposes, prints tokenized definition of a single NLP token
def print_tokenized_definition(token):
    print(f"Token text: '{token.text}', index: '{token.idx}', lemma: '{token.lemma_}', is punctuation: '{token.is_punct}', is space: '{token.is_space}', shape: '{token.shape_}', position: '{token.pos_}', tag: '{token.tag}'")

# Print definition of all words in text
for token in doc:
    print_tokenized_definition(token)

Token text: 'This', index: '0', lemma: 'this', is punctuation: 'False', is space: 'False', shape: 'Xxxx', position: 'DET', tag: '15267657372422890137'
Token text: 'is', index: '5', lemma: 'be', is punctuation: 'False', is space: 'False', shape: 'xx', position: 'AUX', tag: '13927759927860985106'
Token text: 'an', index: '8', lemma: 'an', is punctuation: 'False', is space: 'False', shape: 'xx', position: 'DET', tag: '15267657372422890137'
Token text: 'example', index: '11', lemma: 'example', is punctuation: 'False', is space: 'False', shape: 'xxxx', position: 'NOUN', tag: '15308085513773655218'
Token text: 'text', index: '19', lemma: 'text', is punctuation: 'False', is space: 'False', shape: 'xxxx', position: 'NOUN', tag: '15308085513773655218'
Token text: '.', index: '23', lemma: '.', is punctuation: 'True', is space: 'False', shape: '.', position: 'PUNCT', tag: '12646065887601541794'


In [6]:
# Print sentence detection
for sent in doc.sents:
    print(sent)

This piece of text is an example which was written while writing this Python program.
I'm a student who is attempting to implement Natural Language Processing while following along with several tutorials.


In [7]:
# Print entities
for ent in doc.ents:
    print(f"ent text: '{ent.text}', ent label: '{ent.label_}'")

ent text: 'Python', ent label: 'ORG'


In [8]:
# Print inside outside beginning
iob_tagged = [
    (
        token.text,
        token.tag_,
        "{0}-{1}".format(token.ent_iob_, token.ent_type_)
        if token.ent_iob_ != 'O' else token.ent_iob_
    ) for token in doc
]

print(iob_tagged)
print(conlltags2tree(iob_tagged))

[('This', 'DT', 'O'), ('piece', 'NN', 'O'), ('of', 'IN', 'O'), ('text', 'NN', 'O'), ('is', 'VBZ', 'O'), ('an', 'DT', 'O'), ('example', 'NN', 'O'), ('which', 'WDT', 'O'), ('was', 'VBD', 'O'), ('written', 'VBN', 'O'), ('while', 'IN', 'O'), ('writing', 'VBG', 'O'), ('this', 'DT', 'O'), ('Python', 'NNP', 'B-ORG'), ('program', 'NN', 'O'), ('.', '.', 'O'), ('I', 'PRP', 'O'), ("'m", 'VBP', 'O'), ('a', 'DT', 'O'), ('student', 'NN', 'O'), ('who', 'WP', 'O'), ('is', 'VBZ', 'O'), ('attempting', 'VBG', 'O'), ('to', 'TO', 'O'), ('implement', 'VB', 'O'), ('Natural', 'NNP', 'O'), ('Language', 'NNP', 'O'), ('Processing', 'NNP', 'O'), ('while', 'IN', 'O'), ('following', 'VBG', 'O'), ('along', 'RB', 'O'), ('with', 'IN', 'O'), ('several', 'JJ', 'O'), ('tutorials', 'NNS', 'O'), ('.', '.', 'O')]
(S
  This/DT
  piece/NN
  of/IN
  text/NN
  is/VBZ
  an/DT
  example/NN
  which/WDT
  was/VBD
  written/VBN
  while/IN
  writing/VBG
  this/DT
  (ORG Python/NNP)
  program/NN
  ./.
  I/PRP
  'm/VBP
  a/DT
  student

In [9]:
# Create an object render which can be shown in a Jupyter Notebook
displacy.render(doc, style='ent', jupyter=True)

In [10]:
# Print noun-phrases
for chunk in doc.noun_chunks:
    print(f"Chunk text: '{chunk.text}', label: '{chunk.label_}', root text: '{chunk.root.text}'")

Chunk text: 'This piece', label: 'NP', root text: 'piece'
Chunk text: 'text', label: 'NP', root text: 'text'
Chunk text: 'an example', label: 'NP', root text: 'example'
Chunk text: 'which', label: 'NP', root text: 'which'
Chunk text: 'this Python program', label: 'NP', root text: 'program'
Chunk text: 'I', label: 'NP', root text: 'I'
Chunk text: 'a student', label: 'NP', root text: 'student'
Chunk text: 'who', label: 'NP', root text: 'who'
Chunk text: 'Natural Language Processing', label: 'NP', root text: 'Processing'
Chunk text: 'several tutorials', label: 'NP', root text: 'tutorials'


In [11]:
# Print word dependency
for token in doc:
    print(f"{token.text}/{token.tag_} <--{token.dep_}-- {token.head.text}/{token.head.tag_}")

This/DT <--det-- piece/NN
piece/NN <--nsubj-- is/VBZ
of/IN <--prep-- piece/NN
text/NN <--pobj-- of/IN
is/VBZ <--ROOT-- is/VBZ
an/DT <--det-- example/NN
example/NN <--attr-- is/VBZ
which/WDT <--nsubjpass-- written/VBN
was/VBD <--auxpass-- written/VBN
written/VBN <--relcl-- example/NN
while/IN <--mark-- writing/VBG
writing/VBG <--advcl-- written/VBN
this/DT <--det-- program/NN
Python/NNP <--compound-- program/NN
program/NN <--dobj-- writing/VBG
./. <--punct-- is/VBZ
I/PRP <--nsubj-- 'm/VBP
'm/VBP <--ROOT-- 'm/VBP
a/DT <--det-- student/NN
student/NN <--attr-- 'm/VBP
who/WP <--nsubj-- attempting/VBG
is/VBZ <--aux-- attempting/VBG
attempting/VBG <--relcl-- student/NN
to/TO <--aux-- implement/VB
implement/VB <--xcomp-- attempting/VBG
Natural/NNP <--compound-- Language/NNP
Language/NNP <--compound-- Processing/NNP
Processing/NNP <--dobj-- implement/VB
while/IN <--mark-- following/VBG
following/VBG <--advcl-- implement/VB
along/RB <--prep-- following/VBG
with/IN <--prep-- along/RB
several/JJ <

In [12]:
# Render dependancy in a better understandable overview
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

In [13]:
# Load larger model which contains a word vector
nlp = spacy.load('en_core_web_lg')

In [18]:
# The famous example of 'man' - 'woman' + 'queen' = 'King'

cosine_similarity = lambda x, y: 1 - spatial.distance.cosine(x, y)
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector
queen = nlp.vocab['queen'].vector
king = nlp.vocab['king'].vector

# find closest vector in the vocabualry in the result of the equation
maybe_king = man - woman + queen
computed_simularities = []

for word in nlp.vocab:
    if not word.has_vector:
        continue
        
    similarity = cosine_similarity(maybe_king, word.vector)
    computed_simularities.append((word, similarity))

computed_simularities = sorted(computed_simularities, key=lambda item: -item[1])
print([word[0].text for word in computed_simularities[:10]])

['Queen', 'QUEEN', 'queen', 'King', 'KING', 'king', 'KIng', 'Kings', 'KINGS', 'kings']


In [21]:
# computing similarity

# computation by word

banana = nlp.vocab['banana']
dog = nlp.vocab['dog']
fruit = nlp.vocab['fruit']
animal = nlp.vocab['animal']

print(f"Dog is more similar to animal than fruit: animal:{dog.similarity(animal)} > fruit:{dog.similarity(fruit)}")
print(f"Banana is more similar to fruit than animal: fruit:{banana.similarity(fruit)} > animal:{banana.similarity(animal)}")

Dog is more similar to animal than fruit: animal:0.6618534326553345 > fruit:0.23552851378917694
Banana is more similar to fruit than animal: fruit:0.6714836359024048 > animal:0.24272854626178741


In [28]:
# computation by line

target_text = "Cats are beautiful animals."
text1 = "Dogs are awesome."
text2 = "Some gorgeous creatures are felines."
text3 = "Dolphins are swimming mammals."

target = nlp(target_text)

doc1 = nlp(text1)
doc2 = nlp(text2)
doc3 = nlp(text3)

print(target_text)
print(f"{text1} {target.similarity(doc1)}")
print(f"{text2} {target.similarity(doc2)}")
print(f"{text3} {target.similarity(doc3)}")

# from this we can see that the texts have a correlation but aren't necessary relatable

Cats are beautiful animals.
Dogs are awesome. 0.8901766262114666
Some gorgeous creatures are felines. 0.9115828449161616
Dolphins are swimming mammals. 0.7822956256736615


In [29]:
print(nlp.pipeline)

[('tagger', <spacy.pipeline.pipes.Tagger object at 0x0000019CA2754948>), ('parser', <spacy.pipeline.pipes.DependencyParser object at 0x0000019CA272FEE8>), ('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x0000019CA27594C8>)]


# Custom Pipeline

In [54]:
def penn_to_wn(tag):
    if tag.startswith('N'):
        return 'n'
 
    if tag.startswith('V'):
        return 'v'
 
    if tag.startswith('J'):
        return 'a'
 
    if tag.startswith('R'):
        return 'r'
 
    return None

In [46]:
# define pipeline class
class WordnetPipeline(object):
    def __init__(self, nlp):
        Token.set_extension('synset', default=None, force=True)
 
    def __call__(self, doc):
        for token in doc:
            wn_tag = penn_to_wn(token.tag_)
            if wn_tag is None:
                continue
 
            ss = wn.synsets(token.text, wn_tag)[0]
            token._.set('synset', ss)
 
        return doc

In [52]:
# add pipeline
nlp = spacy.load('en')
wn_pipeline = WordnetPipeline(nlp)
nlp.add_pipe(wn_pipeline, name='wordn_synsets')

In [55]:
# hello world text
doc = nlp(HELLO_WORLD_TEXT)
for token in doc:
    print(token.text, "-", token._.synset)

Hello - None
world - Synset('universe.n.01')
! - None


In [56]:
doc = nlp(APPLE_ORANGES_TEXT)
for token in doc:
    print(token.text, "-", token._.synset)

These - None
are - Synset('be.v.01')
apples - Synset('apple.n.01')
. - None
These - None
are - Synset('be.v.01')
oranges - Synset('orange.n.01')
. - None


In [57]:
# Test text (is too large)
doc = nlp(TEST_TEXT)
for token in doc:
    print(token.text, "-", token._.synset)

IndexError: list index out of range

In [58]:
# Tutorial example text
doc = nlp("Paris is the awesome capital of France.")
for token in doc:
    print(token.text, "-", token._.synset)

Paris - Synset('paris.n.01')
is - Synset('be.v.01')
the - None
awesome - Synset('amazing.s.02')
capital - Synset('capital.n.01')
of - None
France - Synset('france.n.01')
. - None


# Conclusion

Spacy is an Natural Language Processing framework which has lots of usages for NLP. It has methods which are useful for determining several statistics and can be expanded with new functionalities as was demonstrated with the custom pipeline.