# Spacy
Exploring the capabilities of Spacy. 

**Note:** *If files are not included in the PATH then overwrite.*

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
# Install: pip install spacy && python -m spacy download en
import spacy
from spacy import displacy
import spacy

# Loading Model
print("=== Loading Model")
nlp = spacy.load('en')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')


print("=== Token")
for token in doc:
    print(token.text)

print("=== POS")
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)

print("=== Entity")
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

    
print("=== Word Vector")
tokens = nlp(u'dog cat banana')
for token1 in tokens:
    for token2 in tokens:
        print(token1.similarity(token2))

print("=== Out of Vocabulary")
nlp = spacy.load('en_core_web_lg')
tokens = nlp(u'dog cat banana sasquatch')
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)


=== Loading Model
=== Token
Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion
=== POS
Apple apple PROPN NNP nsubj Xxxxx True False
is be VERB VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. u.k. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False
=== Entity
Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY
=== Word Vector
1.0
0.53907
0.28761
0.53907
1.0
0.487521
0.28761
0.487521
1.0
=== Out of Vocabulary
dog True 7.03367 False
cat True 6.68082 False
banana True 6.70001 False
sasquatch True 6.979 False


In [2]:

text = """But Google is starting from behind. The company made a late push
into hardware, and Apple’s Siri, available on iPhones, and Amazon’s Alexa
software, which runs on its Echo and Dot devices, have clear leads in
consumer adoption."""

doc = nlp(text)


In [6]:
from spacy.tokens import Doc

class WhitespaceTokenizer(object):
    def __init__(self, vocab):
        self.vocab = vocab
        
    def __call__(self, text):
        words = text.split(' ')
        # All tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)
    
# Add token
nlp = spacy.load('en')
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)


In [22]:
from spacy.matcher import PhraseMatcher
from spacy.matcher import Matcher

# Load default vocabulary
nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)

#=== Event
# Event based
EVENT = nlp.vocab.strings['EVENT']

def add_event_ent(matcher, doc, i, matches):
    # Append entity to doc's entity
    match_id, start, end = matches[i]
    doc.ents += ((EVENT, start, end),)

matcher.add('TheLastJedi', add_event_ent, [{'ORTH': 'The'}, {'ORTH': 'Last'}, {'ORTH': 'Jedi'}])

#=== Teminology
# Supply Gaming matches
terminology_franchise = ['Star Wars', 'Battlefront']
terminology_gamer = ['Pay-to-win']

# Game S
patterns_franchise = [nlp(text) for text in terminology_franchise]
patterns_gamer = [nlp(text) for text in terminology_gamer]

matcher.add('TerminologyFranchise', None, *patterns_franchise)
matcher.add('TerminologyGamer', None, *patterns_gamer)

doc = nlp(u"I think Disney should hit EA for Star Wars Battlefront II. "
          u"It's a totally Pay-to-win scheme. This is going to definitely affect the launch ",
         u" of The Last Jedi. I'm still gonna watch the last Jedi.")
matches = matcher(doc)
print(matches)

AttributeError: 'spacy.tokens.token.Token' object has no attribute 'items'

In [17]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)

# Get the ID of the 'EVENT' entity type. This is required to set an entity.
EVENT = nlp.vocab.strings['EVENT']

def add_event_ent(matcher, doc, i, matches):
    # Get the current match and create tuple of entity label, start and end.
    # Append entity to the doc's entity. (Don't overwrite doc.ents!)
    match_id, start, end = matches[i]
    doc.ents += ((EVENT, start, end),)

matcher.add('GoogleIO', add_event_ent,
            [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}],
            [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}, {'IS_DIGIT': True}])


In [None]:
from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English() # we only want the tokenizer, so no need to load a model
matcher = Matcher(nlp.vocab)

pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍'] # positive emoji
neg_emoji = [u'😞', u'😠', u'😩', u'😢', u'😭', u'😒'] # negative emoji

# add patterns to match one or more emoji tokens
pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emoji]
neg_patterns = [[{'ORTH': emoji}] for emoji in neg_emoji]

matcher.add('HAPPY', label_sentiment, *pos_patterns) # add positive pattern
matcher.add('SAD', label_sentiment, *neg_patterns) # add negative pattern

# add pattern to merge valid hashtag, i.e. '#' plus any ASCII token
matcher.add('HASHTAG', merge_hashtag, [{'ORTH': '#'}, {'IS_ASCII': True}])
