In [6]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

### morphology and POS tagger

In [7]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP dobj X.X. False False
startup startup NOUN NN advcl xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [8]:
spacy.explain("ADP")

'adposition'

In [9]:
print("Pipeline:", nlp.pipe_names)
doc = nlp("She was reading the paper.")
token = doc[0]  # 'I'
print(token.morph)  # 'Case=Nom|Number=Sing|Person=1|PronType=Prs'
print(token.morph.get("PronType"))  # ['Prs']

Pipeline: ['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']
Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs
['Prs']


In [5]:
print(doc[2].morph)  # 'Case=Nom|Person=2|PronType=Prs'
print(doc[2].pos_)  # 'PRON'; coarse grained POS tag
print(doc[2].tag)  # fine grained POS tag

Aspect=Prog|Tense=Pres|VerbForm=Part
VERB
1534113631682161808


### lemmatizer

In [6]:
lemmatizer = nlp.get_pipe("lemmatizer")
print(lemmatizer.mode)  # 'rule'

doc = nlp("I was reading the paper.")
print([token.lemma_ for token in doc])
# ['I', 'be', 'read', 'the', 'paper', '.']

rule
['I', 'be', 'read', 'the', 'paper', '.']


In [7]:
nlp = spacy.blank("sv")
# Lookup lemmatizer
nlp.add_pipe("lemmatizer", config={"mode": "lookup"}) # see: https://github.com/explosion/spacy-lookups-data

<spacy.pipeline.lemmatizer.Lemmatizer at 0x22e446a3348>

In [8]:
nlp = spacy.blank("de")
# Morphologizer (note: model is not yet trained!)
nlp.add_pipe("morphologizer")
# Rule-based lemmatizer
nlp.add_pipe("lemmatizer", config={"mode": "rule"})

<spacy.pipeline.lemmatizer.Lemmatizer at 0x22e463ecd88>

### dependency parsing

In [17]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

Autonomous cars cars nsubj shift
insurance liability liability dobj shift
manufacturers manufacturers pobj toward


In [20]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Very kind people inside CMPD medical check up. Was constantly greeted with smile and patience.")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

Very kind people people ROOT people
CMPD medical check check pobj inside
smile smile pobj with
patience patience conj smile


In [21]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])

Autonomous amod cars NOUN []
cars nsubj shift VERB [Autonomous]
shift ROOT shift VERB [cars, liability, toward]
insurance compound liability NOUN []
liability dobj shift VERB [insurance]
toward prep shift VERB [manufacturers]
manufacturers pobj toward ADP []


In [23]:
displacy.render(nlp("Autonomous cars shift insurance liability toward manufacturers"), style='dep', jupyter = True, options = {'distance': 120})

In [22]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Very kind people inside CMPD medical check up. Was constantly greeted with smile and patience.")
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])

Very advmod kind ADJ []
kind amod people NOUN [Very]
people ROOT people NOUN [kind, inside, .]
inside prep people NOUN [check]
CMPD nmod check NOUN []
medical amod check NOUN []
check pobj inside ADP [CMPD, medical, up]
up prt check NOUN []
. punct people NOUN []
Was auxpass greeted VERB []
constantly advmod greeted VERB []
greeted ROOT greeted VERB [Was, constantly, with, .]
with prep greeted VERB [smile]
smile pobj with ADP [and, patience]
and cc smile NOUN []
patience conj smile NOUN []
. punct greeted VERB []


In [24]:
displacy.render(nlp("Very kind people inside CMPD medical check up. Was constantly greeted with smile and patience."), style='dep', jupyter = True, options = {'distance': 120})

In [29]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("4 weeks bmt, 5 weeks vocational training. Just relax. BMT will probably be the best part of your NS journey as a Pes C. After you're in your unit you'll miss kranji camp and the 4 hours per day you spend ironing bed and talking cock")
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])

4 nummod weeks NOUN []
weeks npadvmod bmt NOUN [4]
bmt dep training NOUN [weeks]
, punct training NOUN []
5 nummod weeks NOUN []
weeks npadvmod vocational ADJ [5]
vocational amod training NOUN [weeks]
training ROOT training NOUN [bmt, ,, vocational, .]
. punct training NOUN []
Just advmod relax VERB []
relax ROOT relax VERB [Just, .]
. punct relax VERB []
BMT nsubj be VERB []
will aux be VERB []
probably advmod be VERB []
be ccomp miss VERB [BMT, will, probably, part, 're]
the det part NOUN []
best amod part NOUN []
part attr be VERB [the, best, of]
of prep part NOUN [journey]
your poss journey NOUN []
NS compound journey NOUN []
journey pobj of ADP [your, NS, as]
as prep journey NOUN [C.]
a det C. PROPN []
Pes compound C. PROPN []
C. pobj as ADP [a, Pes]
After mark 're VERB []
you nsubj 're VERB []
're advcl be VERB [After, you, in]
in prep 're VERB [unit]
your poss unit NOUN []
unit pobj in ADP [your]
you nsubj miss VERB []
'll aux miss VERB []
miss ROOT miss VERB [be, you, 'll, camp

In [30]:
displacy.render(nlp("4 weeks bmt, 5 weeks vocational training. Just relax. BMT will probably be the best part of your NS journey as a Pes C. After you're in your unit you'll miss kranji camp and the 4 hours per day you spend ironing bed and talking cock."), style='dep', jupyter = True, options = {'distance': 120})

In [28]:
spacy.explain("nmod")

'modifier of nominal'

In [15]:
from spacy.symbols import nsubj, VERB
nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")

# Finding a verb with a subject from below — good
verbs = set()
for possible_subject in doc:
    if possible_subject.dep == nsubj and possible_subject.head.pos == VERB:
        verbs.add(possible_subject.head)
print(verbs)

{shift}


In [19]:
# Finding a verb with a subject from above — less good
verbs = []
for possible_verb in doc:
    if possible_verb.pos == VERB:
        for possible_subject in possible_verb.children:
            if possible_subject.dep == nsubj:
                verbs.append(possible_verb)
                break

In [31]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Credit and mortgage account holders must submit their requests")

root = [token for token in doc if token.head == token][0]
subject = list(root.lefts)[0]
for descendant in subject.subtree:
    assert subject is descendant or subject.is_ancestor(descendant)
    print(descendant.text, descendant.dep_, descendant.n_lefts,
            descendant.n_rights,
            [ancestor.text for ancestor in descendant.ancestors])

Credit nmod 0 2 ['holders', 'submit']
and cc 0 0 ['Credit', 'holders', 'submit']
mortgage compound 0 0 ['account', 'Credit', 'holders', 'submit']
account conj 1 0 ['Credit', 'holders', 'submit']
holders nsubj 1 0 ['submit']


### using dependency parse in information extraction, esp. when combined with other predictions like named entities (LOOKS INCREDIBLY USEFUL)

The following example extracts money and currency values, i.e. entities labeled as MONEY, and then uses the dependency parse to find the noun phrase they are referring to – for example "Net income"→ "$9.4 million".

In [32]:
nlp = spacy.load("en_core_web_sm")
# Merge noun phrases and entities for easier analysis
nlp.add_pipe("merge_entities")
nlp.add_pipe("merge_noun_chunks")

TEXTS = [
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
]

for doc in nlp.pipe(TEXTS):
    for token in doc:
        if token.ent_type_ == "MONEY":
            # We have an attribute and direct object, so check for subject
            if token.dep_ in ("attr", "dobj"):
                subj = [w for w in token.head.lefts if w.dep_ == "nsubj"]
                if subj:
                    print(subj[0], "-->", token)
            # We have a prepositional object with a preposition
            elif token.dep_ == "pobj" and token.head.dep_ == "prep":
                print(token.head.head, "-->", token)

Net income --> $9.4 million
the prior year --> $2.7 million
Revenue --> twelve billion dollars
a loss --> 1b


In [None]:
#nlp = spacy.load("en_core_web_sm", disable=["parser"]) #disable if you don't need parser, ie don't need syntatic info

### named entity recognition

In [34]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [35]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("San Francisco considers banning sidewalk delivery robots")

# document level
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print(ents)

# token level
ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_]
ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_]
print(ent_san)  # ['San', 'B', 'GPE']
print(ent_francisco)  # ['Francisco', 'I', 'GPE']

[('San Francisco', 0, 13, 'GPE')]
['San', 'B', 'GPE']
['Francisco', 'I', 'GPE']


you can set custom entity annotations (at the document level) WE WANT THIS YES WE DO

In [42]:
import spacy
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")
doc = nlp("fb is hiring a new vice president of global policy")
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print('Before', ents)
# The model didn't recognize "fb" as an entity :(

# Create a span for the new entity
fb_ent = Span(doc, 0, 1, label="ORG")
#fb_ent = doc.char_span(0, 2, label="ORG") #alternative to the above line

# Option 1: Modify the provided entity spans, leaving the rest unmodified
#doc.set_ents([fb_ent], default="unmodified")

# Option 2: Assign a complete list of ents to doc.ents
doc.ents = list(doc.ents) + [fb_ent]

ents = [(e.text, e.start, e.end, e.label_) for e in doc.ents]
print('After', ents)
# [('fb', 0, 1, 'ORG')] 🎉

Before []
After [('fb', 0, 1, 'ORG')]


In [46]:
import numpy
import spacy
from spacy.attrs import ENT_IOB, ENT_TYPE

nlp = spacy.load("en_core_web_sm")
doc = nlp.make_doc("London is a big city in the United Kingdom.")
print("Before", doc.ents)  # []

header = [ENT_IOB, ENT_TYPE]
attr_array = numpy.zeros((len(doc), len(header)), dtype="uint64")
print(attr_array)

Before ()
[[0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]]


In [47]:
attr_array[0, 0] = 3  # B
attr_array[0, 1] = doc.vocab.strings["GPE"]
print(attr_array)

[[  3 384]
 [  0   0]
 [  0   0]
 [  0   0]
 [  0   0]
 [  0   0]
 [  0   0]
 [  0   0]
 [  0   0]
 [  0   0]]


In [48]:
doc.from_array(header, attr_array)
print("After", doc.ents)  # [London]

After (London,)


spaCy provides functionality to perform entity linking, which resolves a textual entity to a unique identifier from a knowledge base (KB). You can create your own KnowledgeBase and train a new EntityLinker using that custom knowledge base.

In [None]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Ada Lovelace was born in London")

# Document level
ents = [(e.text, e.label_, e.kb_id_) for e in doc.ents]
print(ents)  # [('Ada Lovelace', 'PERSON', 'Q7259'), ('London', 'GPE', 'Q84')]

# Token level
ent_ada_0 = [doc[0].text, doc[0].ent_type_, doc[0].ent_kb_id_]
ent_ada_1 = [doc[1].text, doc[1].ent_type_, doc[1].ent_kb_id_]
ent_london_5 = [doc[5].text, doc[5].ent_type_, doc[5].ent_kb_id_]
print(ent_ada_0)  # ['Ada', 'PERSON', 'Q7259']
print(ent_ada_1)  # ['Lovelace', 'PERSON', 'Q7259']
print(ent_london_5)  # ['London', 'GPE', 'Q84']

### Tokenisation

In [6]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text)

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


### Rule-based matching

compared to training a statistical entity recognition model

#### TOKEN MATCHING

In [9]:
# NOTE: can rely on this: https://explosion.ai/demos/matcher

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab) # initialise matcher with a vocab; matcher must alwyas share same vocab with the documents it will operate on

# Add match ID "HelloWorld" with no callback and one pattern
pattern = [{"LOWER": "Hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", [pattern]) #This is all up to you and can be defined individually for each pattern, by passing in a callback function as the on_match argument on add().

doc = nlp("Hello, world! Hello world!")
matches = matcher(doc) # 'matches' is a list of (match_id, start, end) tuples; in this case, there's only 1 item in the list

for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text)

15578876784678163569 HelloWorld 0 3 Hello, world


In [None]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab) # initialise matcher with a vocab; matcher must alwyas share same vocab with the documents it will operate on

# Add match ID "HelloWorld" with no callback and one pattern
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", [pattern]) #This is all up to you and can be defined individually for each pattern, by passing in a callback function as the on_match argument on add().

doc = nlp("Hello, world! Hello world!")
matches = matcher(doc) # 'matches' is a list of (match_id, start, end) tuples; in this case, there's only 1 item in the list

for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text)

on match ### THIS IS WHAT WE WANT

In [13]:
from spacy import displacy
doc = nlp("This is a text about Google I/O")
html = displacy.render(doc, style="ent", page=True,
                       options={"ents": ["EVENT"]})



In [42]:
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.tokens import Span

nlp = English()
matcher = Matcher(nlp.vocab)

def add_event_ent(matcher, doc, i, matches): #callback function
    # Get the current match and create tuple of entity label, start and end.
    # Append entity to the doc's entity. (Don't overwrite doc.ents!)
    match_id, start, end = matches[i]
    entity = Span(doc, start, end, label="EVENT")
    doc.ents += (entity,)
    print(entity.text)

pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
matcher.add("GoogleIO", [pattern], on_match=add_event_ent)
doc = nlp("This is a text about Google I/O")
matches = matcher(doc)

Google I/O


In [43]:
from spacy import displacy
html = displacy.render(doc, style="ent", page=True,
                       options={"ents": ["EVENT"]})

In [17]:
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.tokens import Span

nlp = English()
matcher = Matcher(nlp.vocab)

def add_event_ent(matcher, doc, i, matches): #callback function
    # Get the current match and create tuple of entity label, start and end.
    # Append entity to the doc's entity. (Don't overwrite doc.ents!)
    match_id, start, end = matches[i]
    entity = Span(doc, start, end, label="TRAINING")
    doc.ents += (entity,)
    print(entity.text)

pattern1 = [{"ORTH": "vocational"}, {"ORTH": "training"}]
pattern2 = [{"LOWER": "bmt"}]
matcher.add("training", [pattern1, pattern2], on_match=add_event_ent)
doc = nlp("4 weeks bmt, 5 weeks vocational training. Just relax. BMT will probably be the best part of your NS journey as a Pes C. After you're in your unit you'll miss kranji camp and the 4 hours per day you spend ironing bed and talking cock.")
matches = matcher(doc)

bmt
vocational training
BMT


In [18]:
from spacy import displacy
html = displacy.render(doc, style="ent", page=True,
                       options={"ents": ["TRAINING"]})

creating spans from matches

In [11]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("PERSON", [[{"lower": "barack"}, {"lower": "obama"}]])
doc = nlp("Barack Obama was the 44th president of the United States")

# 1. Return (match_id, start, end) tuples
matches = matcher(doc)
for match_id, start, end in matches:
    # Create the matched span and assign the match_id as a label
    span = Span(doc, start, end, label=match_id)
    print(span.text, span.label_)

# 2. Return Span objects directly (available in spaCy v3)
matches = matcher(doc, as_spans=True)
for span in matches:
    print(span.text, span.label_)

Barack Obama PERSON
Barack Obama PERSON


In [None]:
from spacy import displacy
html = displacy.render(doc, style="ent", page=True,
                       options={"ents": ["TRAINING"]})

In [13]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("TRAINING", [[{"LOWER": "vocational"}, {"LOWER": "training"}]])
doc = nlp("4 weeks bmt, 5 weeks vocational training. Just relax. BMT will probably be the best part of your NS journey as a Pes C. After you're in your unit you'll miss kranji camp and the 4 hours per day you spend ironing bed and talking cock.")

# 1. Return (match_id, start, end) tuples
matches = matcher(doc)
for match_id, start, end in matches:
    # Create the matched span and assign the match_id as a label
    span = Span(doc, start, end, label=match_id)
    print(span.text, span.label_)

# 2. Return Span objects directly (available in spaCy v3)
matches = matcher(doc, as_spans=True)
for span in matches:
    print(span.text, span.label_) # created spans, not entities

vocational training TRAINING
vocational training TRAINING


In [14]:
from spacy import displacy
html = displacy.render(doc, style="ent", page=True,
                       options={"ents": ["TRAINING"]})



Interesting use case: Let’s say you’re analyzing user comments and you want to find out what people are saying about Facebook. You want to start off by finding adjectives following “Facebook is” or “Facebook was”. This is obviously a very rudimentary solution, but it’ll be fast, and a great way to get an idea for what’s in your data.

In [27]:
import spacy
from spacy import displacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
matched_sents = []  # Collect data of matched sentences to be visualized

def collect_sents(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    span = doc[start:end]  # Matched span
    sent = span.sent  # Sentence containing matched span
    # Append mock entity for match in displaCy style to matched_sents
    # get the match span by ofsetting the start and end of the span with the
    # start and end of the sentence in the doc
    match_ents = [{
        "start": span.start_char - sent.start_char,
        "end": span.end_char - sent.start_char,
        "label": "MATCH",
    }]
    matched_sents.append({"text": sent.text, "ents": match_ents})

pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"},
           {"POS": "ADJ"}]
matcher.add("FacebookIs", [pattern], on_match=collect_sents)  # add pattern
doc = nlp("I'd say that Facebook is evil. – Facebook is pretty cool, right?")
matches = matcher(doc)

# Serve visualization of sentences containing match with displaCy
# set manual=True to make displaCy render straight from a dictionary
# (if you're not running the code within a Jupyer environment, you can
# use displacy.serve instead)
displacy.render(matched_sents, style="ent", manual=True)

Another interesting use case: 

Let’s say you’ve extracted a large sample of social media posts on a specific topic, for example posts mentioning a brand name or product. As the first step of your data exploration, you want to filter out posts containing certain emoji and use them to assign a general sentiment score, based on whether the expressed emotion is positive or negative, e.g. 😀 or 😞. 

You also want to find, merge and label hashtags like #MondayMotivation, to be able to ignore or analyze them later. By default, spaCy’s tokenizer will split emoji into separate tokens. This means that you can create a pattern for one or more emoji tokens. Valid hashtags usually consist of a #, plus a sequence of ASCII characters with no whitespace, making them easy to match as well.

SOMETHING INTERESTING: you’ll also want to take specific words into account and check the subtree for intensifiers like “very”, to increase the sentiment score. At some point, you might also want to train a sentiment model.

In [28]:
from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()  # We only want the tokenizer, so no need to load a pipeline
matcher = Matcher(nlp.vocab)

pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"]  # Positive emoji
neg_emoji = ["😞", "😠", "😩", "😢", "😭", "😒"]  # Negative emoji

# Add patterns to match one or more emoji tokens
pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji]
neg_patterns = [[{"ORTH": emoji}] for emoji in neg_emoji]

# Function to label the sentiment
def label_sentiment(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    if doc.vocab.strings[match_id] == "HAPPY":  # Don't forget to get string!
        doc.sentiment += 0.1  # Add 0.1 for positive sentiment
    elif doc.vocab.strings[match_id] == "SAD":
        doc.sentiment -= 0.1  # Subtract 0.1 for negative sentiment

matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)  # Add positive pattern
matcher.add("SAD", neg_patterns, on_match=label_sentiment)  # Add negative pattern

# Add pattern for valid hashtag, i.e. '#' plus any ASCII token
matcher.add("HASHTAG", [[{"ORTH": "#"}, {"IS_ASCII": True}]])

doc = nlp("Hello world 😀 #MondayMotivation")
matches = matcher(doc)
for match_id, start, end in matches:
    string_id = doc.vocab.strings[match_id]  # Look up string ID
    span = doc[start:end]
    print(string_id, span.text)

HAPPY 😀
HASHTAG #MondayMotivation


In [31]:
from emojipedia import Emojipedia  # Installation: pip install emojipedia
from spacy.tokens import Span  # Get the global Span object

Span.set_extension("emoji_desc", default=None)  # Register the custom attribute

def label_sentiment(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    if doc.vocab.strings[match_id] == "HAPPY":  # Don't forget to get string!
        doc.sentiment += 0.1  # Add 0.1 for positive sentiment
    elif doc.vocab.strings[match_id] == "SAD":
        doc.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
    span = doc[start:end]
    emoji = Emojipedia.search(span[0].text)  # Get data for emoji
    span._.emoji_desc = emoji.title  # Assign emoji description

In [32]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Token

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Add pattern for valid hashtag, i.e. '#' plus any ASCII token
matcher.add("HASHTAG", [[{"ORTH": "#"}, {"IS_ASCII": True}]])

# Register token extension
Token.set_extension("is_hashtag", default=False)

doc = nlp("Hello world 😀 #MondayMotivation")
matches = matcher(doc)
hashtags = []
for match_id, start, end in matches:
    if doc.vocab.strings[match_id] == "HASHTAG":
        hashtags.append(doc[start:end])
with doc.retokenize() as retokenizer:
    for span in hashtags:
        retokenizer.merge(span)
        for token in span:
            token._.is_hashtag = True

for token in doc:
    print(token.text, token._.is_hashtag)

Hello False
world False
😀 False
#MondayMotivation True


#### PHRASE MATCHING

In [34]:
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]

# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", patterns)

doc = nlp("German Chancellor angela Merkel and US President Barack Obama "
          "converse in the Oval Office inside the White House in Washington, D.C.")

matches = matcher(doc)

for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

angela Merkel
Barack Obama
Washington, D.C.


In [35]:
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
terms = ["kranji", "bedok", "khatib"]

# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", patterns)

doc = nlp("4 weeks bmt, 5 weeks vocational training. Just relax. BMT will probably be the best part of your NS journey as a Pes C. After you're in your unit you'll miss kranji camp and the 4 hours per day you spend ironing bed and talking cock.")

matches = matcher(doc)

for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

kranji


#### RULE-BASED ENTITY RECOGNITION

In [36]:
#{"label": "ORG", "pattern": "Apple"} #Apple is an ORG
#{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]} #'San Francisco' is a GPE

from spacy.lang.en import English

nlp = English()
ruler = nlp.add_pipe("entity_ruler")
patterns = [{"label": "ORG", "pattern": "Apple"},
            {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]}]
ruler.add_patterns(patterns)

doc = nlp("Apple is opening its first big office in San Francisco.")
print([(ent.text, ent.label_) for ent in doc.ents])

[('Apple', 'ORG'), ('San Francisco', 'GPE')]


In [37]:
import spacy

nlp = spacy.load("en_core_web_sm")
ruler = nlp.add_pipe("entity_ruler")
patterns = [{"label": "ORG", "pattern": "MyCorp Inc."}]
ruler.add_patterns(patterns)

doc = nlp("MyCorp Inc. is a company in the U.S.")
print([(ent.text, ent.label_) for ent in doc.ents])

[('MyCorp Inc.', 'ORG'), ('U.S.', 'GPE')]


In [38]:
from spacy.lang.en import English

nlp = English()
ruler = nlp.add_pipe("entity_ruler")
patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
            {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
            {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"}]
ruler.add_patterns(patterns)

doc1 = nlp("Apple is opening its first big office in San Francisco.")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])

doc2 = nlp("Apple is opening its first big office in San Fran.")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents])

[('Apple', 'ORG', 'apple'), ('San Francisco', 'GPE', 'san-francisco')]
[('Apple', 'ORG', 'apple'), ('San Fran', 'GPE', 'san-francisco')]


In [None]:
from spacy.lang.en import English

nlp = English()
ruler = nlp.add_pipe("entity_ruler")
patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
            {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
            {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"}]
ruler.add_patterns(patterns)

doc1 = nlp("4 weeks bmt, 5 weeks vocational training. Just relax. BMT will probably be the best part of your NS journey as a Pes C. After you're in your unit you'll miss kranji camp and the 4 hours per day you spend ironing bed and talking cock.")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])

doc2 = nlp("Apple is opening its first big office in San Fran.")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents])