In [30]:
import spacy

In [31]:
# A custom component lets us apply changes to the Doc object that the stock spaCy pipeline cannot make for us.

# Goal: build a custom pipe that never reports an entity as GPE - it is either LOC or nothing.
# First, see what the unmodified small English model produces.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Britain is a place. Mary is a person")
for ent in doc.ents:
    print(ent)  # printing the entity itself shows its text
    print(f"{ent.text} {ent.label_}")

Britain
Britain GPE
Mary
Mary PERSON


In [32]:
# To reach that goal, first import the Language class; its component decorator registers a function as a custom pipeline component.
from spacy.language import Language
@Language.component("remove_gpe_custom")
def remove_gpe_custom(doc):
    """Drop every entity labelled GPE from ``doc.ents``.

    Registered under the name "remove_gpe_custom", so it can be added to a
    pipeline with ``nlp.add_pipe("remove_gpe_custom")``.

    Args:
        doc: The Doc currently being processed by the pipeline.

    Returns:
        The same Doc, with ``doc.ents`` reduced to its non-GPE entities.
    """
    # Build a filtered copy instead of calling list.remove() while looping
    # over the same list: removing during iteration shifts the later items
    # left, so the entity right after a removed GPE would never be checked.
    doc.ents = [ent for ent in doc.ents if ent.label_ != "GPE"]
    # Show the entities that survived the filter.
    print(doc.ents)
    return doc

# Above we defined the function remove_gpe_custom and declared it as a custom component, which lets it modify the Doc's entity annotations directly.

# The next step is to add the remove_gpe_custom pipe to the nlp object at the end of the pipeline.

#nlp.add_pipe("remove_gpe_custom") # Left commented out: a pipe with a given name can only be added to a pipeline once, so running this line a second time fails. While it stays commented out, the component is not part of the pipeline.

# Summarise the pipeline: for each component, what it assigns, requires and scores.
nlp.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  'tagger': [],
  'parser': [],
  'attribute_ruler': [],
  'lemmatizer': [],
  'ner': []},
 'att

In [33]:
# Re-analyse the same text and print the entities; this is where the effect of the custom pipe would show.
# NOTE: the add_pipe("remove_gpe_custom") call is commented out earlier in the notebook, so the pipe is not active and Britain is still reported as GPE.
doc = nlp("Britain is a place. Mary is a person")
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")
    print(ent)

# Once a pipeline carries useful modifications, it can be saved to disk for future use.
nlp.to_disk("custom_en_core_web_sm")

Britain GPE
Britain
Mary PERSON
Mary


In [34]:
# Regular expressions (RegEx) match strings against simple or complex patterns,
# which lets us express elaborate search rules in very little space.
# The re module comes pre-packaged with Python, so importing it is all that is needed.
import re

# One or two digits, a space, then one of the listed month names.
# Note: the quantifier sits outside the (\d) group, so that group keeps only the last digit it matched.
pattern = r"((\d){1,2} (January|February|March|April))"
text = "This is a date 2 February. Another date would be 14 April."
matches = re.findall(pattern, text)
print(matches)

# An alternation (|) extends the rule so that "month day" is accepted as well as "day month".
pattern = r"(((\d){1,2} (January|February|March|April))|((January|February|March|April) (\d){1,2}))"
# Same text as above: both dates are still in "day month" order.
matches = re.findall(pattern, text)
print(matches)
# Now the second date is written in "month day" order.
text = "This is a date 2 February. Another date would be April 14."
matches = re.findall(pattern, text)
print(matches)

# findall returns one tuple of groups per hit; element 0 is the outermost group, i.e. the whole date.
for match in matches:
    print(match[0])



[('2 February', '2', 'February'), ('14 April', '4', 'April')]
[('2 February', '2 February', '2', 'February', '', '', ''), ('14 April', '14 April', '4', 'April', '', '', '')]
[('2 February', '2 February', '2', 'February', '', '', ''), ('April 14', '', '', '', 'April 14', 'April', '4')]
2 February
April 14


In [35]:
# finditer yields Match objects, which carry more useful information (such as where each hit sits in the text) than findall's tuples.
text = "This is a date February 2. Another date would be 14 April."
iter_matches = re.finditer(pattern, text)
print(iter_matches)
for hit in iter_matches:
    print(hit)
    start, end = hit.span()  # character offsets of the hit within text
    print(text[start:end])

<callable_iterator object at 0x7fab6e208070>
<re.Match object; span=(15, 25), match='February 2'>
February 2
<re.Match object; span=(49, 57), match='14 April'>
14 April


In [36]:
# Next, use RegEx inside spaCy through the EntityRuler.
# Aim: give numbers shaped like 555-5555 the label PhoneNumber.

text = "This is a sampple number 333 555-5555"

nlp = spacy.blank("en")

ruler = nlp.add_pipe("entity_ruler")

# The regex is written as a raw string: "\d" is not a valid escape sequence in
# an ordinary string literal, and newer Python versions warn about it.
# The pattern the ruler receives is unchanged.
patterns = [{
    "label": "PhoneNumber",
    "pattern": [{"TEXT": {"REGEX": r"((\d){3}-(\d){4})"}}]
}]

ruler.add_patterns(patterns)

doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_)

# This did not work: no entity is printed, because of the '-' in the phone number.
# Each dict in a pattern describes one token, so the REGEX is tested against a single token's text at a time.
# Presumably the tokenizer splits 555-5555 around the dash (check with [t.text for t in doc]), so no single token holds the whole number.
# Without the dash (in both the text and the regex) it would have worked.

In [37]:
# RegEx is the right tool only when the pattern we want to match is independent of the linguistic features spaCy offers.

# Also, multi-word matches (spans) found with a matcher are not added to the entities automatically. Here we will use RegEx to extract multi-word tokens and add them to the entities.
text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."

# "Paul", a space, one capital letter, then one or more word characters.
pattern = r"Paul [A-Z]\w+"

matches = re.finditer(pattern, text)

for match in matches:
    print(match)


<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


In [38]:
# Now let's turn the RegEx matches into spaCy Span objects and add them to the Doc's entities.
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp(text)
original_ents = list(doc.ents)
mwt_ents = []  # multi-word token entities, stored as (token_start, token_end, span_text)
pattern = r"Paul [A-Z]\w+"
for match in re.finditer(pattern, text):
    # match.span() gives the start and end of the hit as character offsets, not token indices.
    start, end = match.span()
    # char_span turns the character offsets into a Span, whose start/end are token indices.
    # It can come back as None, which is why the result is checked before use.
    span = doc.char_span(start, end)
    print(span)
    if span is not None:
        mwt_ents.append((span.start, span.end, span.text))

print(mwt_ents)  # the matched spans, expressed in the token positions spaCy uses

# Unpack into names that do not collide with the notebook-level `text`:
# overwriting `text` with the last span's text would make a re-run of this
# cell tokenize only that fragment instead of the full sentence.
for token_start, token_end, _span_text in mwt_ents:
    per_ent = Span(doc, token_start, token_end, label="PERSON")  # multi-word span as a PERSON entity
    original_ents.append(per_ent)
doc.ents = original_ents
for ent in doc.ents:
    print(ent.text, ent.label_)


Paul Newman
Paul Hollywood
[(0, 2, 'Paul Newman'), (8, 10, 'Paul Hollywood')]
Paul Newman PERSON
Paul Hollywood PERSON


In [39]:
# For simplicity, package the RegEx-to-entity steps as a custom spaCy component.

# The component owns its regex instead of reading the notebook-level `pattern`,
# which other cells rebind (for example to the date pattern).
_PAUL_FULL_NAME = re.compile(r"Paul [A-Z]\w+")


@Language.component("MultiWordToken")
def MultiWordToken(doc):
    """Add a PERSON entity for every "Paul <Capitalised word>" found in the Doc.

    The regex runs over ``doc.text``; each hit's character offsets are turned
    into a Span with ``doc.char_span``. Hits for which ``char_span`` returns
    None are skipped.

    Args:
        doc: The Doc currently being processed by the pipeline.

    Returns:
        The same Doc, with the matched spans merged into ``doc.ents``.
    """
    person_ents = []
    for match in _PAUL_FULL_NAME.finditer(doc.text):
        # match.span() is in character offsets; char_span yields a token-based Span.
        start, end = match.span()
        span = doc.char_span(start, end, label="PERSON")
        print(span)
        if span is not None:
            person_ents.append(span)

    # doc.ents rejects overlapping spans, so resolve overlaps between the
    # existing entities and the new ones first; filter_spans keeps the
    # (first) longest span wherever spans overlap.
    doc.ents = spacy.util.filter_spans(list(doc.ents) + person_ents)
    return doc

# Try the component on its own in a fresh blank English pipeline.
nlp2 = spacy.blank("en")
nlp2.add_pipe("MultiWordToken")

text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."
doc2 = nlp2(text)
print(doc2.ents)


Paul Newman
Paul Hollywood
(Paul Newman, Paul Hollywood)
