# Text Processing using spaCy

In [1]:
import numpy as np
import pandas as pd
##Install spacy using pip (pip install spacy) then download 
#python -m spacy download en  #for english
#python -m spacy download de  #for German
#python -m spacy download es  #for spanish
#python -m spacy download pt  #for portuguese
#python -m spacy download xx  #for multi-language
#python -m spacy download en_core_web_lg
#python -m spacy download en_core_web_sm
import spacy
import os

<pre>
At the center of spaCy is the object containing the processing pipeline. You can get its class from the respective language package in spaCy; for English it is spacy.lang.en.English.</pre>

In [3]:
# Create an English pipeline instance from the en_core_web_lg model
nlp = spacy.load("en_core_web_lg")
# spacy.load returns a spaCy English object instance
nlp

<spacy.lang.en.English at 0x2174dd7fc88>

In [8]:
text = '''i don't know whether the weather will improve or not.'''

In [9]:
text

"i don't know whether the weather will improve or not."

In [10]:
# A Doc is a container for accessing the annotations. It lets you access information about the text in a structured way.
# No information is lost. It behaves like a normal Python sequence.
doc = nlp(text)

In [11]:
#You can check all the methods in https://spacy.io/api/doc
print(doc[0])
print(doc[0:5])

i
i don't know whether


In [12]:
# lets take a token
token = doc[0]

In [13]:
# See https://spacy.io/api/token for the full list of Token attributes and methods
print(token.i) # index of the token within its doc
print(token.text) # text of the token
print(token.is_alpha) # whether the token consists of alphabetic characters
print(token.nbor()) # the neighboring (next) token
print(token.vector_norm) # norm of the token's word vector (GloVe vector)
print(token.similarity(token.nbor())) # similarity between this token and its neighbor
print(token.lemma_) # lemma (base form) of the token

0
i
True
do
6.4231944
0.5209518
i


<pre><li><b>Lexemes - </b>These are just like tokens but context independent.</li></pre>

In [14]:
lexemes = nlp.vocab['segmentation']

In [15]:
print(lexemes.orth)# hash number 
print(lexemes.is_ascii)
print(lexemes.has_vector)

7380086911887762763
True
True


<pre>
<li>spaCy encodes all strings as hash values to save memory.</li></pre>

In [16]:
print(nlp.vocab.strings['segmentation'])
#you can also get in the reverse order
print(nlp.vocab.strings[7380086911887762763])

7380086911887762763
segmentation


<pre><pre><b>Tokenization</b></pre></pre>

In [17]:
print([token.text for token in doc])

['i', 'do', "n't", 'know', 'whether', 'the', 'weather', 'will', 'improve', 'or', 'not', '.']


<pre>
<img src='https://spacy.io/tokenization-57e618bd79d933c4ccd308b5739062d6.svg'>

You can customize the tokenizer's prefix, suffix and infix rules, as well as its special-case exceptions.</pre>

In [18]:
import re

from spacy.tokenizer import Tokenizer

# Characters split off the START of a token: [ ( " '
# The brackets are escaped so the character class is unambiguous; an unescaped
# "[[" makes Python emit a "Possible nested set" FutureWarning.
prefix_re = re.compile(r'''^[\[("']''')
# Characters split off the END of a token: ] ) " '
suffix_re = re.compile(r'''[\])"']$''')
# Characters that split a token in the middle: - ~
infix_re = re.compile(r'''[-~]''')
# Matches strings that begin with http:// or https://; custom_tokenizer passes
# this as token_match.
simple_url_re = re.compile(r'''^https?://''')

def custom_tokenizer(nlp):
    """Build a spaCy Tokenizer that uses this cell's custom regex rules.

    Args:
        nlp: a loaded spaCy language object; only its ``vocab`` is used here.

    Returns:
        A ``Tokenizer`` built on ``nlp.vocab`` with the prefix, suffix, infix
        and token-match callables taken from the module-level regexes.
    """
    # Each rule is the bound regex method the Tokenizer expects for that slot.
    rules = {
        "prefix_search": prefix_re.search,
        "suffix_search": suffix_re.search,
        "infix_finditer": infix_re.finditer,
        "token_match": simple_url_re.match,
    }
    return Tokenizer(nlp.vocab, **rules)

nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = custom_tokenizer(nlp)
doc = nlp(u"hello-world.")
print([t.text for t in doc])

['hello', '-', 'world.']


In [19]:
# Create an English pipeline instance from the en_core_web_lg model
# (this replaces the custom-tokenizer pipeline assigned to `nlp` above)
nlp = spacy.load("en_core_web_lg")
# spacy.load returns a spaCy English object instance
nlp

<spacy.lang.en.English at 0x18a4996f048>

In [20]:
doc = nlp(u"hello-world.")
print([t.text for t in doc])

['hello', '-', 'world', '.']


In [28]:
text = """I like Camila Cabello. She lives in New York. She recently bought a red car."""

In [29]:
nlp = spacy.load("en_core_web_lg")

In [30]:
#now we have nlp object and it contains below in that pipeline
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x18af627de10>),
 ('parser', <spacy.pipeline.DependencyParser at 0x18af618e410>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x18af617ac50>)]

In [31]:
doc = nlp(text)
print([token.text for token in doc])

['I', 'like', 'Camila', 'Cabello', '.', 'She', 'lives', 'in', 'New', 'York', '.', 'She', 'recently', 'bought', 'a', 'red', 'car', '.']


<pre>"Camila Cabello" is a person's name, but it came out as two tokens. "New York" is a place name, but it was also split into two tokens. So we have to merge those.

In spaCy, the nlp object runs the text through the components of its pipeline, as shown below.
<img src="https://d33wubrfki0l68.cloudfront.net/16b2ccafeefd6d547171afa23f9ac62f159e353d/48b91/pipeline-7a14d4edd18f3edfee8f34393bff2992.svg"> 
So now we have to add another component after the named entity recognizer to merge those entities. spaCy has a built-in component called "merge_entities" that does exactly this. </pre>

In [32]:
# Append the built-in "merge_entities" component to the end of the pipeline,
# i.e. after "ner". The guard keeps this cell re-runnable: add_pipe raises if
# a component with the same name is already in the pipeline.
if "merge_entities" not in nlp.pipe_names:
    # Create the pipe
    merge_entities = nlp.create_pipe("merge_entities")
    # Add it to the nlp object
    nlp.add_pipe(merge_entities)
# Show all components currently in the pipeline
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x18af627de10>),
 ('parser', <spacy.pipeline.DependencyParser at 0x18af618e410>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x18af617ac50>),
 ('merge_entities', <function spacy.pipeline.merge_entities>)]

In [33]:
doc = nlp(text)
print([token.text for token in doc])

['I', 'like', 'Camila Cabello', '.', 'She', 'lives', 'in', 'New York', '.', 'She', 'recently', 'bought', 'a', 'red', 'car', '.']


In [None]:
# If you want to write your own pipeline component, you can write it like this:
# it has to take a Doc object as input and return a Doc object as output.
def merge_entities(doc):
    """Collapse every entity span of ``doc`` into a single token.

    Args:
        doc: the document to process; its ``ents`` are merged through
            ``doc.retokenize()``.

    Returns:
        The same ``doc`` object that was passed in, after the merges.
    """
    with doc.retokenize() as merger:
        for span in doc.ents:
            # The merged token takes its tag and dependency label from the
            # span's root token and its entity type from the span's label.
            head = span.root
            merger.merge(
                span,
                attrs={"tag": head.tag, "dep": head.dep, "ent_type": span.label},
            )
    return doc

<pre> "a red car" is a single unit (a noun chunk), so we have to merge phrases of this type as well. spaCy has a built-in component called "merge_noun_chunks" that does exactly this. </pre>

In [34]:
# Append the built-in "merge_noun_chunks" component to the end of the pipeline.
# The guard keeps this cell re-runnable: add_pipe raises if a component with
# the same name is already in the pipeline.
if "merge_noun_chunks" not in nlp.pipe_names:
    # Create the pipe
    merge_noun_chunk = nlp.create_pipe("merge_noun_chunks")
    # Add it to the nlp object
    nlp.add_pipe(merge_noun_chunk)
# Show all components currently in the pipeline
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x18af627de10>),
 ('parser', <spacy.pipeline.DependencyParser at 0x18af618e410>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x18af617ac50>),
 ('merge_entities', <function spacy.pipeline.merge_entities>),
 ('merge_noun_chunks', <function spacy.pipeline.merge_noun_chunks>)]

In [35]:
doc = nlp(text)
print([token.text for token in doc])

['I', 'like', 'Camila Cabello', '.', 'She', 'lives', 'in', 'New York', '.', 'She', 'recently', 'bought', 'a red car', '.']


<pre><pre><b>Lemmatization</b></pre></pre>

In [37]:
# You can get the lemma of each token directly from the doc
print([(token.text,token.lemma_) for token in doc])

[('I', '-PRON-'), ('like', 'like'), ('Camila Cabello', 'camila cabello'), ('.', '.'), ('She', '-PRON-'), ('lives', 'live'), ('in', 'in'), ('New York', 'new york'), ('.', '.'), ('She', '-PRON-'), ('recently', 'recently'), ('bought', 'buy'), ('a red car', 'a'), ('.', '.')]


<pre><pre><b>POS Tagging</b></pre></pre>

In [38]:
print([(token.text,token.pos_) for token in doc])

[('I', 'PRON'), ('like', 'VERB'), ('Camila Cabello', 'PROPN'), ('.', 'PUNCT'), ('She', 'PRON'), ('lives', 'VERB'), ('in', 'ADP'), ('New York', 'PROPN'), ('.', 'PUNCT'), ('She', 'PRON'), ('recently', 'ADV'), ('bought', 'VERB'), ('a red car', 'NOUN'), ('.', 'PUNCT')]


<pre><pre><b>Entity Detection</b></pre></pre>

In [39]:
doc.ents

(Camila Cabello, New York)

In [40]:
[(ent, ent.label_) for ent in doc.ents]

[(Camila Cabello, 'PERSON'), (New York, 'GPE')]

In [41]:
from spacy import displacy
# Render the entities of `doc` with displaCy's "ent" style, in Jupyter mode
displacy.render(doc, style="ent", jupyter=True)