# Tokenization

Tokenizatiom is the task of converting a unicode text into tokens. The output is a Doc object

In [1]:
import spacy

In [2]:
text2 ='Apple is looking for buying a U.K. startup for $1 Billion'

In [3]:
# For loading a small spacy model with English class
nlp=spacy.load('en_core_web_sm')

In [4]:
doc=nlp(text2)

In [5]:
# Iterating through the ouput document that contains a list of tokens
for i in doc:
    print(i.text)

Apple
is
looking
for
buying
a
U.K.
startup
for
$
1
Billion


# POS 

In [6]:
# doc is an iterable object that contains a list of tokens
doc

Apple is looking for buying a U.K. startup for $1 Billion

In [7]:

for i in doc:
    print(i.text,'    ',i.pos_,'    ',i.pos)

Apple      PROPN      96
is      AUX      87
looking      VERB      100
for      ADP      85
buying      VERB      100
a      DET      90
U.K.      PROPN      96
startup      NOUN      92
for      ADP      85
$      SYM      99
1      NUM      93
Billion      NUM      93


# Dependency Parsing

In [8]:
pip install displacy

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement displacy (from versions: none)
ERROR: No matching distribution found for displacy


In [9]:
from spacy import displacy

In [10]:
# below method is used for visualization
displacy.render(doc,style='dep',options={'distance':100,'compact':True})

# Named Entity Recognition

In [11]:
doc=nlp('Apple is looking for buying a U.K. startup for $1 Billion by March 2022 at 50% rate')

In [12]:
for i in doc.ents:
    print(i," ",i.label_)

Apple   ORG
U.K.   GPE
$1 Billion   MONEY
March 2022   DATE
50%   PERCENT


In [13]:
# conducting visualizations
displacy.render(doc,style='ent')

# Sentence Segmentation

In [14]:
doc=nlp('Apple is looking for buying a U.K. startup for $1 Billion. By March 2022 at 50% rate. Is it possible.')

In [15]:
for i in doc.sents:
    print(i,i.sents)

Apple is looking for buying a U.K. startup for $1 Billion. <generator object at 0x000001BD49EEB720>
By March 2022 at 50% rate. <generator object at 0x000001BD49EEB720>
Is it possible. <generator object at 0x000001BD49EEB720>


# Rule Based Matching

In [16]:
doc=nlp('Hello, worLd! Hello WORLD.Thank You')

In [17]:
from spacy.matcher import Matcher
from spacy.tokens import Span

In [18]:
# creating a matcher object
matcher=Matcher(nlp.vocab)

In [19]:
# Defining Pattern 
# IS_PUNCT is set to true, OP means optional. Thats why punctuation is optional. THe matcher will find patterns that will and will not contain punctuation in between
patt1=[{'LOWER':'hello'},{'IS_PUNCT':True,'OP':'?'},{'LOWER':'world'}]
patt2=[{'LOWER':'thank'},{'LOWER':'you'}]

In [20]:
# add the pattern to Matcher object
matcher.add('nimi', [patt1])
matcher.add('nimi', [patt2])


In [21]:
matches=matcher(doc)

In [22]:
print(matches)

[(9813159703374050914, 0, 3)]


In [23]:
for match_id,start,end in matches:
    span=doc[start:end]
    string_id=nlp.vocab.strings[match_id]
    print(match_id,start,end,span.text)

9813159703374050914 0 3 Hello, worLd


# Regular Expressions Part1

In [38]:
import re

In [50]:
text='my cell number is 123. Ohh correct phone no is 1234567890. call me.'

In [51]:
text

'my cell number is 123. Ohh correct phone no is 1234567890. call me.'

In [52]:
# r is for Raw string
re.search(r'\d{3,10}',text)   #search for a digit


<re.Match object; span=(18, 21), match='123'>

In [53]:
re.findall(r'\d',text)   #find any one digit
re.findall(r'\d{6}',text)

['123456']

In [54]:
re.findall(r'\d+',text)

['123', '1234567890']

In [55]:
#finding words
re.findall(r'\w+',text)

['my',
 'cell',
 'number',
 'is',
 '123',
 'Ohh',
 'correct',
 'phone',
 'no',
 'is',
 '1234567890',
 'call',
 'me']

# Wildcard and Exclusion Matching NLP

In [56]:
text

'my cell number is 123. Ohh correct phone no is 1234567890. call me.'

In [60]:
# finding words that start with ph and has only 3 characters afetr ph
re.findall(r'ph...',text)

['phone no is 1234567890. call me.']

In [62]:
re.findall(r'ph...+',text)  #putting + will display the rest of the sentence after the pattern that is matched

['phone no is 1234567890. call me.']

In [63]:
re.findall(r'2...',text)

['23. ', '2345']

In [68]:
# display the entire sentence but do not display any digits. ^ is for skipping that portion
re.findall(r'[^\d]+',text)

['my cell number is ', '. Ohh correct phone no is ', '. call me.']

In [70]:
# display only the digits of the sentence
re.findall(r'[^\D]+',text)

['123', '1234567890']

# Language Processing Pipeline

In [97]:
text=['net income was $9.4 million compared to the prior year of 2.7$ million. Revenue exceeds tweleve billion dollars with a loss of $1b']

In [98]:
text

['net income was $9.4 million compared to the prior year of 2.7$ million. Revenue exceeds tweleve billion dollars with a loss of $1b']

whenever we are playing with the pipeline. we need to use a fresh model of nlp

In [99]:
nlp=spacy.load('en_core_web_sm')

In [100]:
nlp

<spacy.lang.en.English at 0x1bd452e2d60>

 Now we are doing the batch proceesing and disabling the components of the pipeline that we dont need such as parser, tagger so as to save processing time.

In [103]:
docs=nlp.pipe(text,disable=['tagger','parser'])    #Divides the entire input into batches

In [107]:
%%timeit   ##timeit function calculates the time to run below code
docs=nlp.pipe(text,disable=['tagger','parser'])    #Divides the entire input into batches
for doc in docs:
    for ent in doc.ents:
        print(ent.text,ent.label_)
    print()

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ mill

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ mill

1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ mill

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ mill

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ mill

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ mill

In [108]:
%%timeit
docs=nlp.pipe(text,disable=['tagger','parser'])    #Divides the entire input into batches
for doc in docs:
    for ent in doc.ents:
        print(ent.text,ent.label_)
    print()

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ mill

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ mill

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ mill

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ mill

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ mill

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ mill

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ mill

# Hastags and Emoji Detection