#### Tokenization is the process of splitting text into meaningful segments.

In [1]:
import spacy

In [2]:
nlp = spacy.blank('en')

In [5]:
text = 'Dr. Samrat loves momo of Pokhara as it costs only 2$ per plate.'

In [6]:
doc = nlp(text)

for token in doc:
    print(token)

Dr.
Samrat
loves
momo
of
Pokhara
as
it
costs
only
2
$
per
plate
.


<img src='assets\tokenization.png' width=500 height=500>

In [16]:
new_doc = nlp('''"Let's go to N.Y.!"''')
for token in new_doc:
    print(token)

"
Let
's
go
to
N.Y.
!
"


In [24]:
span = doc[1:6]
doc[1:6]

Samrat loves momo of Pokhara

In [23]:
type(nlp), type(doc), type(token), type(span)

(spacy.lang.en.English,
 spacy.tokens.doc.Doc,
 spacy.tokens.token.Token,
 spacy.tokens.span.Span)

In [25]:
doc = nlp('Tony gave two $ to Peter.')
token = doc[0]
token

Tony

In [26]:
token.is_alpha

True

In [27]:
token.like_num

False

In [29]:
token = doc[2]
token.text

'two'

In [30]:
token.like_num

True

In [31]:
token = doc[3]
token.text

'$'

In [32]:
token.is_currency

True

In [37]:
for token in doc:
    print(token, '\t\t index:', token.i, 'is_alpha:', token.is_alpha, 
          'like_punct:', token.is_punct, 'like_num:', token.like_num, 
          'is_currency:', token.is_currency)

Tony 		 index: 0 is_alpha: True like_punct: False like_num: False is_currency: False
gave 		 index: 1 is_alpha: True like_punct: False like_num: False is_currency: False
two 		 index: 2 is_alpha: True like_punct: False like_num: True is_currency: False
$ 		 index: 3 is_alpha: False like_punct: False like_num: False is_currency: True
to 		 index: 4 is_alpha: True like_punct: False like_num: False is_currency: False
Peter 		 index: 5 is_alpha: True like_punct: False like_num: False is_currency: False
. 		 index: 6 is_alpha: False like_punct: True like_num: False is_currency: False


In [40]:
with open('assets/tokenization.txt') as f:
    text = f.readlines()

text

['Harvard University\n',
 '\n',
 'Name        Birth day        email\n',
 '----        ---------        -----\n',
 'Samrat      5 June, 1990     samrat@metal.com\n',
 'Killer      6 Jan, 1880      killer@bee.com\n',
 'Yeti        17 May, 2000     mr@yeti.com\n',
 'Ubermensch  10 Feb, 1992     ubermensch@superman.com']

In [41]:
text = ' '.join(text)
text



In [42]:
doc = nlp(text)
emails = []

for token in doc:
    if token.like_email:
        emails.append(token.text)

emails        

['samrat@metal.com',
 'killer@bee.com',
 'mr@yeti.com',
 'ubermensch@superman.com']

In [43]:
nlp = spacy.blank('ne')

In [60]:
text = 'नमस्ते सबैला। आशा छ सबै जना सन्चै हुनुहुन्छ होला। म पनि यता आरामै छु! हजुरको नाम क हो? मसंग रू. ५ छ। '

In [61]:
doc = nlp(text)

for token in doc:
    print(token, '\t\t index:', token.i, 'is_alpha:', token.is_alpha, 
          'like_punct:', token.is_punct, 'like_num:', token.like_num, 
          'is_currency:', token.is_currency)

नमस्ते 		 index: 0 is_alpha: False like_punct: False like_num: False is_currency: False
सबैला 		 index: 1 is_alpha: False like_punct: False like_num: False is_currency: False
। 		 index: 2 is_alpha: False like_punct: True like_num: False is_currency: False
आशा 		 index: 3 is_alpha: False like_punct: False like_num: False is_currency: False
छ 		 index: 4 is_alpha: True like_punct: False like_num: True is_currency: False
सबै 		 index: 5 is_alpha: False like_punct: False like_num: False is_currency: False
जना 		 index: 6 is_alpha: False like_punct: False like_num: False is_currency: False
सन्चै 		 index: 7 is_alpha: False like_punct: False like_num: False is_currency: False
हुनुहुन्छ 		 index: 8 is_alpha: False like_punct: False like_num: False is_currency: False
होला 		 index: 9 is_alpha: False like_punct: False like_num: False is_currency: False
। 		 index: 10 is_alpha: False like_punct: True like_num: False is_currency: False
म 		 index: 11 is_alpha: True like_punct: False like_num: Fa

In [63]:
doc = nlp('gimme double cheese extra large healthy pizza')

tokens = [token.text for token in doc]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [64]:
from spacy.symbols import ORTH

In [66]:
nlp.tokenizer.add_special_case('gimme', [
    {ORTH: 'gim'},
    {ORTH: 'me'}
])

In [67]:
doc = nlp('gimme double cheese extra large healthy pizza')

tokens = [token.text for token in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [69]:
nlp.pipe_names

[]

In [70]:
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x2553ba13dd0>

In [71]:
nlp.pipe_names

['sentencizer']

In [72]:
doc = nlp('Dr. Strange loves momo of pokhara. Hulk loves yomari of kathmandu.')

for sentence in doc.sents:
    print(sentence)

Dr.
Strange loves momo of pokhara.
Hulk loves yomari of kathmandu.


### **Exercise**

**1. Extract all the urls**

In [73]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

In [75]:
doc = nlp(text)

tokens = [token.text for token in doc if token.like_url]
tokens

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

**2. Extract all the money transaction from below sentence along with currency**

In [84]:
nlp = spacy.load('en_core_web_sm')

In [85]:
text = 'Tony gave two $ to Peter, Bruce gave 500 € to Steve.'

In [86]:
doc = nlp(text)

for token in doc:
    if token.like_num and doc[token.i+1].is_currency:
        print(token.text, doc[token.i+1].text)  

two $
500 €
