## Tokenization is the splitting of a paragraph into sentences, or of a sentence into words.
### It breaks the text into tokens so that the model can understand it better.
### Here we are going to implement tokenization using spaCy

In [6]:
import spacy
# Load the 'en_core_web_sm' pipeline once; every doc in this notebook is created with it.
nlp = spacy.load('en_core_web_sm')

## Let's create a custom string

In [3]:
# Sample text whose literal double quotes, contraction and abbreviation
# give the tokenizer something non-trivial to split.
mystring = "\"We're moving to L.A!\""

In [4]:
# Print the raw text to confirm that the surrounding double quotes are part of the string.
print(mystring)

"We're moving to L.A!"


In [7]:
# Run the pipeline on the sample string; the tokens are inspected in the next cell.
doc = nlp(mystring)

#### You can see that spaCy neatly converted the text into tokens (for example, the `!` becomes its own token, but the `.` inside `L.A` is not split off).

In [16]:
# Show each token of the sample string on its own line.
for tok in doc:
    print(tok.text)

"
We
're
moving
to
L.A
!
"


### Now we will try some more complex text that contains more special characters

In [17]:
# Text mixing punctuation, a hyphenated word, an e-mail address and a URL.
doc2 = nlp(
    u"We're here to help! Send snail-mail, email support@gmail.com or visit us at http://www.google.com!"
)

In [18]:
# One token per line: the e-mail address and the URL each stay a single token.
for token in doc2:
    print(token.text)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@gmail.com
or
visit
us
at
http://www.google.com
!


#### Let's create another doc

In [19]:
# Sentence containing a unit glued to a number and a currency amount.
doc3 = nlp(u"A 5km NYC cab ride costs $10.30")

### Here `km` is separated from `5`, and the `$` sign is separated from the price

In [20]:
# List the tokens of doc3, one per line.
for token in doc3:
    print(token.text)

A
5
km
NYC
cab
ride
costs
$
10.30


### To check the number of tokens in a doc

In [21]:
# Number of tokens in doc3 (matches the nine tokens printed above).
len(doc3)

9

### Vocab: the vocabulary object attached to a doc, which contains the full set of vocabulary items

In [25]:
# Number of entries in the vocabulary object attached to doc3.
len(doc3.vocab)

505

In [31]:
# Same check for doc2; the count matches doc3's.
# NOTE(review): presumably both docs share the vocab of the `nlp` pipeline — confirm.
len(doc2.vocab)

505

In [33]:
# Short sentence used below to demonstrate indexing and slicing.
doc5 = nlp(u"It is better to give than recieve.")

In [35]:
# A doc can be indexed and sliced like a sequence:
# position 6 is a single token, and [:4] covers the first four tokens.
print(doc5[6], doc5[:4], sep="\n")


recieve
It is better to


##### You cannot assign a new value to a token in a doc

In [36]:
# Item assignment on a Doc is not supported and raises TypeError.
# The error is the point of this cell, so catch it and display the message
# rather than letting the exception stop a "Run All" of the notebook.
try:
    doc[3] = "hi"
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

### We can also look at the named entity of a word (e.g. India → location, dollar amount → money, etc.)

In [43]:
# Sentence mentioning an organisation, a place and an amount of money.
doc6 = nlp(u"Apple to build a india factory for $6 milion")

In [44]:
# Print every named entity found in doc6.
for ent in doc6.ents:
    print(ent)

Apple
india
$6 milion


In [45]:
# Print each entity together with the name of its label.
for ent in doc6.ents:
    print(ent)
    print(ent.label_)
    print('\n')  # blank lines to separate entities in the output

Apple
ORG


india
GPE


$6 milion
MONEY




In [46]:
# Print each entity, its label, and the human-readable description
# that spacy.explain returns for that label.
for ent in doc6.ents:
    label = ent.label_
    print(ent)
    print(label)
    print(spacy.explain(label))
    print('\n')  # blank lines to separate entities in the output

Apple
ORG
Companies, agencies, institutions, etc.


india
GPE
Countries, cities, states


$6 milion
MONEY
Monetary values, including unit




### We can even get the noun chunks (noun phrases) in the string

In [47]:
# Sentence with several noun phrases to extract.
doc9 = nlp(u"autonomous cars shift liability toward manufacturers.")

In [48]:
# Print each noun chunk found in doc9.
for chunk in doc9.noun_chunks:
    print(chunk)

autonomous cars
liability
manufacturers


### We can also visualize the tokens in different styles, e.g. to show the parts of speech in a sentence, or the places and other entities it mentions.
### This can be done using displaCy

In [49]:
from spacy import displacy

In [50]:
# Sentence to feed into the displaCy visualizers below.
doc10 = nlp(u"Apple is going to build U.K. factory for $6 million.")

In [53]:
# Render doc10 inline with the 'dep' style, passing a 'distance' of 50 as a renderer option.
displacy.render(
    doc10,
    style='dep',
    jupyter=True,
    options={'distance': 50},
)

##### You can also visualize the output of the entity recognizer

In [54]:
# Render the named entities of doc10 inline.
# The 'distance' option used with the 'dep' style is a dependency-visualizer
# setting and has no effect on the 'ent' style, so it is not passed here.
displacy.render(doc10, style='ent', jupyter=True)

In [59]:
# Another example sentence with several entities to highlight.
doc11 = nlp(
    u"Apple has sold ten thousand iPods last year with a profit of $60 million"
)

In [60]:
# Highlight the entities of doc11 inline in the notebook.
displacy.render(doc11, style='ent', jupyter=True)

In [61]:
# Build one more doc and render it straight away with the entity visualizer.
doc12 = nlp(u"leave that half quarter of whiskey for me")
displacy.render(doc12, style='ent', jupyter=True)

In [63]:
# Outside a notebook (i.e. in a plain Python script) use displacy.serve instead of render.
# It starts a web server and prints the address it is serving on;
# open 127.0.0.1:<port> in a browser to view the visualization.
# NOTE(review): the call appears to keep running until the server is shut down
# (see the "Shutting down server" line in the output) — confirm before adding cells after it.
displacy.serve(doc12,style='ent')

  "__main__", mod_spec)



Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.
