# Install models and process text
```python
# Fetch the small English and German model packages that the next cell
# loads by name.
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm
```

In [1]:
import spacy

# One pipeline per language, loaded in order: English first, then German.
nlp, nlp_de = (
    spacy.load(package)
    for package in ('en_core_web_sm', 'de_core_news_sm')
)

In [2]:
doc = nlp('Hello, world. Here are two sentences.')

# Iterating over the processed Doc yields its tokens in order.
token_texts = [token.text for token in doc]
print(token_texts)

['Hello', ',', 'world', '.', 'Here', 'are', 'two', 'sentences', '.']


In [3]:
doc_de = nlp_de('Ich bin ein Berliner.')

# Same token listing as for the English text, using the German pipeline.
german_token_texts = [token.text for token in doc_de]
print(german_token_texts)

['Ich', 'bin', 'ein', 'Berliner', '.']


# Get tokens, noun chunks & sentences

In [4]:
# Adjacent string literals are concatenated into one text before processing.
peach_text = ("Peach emoji is where it has always been. Peach is the superior "
              "emoji. It's outranking eggplant 🍑 ")
doc = nlp(peach_text)

# Integer indices select single tokens; a slice selects a span of tokens.
pieces = (
    doc[0],      # Peach
    doc[1],      # emoji
    doc[-1],     # 🍑
    doc[17:19],  # outranking eggplant
)
for piece in pieces:
    print(piece.text)

Peach
emoji
🍑
outranking eggplant


In [5]:
# Collect doc.noun_chunks into a list so it can be indexed and printed whole.
noun_chunks = [chunk for chunk in doc.noun_chunks]
first_chunk = noun_chunks[0]

print(first_chunk.text)  # Peach emoji
print(noun_chunks)

Peach emoji
[Peach emoji, it, Peach, the superior emoji, It, eggplant 🍑]


In [6]:
# Collect the sentence spans of the current doc into a list.
sentences = [sentence for sentence in doc.sents]

# Sanity check: three sentences are expected for this text.
sentence_count = len(sentences)
assert sentence_count == 3
print(sentences)

[Peach emoji is where it has always been., Peach is the superior emoji., It's outranking eggplant 🍑]


# Get part-of-speech tags and flags

In [7]:
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
apple = doc[0]

print('Fine-grained POS tag', apple.pos_, apple.pos)
print('Coarse-grained POS tag', apple.tag_, apple.tag)
print('Word shape', apple.shape_, apple.shape)
print('Alphanumeric characters?', apple.is_alpha)
print('Punctuation mark?', apple.is_punct)

Fine-grained POS tag PROPN 95
Coarse-grained POS tag NNP 15794550382381185553
Word shape Xxxxx 16072095006890171862
Alphanumeric characters? True
Punctuation mark? False


In [8]:
billion = doc[10]

# Pair each question with the token flag that answers it, then print them
# one per line.
flag_checks = (
    ('Digit?', billion.is_digit),
    ('Like a number?', billion.like_num),
    ('Like an email address?', billion.like_email),
)
for question, answer in flag_checks:
    print(question, answer)

Digit? False
Like a number? True
Like an email address? False


# Use hash values for any string

In [9]:
doc = nlp('I love coffee')

# The string store is indexed in both directions: string -> hash, hash -> string.
string_store = nlp.vocab.strings
coffee_hash = string_store['coffee']     # 3197928453018144401
coffee_text = string_store[coffee_hash]  # 'coffee'

print(coffee_hash, coffee_text)

# The third token's `orth` and `text` are printed next to the looked-up values.
coffee_token = doc[2]
print(coffee_token.orth, coffee_hash)  # 3197928453018144401
print(coffee_token.text, coffee_text)  # 'coffee'

3197928453018144401 coffee
3197928453018144401 3197928453018144401
coffee coffee


In [10]:
# add() returns the hash of the string, which is then used to look the
# string up again.
string_store = doc.vocab.strings
beer_hash = string_store.add('beer')  # 3073001599257881079
beer_text = string_store[beer_hash]   # 'beer'

print(beer_hash, beer_text)

3073001599257881079 beer


In [11]:
# The string added here is the emoji followed by a space. The hash in the
# inline comment is the one shown in this cell's recorded output.
unicorn_hash = doc.vocab.strings.add(u'🦄 ')  # 17758882941175878347
unicorn_text = doc.vocab.strings[unicorn_hash]  # '🦄 '

print(unicorn_hash, unicorn_text)

17758882941175878347 🦄 


# Recognize and update named entities

In [12]:
doc = nlp('San Francisco considers banning sidewalk delivery robots')

# One line per entity: text, start/end character offsets, label.
for entity in doc.ents:
    entity_row = (entity.text, entity.start_char, entity.end_char, entity.label_)
    print(*entity_row)

San Francisco 0 13 GPE


In [13]:
from spacy.tokens import Span

doc = nlp('FB is hiring a new VP of global policy')

# Build a span over the first token with the ORG label and assign it as the
# doc's only entity.
org_label = doc.vocab.strings['ORG']
fb_entity = Span(doc, 0, 1, label=org_label)
doc.ents = [fb_entity]

# One line per entity: text, start/end character offsets, label.
for entity in doc.ents:
    print(entity.text, entity.start_char, entity.end_char, entity.label_)

FB 0 2 ORG


# Train and update neural network models

```python
!python -m spacy download en
```

```python
import random

nlp = spacy.load('en')
# A single training example: the text plus the character span (0, 4) labelled ORG.
train_data = [("Uber blew through $1 million", {'entities': [(0, 4, 'ORG')]})]

# Every pipeline component except 'ner' is disabled inside the `with` block.
other_pipes = [name for name in nlp.pipe_names if name != 'ner']
with nlp.disable_pipes(*other_pipes):
    # NOTE(review): whether begin_training() keeps or resets the loaded weights
    # depends on the installed spaCy version -- confirm before relying on this.
    optimizer = nlp.begin_training()
    for epoch in range(10):
        random.shuffle(train_data)
        for text, annotations in train_data:
            nlp.update([text], [annotations], sgd=optimizer)

# NOTE(review): '/model' is an absolute path at the filesystem root; point it
# at a writable directory before running this snippet.
nlp.to_disk('/model')
```

# Visualize a dependency parse and named entities in your browser

In [14]:
from spacy import displacy

doc_dep = nlp('This is a sentence.')

# Serve the dependency visualization; the recorded output below shows the
# server listening on port 5000 until it is shut down.
displacy.serve(doc_dep, style='dep')


    Serving on port 5000...
    Using the 'dep' visualizer



127.0.0.1 - - [12/Jun/2018 19:39:45] "GET / HTTP/1.1" 200 3057
127.0.0.1 - - [12/Jun/2018 19:39:45] "GET /favicon.ico HTTP/1.1" 200 3057



    Shutting down server on port 5000.



In [15]:
# Adjacent string literals are concatenated into one sentence.
thrun_text = ('When Sebastian Thrun started working on self-driving cars at '
              'Google in 2007, few people outside of the company took him '
              'seriously.')
doc_ent = nlp(thrun_text)

# Serve the named-entity visualization of the sentence.
displacy.serve(doc_ent, style='ent')


    Serving on port 5000...
    Using the 'ent' visualizer



127.0.0.1 - - [12/Jun/2018 19:40:10] "GET / HTTP/1.1" 200 1691
127.0.0.1 - - [12/Jun/2018 19:40:10] "GET /favicon.ico HTTP/1.1" 200 1691



    Shutting down server on port 5000.



# Get word vectors and similarity

```python
# Fetch the two packages referenced in the similarity example below.
!python -m spacy download en_core_web_md
!python -m spacy download en_vectors_web_lg
```

In [16]:
# Alternative: nlp = spacy.load('en_core_web_md')
# en_vectors_web_lg gives the best result.
nlp = spacy.load('en_vectors_web_lg')
doc = nlp("Apple and banana are similar. Pasta and hippo aren't.")

# Pick the four tokens to compare by their position in the doc.
apple, banana, pasta, hippo = (doc[position] for position in (0, 2, 6, 8))

print('apple <-> banana', apple.similarity(banana))
print('pasta <-> hippo', pasta.similarity(hippo))

# Print has_vector for each of the four tokens on one line.
print(*(token.has_vector for token in (apple, banana, pasta, hippo)))

apple <-> banana 0.5831844
pasta <-> hippo 0.07934912
True True True True


# Simple and efficient serialization

https://github.com/explosion/spacy-notebooks/blob/master/notebooks/lightning_tour.ipynb