In [1]:
import spacy

In [2]:
# Load spaCy's small English pipeline; the 'en_core_web_sm' model package
# must already be installed in this environment.
nlp = spacy.load('en_core_web_sm')

### Tokenization

In [3]:
# Sample sentence reused by the tokenization, tagging, parsing and NER cells below.
doc = nlp("Apple isn't looking at buying U.K. start-up for $1 billion dollar")

In [4]:
# Print the surface text of every token, one per line.
for tok in doc:
    print(tok.text)

Apple
is
n't
looking
at
buying
U.K.
start
-
up
for
$
1
billion
dollar


### Part-of-Speech (POS) Tagging

In [5]:
# Echo the sentence so the tag table below can be read against it.
doc

Apple isn't looking at buying U.K. start-up for $1 billion dollar

In [6]:
# One row per token: surface text, lemma, coarse POS tag and stop-word flag.
# The first three fields are left-aligned and padded to 15 characters.
for token in doc:
    print(f'{token.text:15} {token.lemma_:15} {token.pos_:15} {token.is_stop}')

Apple           apple           PROPN           False
is              be              VERB            True
n't             not             ADV             False
looking         look            VERB            False
at              at              ADP             True
buying          buy             VERB            False
U.K.            u.k.            PROPN           False
start           start           NOUN            False
-               -               PUNCT           False
up              up              NOUN            True
for             for             ADP             True
$               $               SYM             False
1               1               NUM             False
billion         billion         NUM             False
dollar          dollar          NOUN            False


### Dependency Parsing

In [7]:
# For each noun chunk show its text, its root token and that root's
# dependency label.
for noun_phrase in doc.noun_chunks:
    head = noun_phrase.root
    print(f'{noun_phrase.text:{25}} {head.text:{15}} {head.dep_}')

Apple                     Apple           nsubj
U.K. start-up             up              dobj
$1 billion dollar         dollar          pobj


### Named Entity Recognition

In [8]:
# Echo the sentence so the entity labels below can be read against it.
doc

Apple isn't looking at buying U.K. start-up for $1 billion dollar

In [9]:
# List every recognised entity span followed by its label.
for entity in doc.ents:
    print(entity.text, entity.label_)

Apple ORG
U.K. GPE
$1 billion dollar MONEY


### Sentence Segmentation

In [10]:
# Echo the sentence before splitting it into sentences.
doc

Apple isn't looking at buying U.K. start-up for $1 billion dollar

In [11]:
# Each item yielded by doc.sents is one detected sentence.
for sentence in doc.sents:
    print(sentence)

Apple isn't looking at buying U.K. start-up for $1 billion dollar


In [12]:
# Sample text whose parts are joined by '.*.' rather than by ordinary
# sentence-ending punctuation.
doc1 = nlp('Welcome to.*.EPITA.*.Graduate School of Computer Science')

In [13]:
# Show the sentences found in the '.*.'-separated text, one per line.
for sentence in doc1.sents:
    print(sentence)

Welcome to.*.EPITA.*.Graduate School of Computer Science


In [14]:
def set_rule(doc):
    """Flag the token that follows each '---' token as a sentence start.

    Args:
        doc: Indexable, sliceable sequence of tokens exposing ``text``, ``i``
            (the token's index in ``doc``) and a writable ``is_sent_start``;
            a spaCy ``Doc`` when this runs as a pipeline component.

    Returns:
        The same ``doc`` object, modified in place.
    """
    separator = '---'
    # The final token is left out of the scan: nothing follows it to flag.
    for tok in doc[:-1]:
        if tok.text != separator:
            continue
        following = doc[tok.i + 1]
        following.is_sent_start = True
    return doc

In [16]:
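# Re-run helper: uncomment to drop an already-registered 'set_rule' component
# before adding it to the pipeline again.
# nlp.remove_pipe('set_rule')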

In [17]:
# Baseline parse of the '---'-separated text; set_rule has been defined but
# is only added to the pipeline in a later cell.
doc1 = nlp('Welcome to---EPITA---Graduate School of Computer Science')

In [18]:
# Show the sentences found in the '---'-separated text, one per line.
for sentence in doc1.sents:
    print(sentence)

Welcome to---EPITA---Graduate School of Computer Science


In [19]:
# Inspect how the text was tokenized: one token of doc1 per line.
for tok in doc1:
    print(tok.text)

Welcome
to
---
EPITA
---
Graduate
School
of
Computer
Science


In [20]:
# Register set_rule ahead of the parser so its sentence-start flags are in
# place before the parser runs. The name check keeps the cell safe to re-run:
# adding a component whose name is already in the pipeline raises an error.
# NOTE(review): passing the function object is the spaCy v2 calling
# convention; spaCy v3 expects a registered component name instead — confirm
# the installed version.
if 'set_rule' not in nlp.pipe_names:
    nlp.add_pipe(set_rule, before='parser')

In [21]:
# Parse the same text again, after the add_pipe cell above has run.
doc1 = nlp('Welcome to---EPITA---Graduate School of Computer Science')

In [22]:
# Print the sentences of the re-parsed text, one per line.
for sentence in doc1.sents:
    print(sentence)

Welcome to---
EPITA---
Graduate School of Computer Science


### Visualization

In [23]:
from spacy import displacy

In [24]:
# Echo the text held in doc1 before visualizing it.
doc1

Welcome to---EPITA---Graduate School of Computer Science

In [26]:
# Render the dependency parse of doc inline in the notebook.
displacy.render(doc, style='dep', jupyter=True)

In [29]:
# Dependency view of doc1 in displaCy's compact mode with the word distance
# set to 100.
displacy.render(
    doc1,
    style='dep',
    jupyter=True,
    options={'compact': True, 'distance': 100},
)

In [32]:
# Render doc with its named entities highlighted inline in the notebook.
displacy.render(doc, style='ent', jupyter=True)