In [0]:
# Install the two NLP libraries used in this notebook and fetch the small English
# spaCy model that `spacy.load("en_core_web_sm")` loads further down.
# NOTE(review): `-U` pulls whatever spaCy release is newest at run time, while the
# outputs recorded in this notebook were produced with spaCy 2.2.4 (e.g. the
# `-PRON-` lemma) — newer releases may print different results; consider pinning.
!pip install -U spacy
!python -m spacy download en_core_web_sm
# nltk provides the TweetTokenizer used in the tokenization section.
!pip install nltk

Requirement already up-to-date: spacy in /usr/local/lib/python3.6/dist-packages (2.2.4)
✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')


Tokenizing text

In [0]:
import spacy

# Load the small English pipeline once; later cells call `nlp` as well.
nlp = spacy.load("en_core_web_sm")
text = "Mary, don't slap the green witch"

# Lower-case the sentence, tokenize it, and show each token as a plain string.
print(list(map(str, nlp(text.lower()))))

['mary', ',', 'do', "n't", 'slap', 'the', 'green', 'witch']


In [0]:
from nltk.tokenize import TweetTokenizer

# Build the tokenizer first, then feed it the lower-cased tweet.
tokenizer = TweetTokenizer()
tweet = "Snow White and the Seven Degrees #MakeAMovieCold@midnight:-)"
print(tokenizer.tokenize(tweet.lower()))



['snow', 'white', 'and', 'the', 'seven', 'degrees', '#makeamoviecold', '@midnight', ':-)']


Generating n-grams from text

In [0]:
def n_grams(text, n):
    """Return every contiguous run of ``n`` items from ``text``.

    Args:
        text: Any sliceable sequence with a length, such as a list of tokens
            or a string.
        n: Window size; must be at least 1.

    Returns:
        A list of slices of ``text``, each holding ``n`` consecutive items, in
        left-to-right order. The list is empty when ``text`` has fewer than
        ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # A non-positive window would silently yield empty or misaligned slices.
        raise ValueError("n must be a positive integer, got {!r}".format(n))
    # A negative stop makes range() empty, which covers len(text) < n.
    return [text[start:start + n] for start in range(len(text) - n + 1)]

# Word-level example input for the trigram demo.
cleaned = ['mary', ',', "n't", 'slap', 'green', 'witch', '.']
# Character-level alternative: cleaned = ['#', 'm', 'a', 'r', 'y', '#']
print(n_grams(cleaned, n=3))

[['mary', ',', "n't"], [',', "n't", 'slap'], ["n't", 'slap', 'green'], ['slap', 'green', 'witch'], ['green', 'witch', '.']]


Lemmatization

In [0]:
# Show each token next to the lemma spaCy assigns to it.
doc = nlp('he was running late')
for tok in doc:
    print('{} --> {}'.format(tok.text, tok.lemma_))

he --> -PRON-
was --> be
running --> run
late --> late


POS Tagging

In [0]:
# Print every token followed by its part-of-speech tag (`pos_`).
doc = nlp("Mary slapped the green witch.")
for word in doc:
    print('{} - {}'.format(word.text, word.pos_))

Mary - PROPN
slapped - VERB
the - DET
green - ADJ
witch - NOUN
. - PUNCT


Noun Phrase (NP) Chunking

In [0]:
# List each noun chunk found in the sentence together with its label.
doc = nlp("Mary slapped the green witch.")
for noun_phrase in doc.noun_chunks:
    print('{} - {}'.format(noun_phrase.text, noun_phrase.label_))


Mary - NP
the green witch - NP


Predicting Syntactic Dependencies

In [0]:
# One line per token: text, POS tag, dependency relation, and the head token's text.
doc = nlp("She ate the pizza")
for tok in doc:
    print(' '.join([tok.text, tok.pos_, tok.dep_, tok.head.text]))

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


Named Entity Recognition

In [0]:
# Run the pipeline over the example sentence.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Report each predicted entity span alongside its label.
for entity in doc.ents:
    print(entity.text, entity.label_)

Apple ORG
U.K. GPE
$1 billion MONEY
