In [1]:
'''
%pip install spacy
%pip install nltk

books to read:
Foundations of Statistical Natural Language Processing
Natural Language Processing with Python: Analyzing Text with the Natural Language Toolkit
Linguistic Structure Prediction
Speech and Language Processing
Artificial Intelligence: A Modern Approach
Feature Engineering for Machine Learning: Principles and Techniques for Data Scientists
'''
import spacy

In [2]:
# The English model has to be downloaded once before it can be loaded:
# !python -m spacy download en_core_web_sm

nlp = spacy.load('en_core_web_sm')
text = "Mary, don't slap the green witch"
# Lower-case the sentence, run it through the pipeline and show each token as text.
lowercase_tokens = list(map(str, nlp(text.lower())))
print(lowercase_tokens)

['mary', ',', 'do', "n't", 'slap', 'the', 'green', 'witch']


In [3]:
from nltk.tokenize import TweetTokenizer

# Tokenize a lower-cased tweet; the printed result shows the hashtag, the
# @-mention and the emoticon each coming out as a single token.
tweet = u"Snow White and the Seven Degrees #MakeMovieCold@midnight:-)"
tokenizer = TweetTokenizer()
tweet_tokens = tokenizer.tokenize(tweet.lower())
print(tweet_tokens)

['snow', 'white', 'and', 'the', 'seven', 'degrees', '#makemoviecold', '@midnight', ':-)']


In [4]:
# n_grams
def n_grams(text, n):
    """Return every contiguous run of ``n`` items taken from ``text``.

    Args:
        text: A sliceable sequence that supports ``len()``, e.g. a list of
            tokens. A plain string also works and yields substrings.
        n: Size of each n-gram; must be at least 1.

    Returns:
        A list of slices of ``text``, each of length ``n``, ordered by
        starting position. The list is empty when ``text`` holds fewer
        than ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # Without this guard n == 0 silently produces len(text) + 1 empty
        # slices, and a negative n produces slices that are not n-grams.
        raise ValueError("n must be a positive integer, got {!r}".format(n))
    return [text[i:i + n] for i in range(len(text) - n + 1)]

# Tokens of the example sentence, as printed by the spaCy cell above.
cleaned = [
    'mary', ',', 'do', "n't",
    'slap', 'the', 'green', 'witch',
]
trigrams = n_grams(cleaned, 3)
print(trigrams)

[['mary', ',', 'do'], [',', 'do', "n't"], ['do', "n't", 'slap'], ["n't", 'slap', 'the'], ['slap', 'the', 'green'], ['the', 'green', 'witch']]


In [5]:
# Lemmatization: print each token next to its lemma (root form).
doc = nlp(u"he was running late")
lemma_template = '{} ---> {}'
for token in doc:
    lemma_line = lemma_template.format(token, token.lemma_)
    print(lemma_line)

he ---> he
was ---> be
running ---> run
late ---> late


In [6]:
# Part-of-speech tagging: print each token next to its POS label.
doc = nlp(u"Mary slapped the green witch.")
pos_template = "{} ---> {}"
for token in doc:
    pos_line = pos_template.format(token, token.pos_)
    print(pos_line)

Mary ---> PROPN
slapped ---> VERB
the ---> DET
green ---> ADJ
witch ---> NOUN
. ---> PUNCT


In [7]:
# Noun chunks: print each noun phrase found in `doc` next to its label.
# `doc` is the parsed sentence from the part-of-speech cell.
chunk_template = "{} ---> {}"
for chunk in doc.noun_chunks:
    chunk_line = chunk_template.format(chunk, chunk.label_)
    print(chunk_line)

Mary ---> NP
the green witch ---> NP
