In [1]:
from pprint import pprint

# Tokenization

### Using NLTK

In [2]:
import nltk

In [3]:
# Sample input shared by every cell below; it contains contractions ("It's", "we'd")
# and a possessive ("neighbour's").
test_sentence = "It's too cold outside, we'd be better watering our neighbour's plants tomorrow"

In [4]:
# Tokenize the sample sentence with NLTK; as the last expression of the cell,
# the resulting list of token strings is displayed as the cell output.
nltk.word_tokenize(test_sentence)

['It',
 "'s",
 'too',
 'cold',
 'outside',
 ',',
 'we',
 "'d",
 'be',
 'better',
 'watering',
 'our',
 'neighbour',
 "'s",
 'plants',
 'tomorrow']

### Using spaCy

In [5]:
import spacy
# Load the English spaCy pipeline by package name.
# NOTE(review): assumes the "en_core_web_sm" model package is installed in this
# environment — confirm against the project's setup instructions.
nlp_en = spacy.load("en_core_web_sm")

In [6]:
# Run the loaded pipeline over the sample sentence once; `doc` is reused by the
# spaCy tokenization, POS-tagging and lemmatization cells further down.
doc = nlp_en(test_sentence)

In [7]:
# Text of every token in the processed document, in order.
[token.text for token in doc]

['It',
 "'s",
 'too',
 'cold',
 'outside',
 ',',
 'we',
 "'d",
 'be',
 'better',
 'watering',
 'our',
 'neighbour',
 "'s",
 'plants',
 'tomorrow']

# POS tagging

### Using NLTK

In [8]:
# Tokenize first, then tag: nltk.pos_tag is given the token list, not the raw string.
tokens = nltk.word_tokenize(test_sentence)
# Displayed as the cell output: a list of (token, tag) pairs.
nltk.pos_tag(tokens)

[('It', 'PRP'),
 ("'s", 'VBZ'),
 ('too', 'RB'),
 ('cold', 'JJ'),
 ('outside', 'JJ'),
 (',', ','),
 ('we', 'PRP'),
 ("'d", 'MD'),
 ('be', 'VB'),
 ('better', 'RB'),
 ('watering', 'VBG'),
 ('our', 'PRP$'),
 ('neighbour', 'NN'),
 ("'s", 'POS'),
 ('plants', 'NNS'),
 ('tomorrow', 'NN')]

### Using spaCy

In [9]:
# (text, part-of-speech label) pair for every token in the processed document.
[(token.text, token.pos_) for token in doc]

[('It', 'PRON'),
 ("'s", 'VERB'),
 ('too', 'ADV'),
 ('cold', 'ADJ'),
 ('outside', 'ADV'),
 (',', 'PUNCT'),
 ('we', 'PRON'),
 ("'d", 'VERB'),
 ('be', 'VERB'),
 ('better', 'ADJ'),
 ('watering', 'VERB'),
 ('our', 'ADJ'),
 ('neighbour', 'NOUN'),
 ("'s", 'PART'),
 ('plants', 'NOUN'),
 ('tomorrow', 'NOUN')]

# Lemmatization

### Using NLTK

In [10]:
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ

In [11]:
# Single WordNet lemmatizer instance, reused for every token in the lemmatization loop below.
lemmatizer = WordNetLemmatizer()

In [12]:
# Recompute the tokens and their POS tags so this section does not rely on the
# values left over from the POS-tagging section above.
tokens = nltk.word_tokenize(test_sentence)
# `tags` holds one (token, tag) pair per token; the loop below reads the tag at index 1.
tags = nltk.pos_tag(tokens)

In [13]:
# First letter of the NLTK POS tag -> WordNet POS constant handed to the lemmatizer.
# NOTE(review): any other prefix (e.g. adverb tags such as "RB") has no entry here,
# so those tokens are printed unchanged rather than lemmatized.
pos_by_tag_prefix = {"N": NOUN, "V": VERB, "J": ADJ}

for token, tagged_pair in zip(tokens, tags):
    pos_tag = tagged_pair[1]
    wordnet_pos = pos_by_tag_prefix.get(pos_tag[:1])

    if wordnet_pos is None:
        # No WordNet mapping for this tag: keep the surface form as-is.
        lemma = token
    else:
        lemma = lemmatizer.lemmatize(token, pos=wordnet_pos)

    print(lemma)

It
's
too
cold
outside
,
we
'd
be
better
water
our
neighbour
's
plant
tomorrow


### Using spaCy

In [14]:
# Lemma string of every token in the processed document, in order.
[token.lemma_ for token in doc]

['-PRON-',
 'have',
 'too',
 'cold',
 'outside',
 ',',
 '-PRON-',
 'would',
 'be',
 'well',
 'water',
 '-PRON-',
 'neighbour',
 'have',
 'plant',
 'tomorrow']