In [25]:
from nltk.tokenize import sent_tokenize, word_tokenize

import pandas as pd
import numpy as np

# Tokenization

In [26]:
sentence = """Affectation of Candour is common enough. One finds it everywhere. But, to be candid without ostenation or design belongs to you and you alone.
This was said to Jane by Elizabeth of the Bennett family.
"""


In [27]:
documents = sent_tokenize(sentence)
documents

['Affectation of Candour is common enough.',
 'One finds it everywhere.',
 'But, to be candid without ostenation or design belongs to you and you alone.',
 'This was said to Jane by Elizabeth of the Bennett family.']

In [28]:
vocab = word_tokenize(sentence)
vocab

['Affectation',
 'of',
 'Candour',
 'is',
 'common',
 'enough',
 '.',
 'One',
 'finds',
 'it',
 'everywhere',
 '.',
 'But',
 ',',
 'to',
 'be',
 'candid',
 'without',
 'ostenation',
 'or',
 'design',
 'belongs',
 'to',
 'you',
 'and',
 'you',
 'alone',
 '.',
 'This',
 'was',
 'said',
 'to',
 'Jane',
 'by',
 'Elizabeth',
 'of',
 'the',
 'Bennett',
 'family',
 '.']

# Stemming

In [29]:
from nltk.stem import PorterStemmer

In [30]:
#Using PorterStemmer - most widely used.
#we can also use regexpStemmer(regex words are removed) 
# or snowball stemmer(works better than porter)
stemmer = PorterStemmer()
for word in vocab:
    print(word +"-->" +stemmer.stem(word))


Affectation-->affect
of-->of
Candour-->candour
is-->is
common-->common
enough-->enough
.-->.
One-->one
finds-->find
it-->it
everywhere-->everywher
.-->.
But-->but
,-->,
to-->to
be-->be
candid-->candid
without-->without
ostenation-->osten
or-->or
design-->design
belongs-->belong
to-->to
you-->you
and-->and
you-->you
alone-->alon
.-->.
This-->thi
was-->wa
said-->said
to-->to
Jane-->jane
by-->by
Elizabeth-->elizabeth
of-->of
the-->the
Bennett-->bennett
family-->famili
.-->.


# Lemmatization

In [31]:
from nltk import WordNetLemmatizer

In [32]:
#Compared to stemming, lemmatization takes time
lemmatizer = WordNetLemmatizer()

for word in vocab:
    print(word +"-->" +lemmatizer.lemmatize(word))


Affectation-->Affectation
of-->of
Candour-->Candour
is-->is
common-->common
enough-->enough
.-->.
One-->One
finds-->find
it-->it
everywhere-->everywhere
.-->.
But-->But
,-->,
to-->to
be-->be
candid-->candid
without-->without
ostenation-->ostenation
or-->or
design-->design
belongs-->belongs
to-->to
you-->you
and-->and
you-->you
alone-->alone
.-->.
This-->This
was-->wa
said-->said
to-->to
Jane-->Jane
by-->by
Elizabeth-->Elizabeth
of-->of
the-->the
Bennett-->Bennett
family-->family
.-->.


# removing stopwords

In [33]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91910\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [34]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

In [41]:
document = """Harry Potter is a series of seven fantasy novels written by British author J. K. Rowling.
 The novels chronicle the lives of a young wizard, Harry Potter, and his friends, Ron Weasley and Hermione Granger, all of whom are students at Hogwarts School of Witchcraft 
 and Wizardry. The main story arc concerns Harry's conflict with Lord Voldemort, a dark wizard who intends to become immortal, overthrow the wizard governing body known as 
 the Ministry of Magic, and subjugate all wizards and Muggles (non-magical people)."""

words = word_tokenize(document.lower())
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
print(len(lemmatized_words))

for lemma in lemmatized_words:
    if lemma in stop_words or lemma == '.':
        lemmatized_words.remove(lemma)
print(len(lemmatized_words))
    
lemmatized_words

97
72


['harry',
 'potter',
 'series',
 'seven',
 'fantasy',
 'novel',
 'written',
 'british',
 'author',
 'j.',
 'k.',
 'rowling',
 'novel',
 'chronicle',
 'life',
 'young',
 'wizard',
 ',',
 'harry',
 'potter',
 ',',
 'his',
 'friend',
 ',',
 'ron',
 'weasley',
 'hermione',
 'granger',
 ',',
 'are',
 'student',
 'hogwarts',
 'school',
 'witchcraft',
 'wizardry',
 'the',
 'main',
 'story',
 'arc',
 'concern',
 'harry',
 "'s",
 'conflict',
 'lord',
 'voldemort',
 ',',
 'a',
 'dark',
 'wizard',
 'intends',
 'become',
 'immortal',
 ',',
 'overthrow',
 'the',
 'wizard',
 'governing',
 'body',
 'known',
 'a',
 'the',
 'ministry',
 'of',
 'magic',
 ',',
 'subjugate',
 'wizard',
 'muggles',
 '(',
 'non-magical',
 'people',
 ')']

# Text to Vector techniques

bag of words