# Building Your NLP Vocabulary

## Different types of tokenizers

##### RegexpTokenizer

In [1]:
from nltk.tokenize import RegexpTokenizer
s = "A Rolex watch costs in the range of $3000.0 - $8000.0 in USA."
tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
tokenizer.tokenize(s)

['A',
 'Rolex',
 'watch',
 'costs',
 'in',
 'the',
 'range',
 'of',
 '$3000.0',
 '-',
 '$8000.0',
 'in',
 'USA',
 '.']

##### BlankLine Tokenizer

In [2]:
from nltk.tokenize import BlanklineTokenizer
s = "A Rolex \n\n watch costs in the range of \n\n $3000.0 - $8000.0 in USA."
tokenizer = BlanklineTokenizer()
tokenizer.tokenize(s)

['A Rolex', 'watch costs in the range of', '$3000.0 - $8000.0 in USA.']

##### WordPunct Tokenizer

In [3]:
from nltk.tokenize import WordPunctTokenizer
s = "A Rolex watch costs in the range of $3000.0 - $8000.0 in USA."
tokenizer = WordPunctTokenizer()
tokenizer.tokenize(s)

['A',
 'Rolex',
 'watch',
 'costs',
 'in',
 'the',
 'range',
 'of',
 '$',
 '3000',
 '.',
 '0',
 '-',
 '$',
 '8000',
 '.',
 '0',
 'in',
 'USA',
 '.']

##### Treebank Tokenizer

In [4]:
from nltk.tokenize import TreebankWordTokenizer
s = "I'm going to buy a Rolex watch that doesn't cost more than $3000.0"
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(s)

['I',
 "'m",
 'going',
 'to',
 'buy',
 'a',
 'Rolex',
 'watch',
 'that',
 'does',
 "n't",
 'cost',
 'more',
 'than',
 '$',
 '3000.0']

##### Tweet Tokenizer

In [5]:
from nltk.tokenize import TweetTokenizer
s = "@amankedia I'm going to buy a Rolexxxxxxxx watch!!! :-D #happiness #rolex <3"
tokenizer = TweetTokenizer()
tokenizer.tokenize(s)

['@amankedia',
 "I'm",
 'going',
 'to',
 'buy',
 'a',
 'Rolexxxxxxxx',
 'watch',
 '!',
 '!',
 '!',
 ':-D',
 '#happiness',
 '#rolex',
 '<3']

In [6]:
#strip_handles used to remove user handles
#reduce_len reduces excessive characters
#preserve_case to convert to tokens to lower case when set to False
from nltk.tokenize import TweetTokenizer
s = "@amankedia I'm going to buy a Rolexxxxxxxx watch!!! :-D #happiness #rolex <3"
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
tokenizer.tokenize(s)

["I'm",
 'going',
 'to',
 'buy',
 'a',
 'Rolexxx',
 'watch',
 '!',
 '!',
 '!',
 ':-D',
 '#happiness',
 '#rolex',
 '<3']

## Understanding word normalization

### Stemming

##### SnowballStemmer

In [7]:
from nltk.stem.snowball import SnowballStemmer
print(SnowballStemmer.languages)

('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')


In [8]:
#words to stem
plurals = ['caresses', 'flies', 'dies', 'mules', 'died', 'agreed', 'owned', 'humbled', 'sized', 'meeting', 
           'stating', 'siezing', 'itemization', 'traditional', 'reference', 'colonizer', 'plotted',
           'having', 'generously']

##### PorterStemmer

In [9]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
singles = [stemmer.stem(plural) for plural in plurals]
print(' '.join(singles))

caress fli die mule die agre own humbl size meet state siez item tradit refer colon plot have gener


In [10]:
stemmer2 = SnowballStemmer(language='english')
singles = [stemmer2.stem(plural) for plural in plurals]
print(' '.join(singles))

caress fli die mule die agre own humbl size meet state siez item tradit refer colon plot have generous


### Lemmatization

#### Using WordNet Lemmatizer

In [11]:
import nltk
#nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
s = "We are putting in efforts to enhance our understanding of Lemmatization"
token_list = s.split()
print("The tokens are: ", token_list)
lemmatized_output = ' '.join([lemmatizer.lemmatize(token) for token in token_list])
print("The lemmatized output is: ", lemmatized_output)

The tokens are:  ['We', 'are', 'putting', 'in', 'efforts', 'to', 'enhance', 'our', 'understanding', 'of', 'Lemmatization']
The lemmatized output is:  We are putting in effort to enhance our understanding of Lemmatization


##### POS tagging the sentence 


#POS tagging for better lemmatization
#nltk.download('averaged_perceptron_tagger')
pos_tags = nltk.pos_tag(token_list)
pos_tags

##### Changing the POS tags into appropriate form to pass into WordLemmatizer

In [12]:
from nltk.corpus import wordnet
##This is a common method which is widely used across the NLP community of practitioners and readers
def get_part_of_speech_tags(token):
    """ Maps POS tags to first character lemmatize() accepts.
    We are focusing on Verbs, Nouns, Adjectives and Adverbs here."""
    
    tag_dict = {"J": wordnet.ADJ,"N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    tag = nltk.pos_tag([token])[0][1][0].upper()
    return tag_dict.get(tag, wordnet.NOUN)

##### Lemmatization with POS tags

In [13]:
lemmatized_output_with_POS_information = [lemmatizer.lemmatize(token,
get_part_of_speech_tags(token)) for token in token_list]
print(' '.join(lemmatized_output_with_POS_information))

We be put in effort to enhance our understand of Lemmatization


##### Comapring with original Lemmatizatio(without POS tags)

In [14]:
stemmer2 = SnowballStemmer(language='english')
stemmed_sentence = [stemmer2.stem(token) for token in token_list]
print(' '.join(stemmed_sentence))

we are put in effort to enhanc our understand of lemmat


#### Spacy Lemmatizer

In [15]:
import spacy

In [16]:
#python -m spacy download en
nlp = spacy.load('en_core_web_sm')
doc = nlp("We are putting in efforts to enhance our understanding of Lemmatization")
" ".join([token.lemma_ for token in doc])

'we be put in effort to enhance our understanding of Lemmatization'

### Stopword Removals

##### List of stopwords available for English in NLTK

In [17]:
#nltk.download('stopwords')
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
", ".join(stop)

"who, from, after, an, both, same, off, needn, have, into, haven, we, she's, them, while, each, you, me, do, against, whom, but, did, should've, re, that, themselves, m, haven't, such, for, just, shouldn't, over, the, couldn, only, he, its, more, if, or, below, don, it, her, herself, you'd, theirs, will, doesn, his, s, hasn, ain, isn, weren, mightn, be, then, once, your, most, been, are, ours, was, yours, doing, they, at, other, had, couldn't, my, it's, mustn't, above, how, didn, a, too, during, you've, and, shan, what, in, by, nor, hadn, hadn't, shouldn, all, himself, should, hasn't, you'll, won, very, own, of, yourself, to, until, there, with, you're, where, needn't, d, because, ll, o, wasn't, ve, don't, on, ourselves, through, that'll, doesn't, than, this, between, here, weren't, again, is, out, hers, were, so, am, t, ma, any, when, wouldn't, some, aren't, myself, him, mustn, yourselves, under, up, no, about, not, y, aren, she, didn't, which, having, isn't, wouldn, i, has, down, as,

##### Not removing Wh-words during stopword removal

In [18]:
wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
stop = set(stopwords.words('english'))
sentence = "how are we putting in efforts to enhance our understanding of Lemmatization"
#removing certain words from stopwords
for word in wh_words:
    stop.remove(word)
sentence_after_stopword_removal = [token for token in sentence.split() if token not in stop]
" ".join(sentence_after_stopword_removal)

'how putting efforts enhance understanding Lemmatization'

### Case Folding

##### converting sentence to lowercase

In [19]:
s = "We are putting in efforts to enhance our understanding of Lemmatization"
s = s.lower()
s

'we are putting in efforts to enhance our understanding of lemmatization'

## N-grams

##### capturing bi-grams

In [20]:
from nltk.util import ngrams
s = "Natural Language Processing is the way to go"
tokens = s.split()
bigrams = list(ngrams(tokens, 2))
[" ".join(token) for token in bigrams]

['Natural Language',
 'Language Processing',
 'Processing is',
 'is the',
 'the way',
 'way to',
 'to go']

##### capturing tri-grams

In [21]:
s = "Natural Language Processing is the way to go"
tokens = s.split()
trigrams = list(ngrams(tokens, 3))
[" ".join(token) for token in trigrams]

['Natural Language Processing',
 'Language Processing is',
 'Processing is the',
 'is the way',
 'the way to',
 'way to go']

## Taking care of HTML tags

##### Using beautifulsoup 

In [22]:
html = "<!DOCTYPE html><html><body><h1>My First Heading</h1><p>My first paragraph.</p></body></html>"
from bs4 import BeautifulSoup
soup = BeautifulSoup(html)
text = soup.get_text()
print(text)

My First HeadingMy first paragraph.


##### Forming a simple vocabulary

In [23]:
s = "Natural Language Processing is the way to go"
tokens = set(s.split())
vocabulary = sorted(tokens)
vocabulary

['Language', 'Natural', 'Processing', 'go', 'is', 'the', 'to', 'way']