# Tokenization

Using the `NLTK` library

In [3]:
# Import tokenizer
from nltk.tokenize import WordPunctTokenizer

# Split an example sentence into word and punctuation tokens
sentence = "Let's eat your soup, Grandpa."
tokens = WordPunctTokenizer().tokenize(sentence)

tokens

['Let', "'", 's', 'eat', 'your', 'soup', ',', 'Grandpa', '.']

In [4]:
def wikipedia_page(title, timeout=10):
    '''
    Return the raw text of an English Wikipedia page given its title.

    Parameters
    ----------
    title : str
        Title of the Wikipedia page to fetch.
    timeout : float, optional
        Seconds to wait for the Wikipedia API before giving up
        (default 10). Without a timeout the request could block forever.

    Returns
    -------
    str
        The page extract, or the string "Page not found" when the
        response contains no page with an 'extract' field.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    '''
    params = {
        'action': 'query',
        'format': 'json',  # request json formatted content
        'titles': title,  # title of the wikipedia page
        'prop': 'extracts',
        'explaintext': True,
    }
    # send a request to the wikipedia api
    response = requests.get(
        'https://en.wikipedia.org/w/api.php',
        params=params,
        timeout=timeout,
    )
    # fail loudly on 4xx/5xx instead of trying to parse an error body
    response.raise_for_status()
    payload = response.json()

    # Parse the result; the API omits 'query' entirely for some inputs
    # (e.g. an empty title), so every level is looked up defensively.
    pages = payload.get('query', {}).get('pages', {})
    page = next(iter(pages.values()), {})

    # return the page content, or the sentinel when there is none
    return page.get('extract', "Page not found")

Let's apply the tokenizer to the 'Earth' article from Wikipedia.

In [5]:
# Import the necessary libraries
import requests
from collections import Counter

# Download the 'Earth' article and normalise it to lower case
text = wikipedia_page('Earth').lower()

# Split the article into word and punctuation tokens
tokens = WordPunctTokenizer().tokenize(text)

# Count the tokens and show the 20 most frequent ones
token_counts = Counter(tokens)
token_counts.most_common(20)

[('the', 743),
 (',', 589),
 ('.', 492),
 ('of', 364),
 ('and', 288),
 ('earth', 264),
 ('is', 174),
 ('to', 167),
 ('s', 160),
 ("'", 159),
 ('in', 157),
 ('a', 144),
 ('(', 110),
 ('-', 79),
 ('by', 77),
 ('as', 76),
 ('with', 75),
 ('from', 69),
 ('surface', 63),
 ('at', 59)]

* Tokenization on characters

In [6]:
# Example of character tokenization: every character becomes one token
char_tokens = list(text)

# Show the 20 most frequent characters together with their counts
Counter(char_tokens).most_common(20)

[(' ', 9001),
 ('e', 5595),
 ('t', 4337),
 ('a', 4156),
 ('i', 3307),
 ('o', 3245),
 ('s', 3124),
 ('r', 3068),
 ('n', 3048),
 ('h', 2248),
 ('l', 2025),
 ('c', 1588),
 ('d', 1481),
 ('m', 1306),
 ('u', 1168),
 ('f', 949),
 ('p', 942),
 ('g', 843),
 ('y', 677),
 (',', 635)]

* **N-Grams** Tokenization


In [7]:
from nltk import ngrams

text = "How much wood would a woodchuck if a woodchuck could chuck wood?"

# Regular word-level tokenization of the sentence
tokens = WordPunctTokenizer().tokenize(text)

# Collect every pair of adjacent tokens (bigrams)
bigrams = list(ngrams(tokens, n=2))

bigrams

[('How', 'much'),
 ('much', 'wood'),
 ('wood', 'would'),
 ('would', 'a'),
 ('a', 'woodchuck'),
 ('woodchuck', 'if'),
 ('if', 'a'),
 ('a', 'woodchuck'),
 ('woodchuck', 'could'),
 ('could', 'chuck'),
 ('chuck', 'wood'),
 ('wood', '?')]

In [8]:
# Collect every run of three adjacent tokens (trigrams)
trigrams = list(ngrams(tokens, n=3))

trigrams

[('How', 'much', 'wood'),
 ('much', 'wood', 'would'),
 ('wood', 'would', 'a'),
 ('would', 'a', 'woodchuck'),
 ('a', 'woodchuck', 'if'),
 ('woodchuck', 'if', 'a'),
 ('if', 'a', 'woodchuck'),
 ('a', 'woodchuck', 'could'),
 ('woodchuck', 'could', 'chuck'),
 ('could', 'chuck', 'wood'),
 ('chuck', 'wood', '?')]

In [9]:
# Glue each bigram's words together with '_' so it becomes a single token
bi_tokens = list(map('_'.join, bigrams))

bi_tokens

['How_much',
 'much_wood',
 'wood_would',
 'would_a',
 'a_woodchuck',
 'woodchuck_if',
 'if_a',
 'a_woodchuck',
 'woodchuck_could',
 'could_chuck',
 'chuck_wood',
 'wood_?']

In [10]:
# Glue each trigram's words together with '_' in the same way
tri_tokens = list(map('_'.join, trigrams))

tri_tokens

['How_much_wood',
 'much_wood_would',
 'wood_would_a',
 'would_a_woodchuck',
 'a_woodchuck_if',
 'woodchuck_if_a',
 'if_a_woodchuck',
 'a_woodchuck_could',
 'woodchuck_could_chuck',
 'could_chuck_wood',
 'chuck_wood_?']

* Global vocabulary size is very important for processing speed and memory.
* To reduce the size, **remove all the words that are too rare or too frequent**