# Building your vocabulary through tokenization

In [None]:
# Example sentence reused by the cells that follow.
sentence = "Thomas Jefferson began building Monticello at the age of 26."
# Simplest tokenizer: split on runs of whitespace. Punctuation stays attached
# to its word, so the last token is "26." (with the period).
sentence.split()

In [None]:
import numpy as np

# Vocabulary: the distinct whitespace-separated tokens, in sorted order.
vocab = sorted(set(sentence.split()))
# One row per token position in the sentence, one column per vocabulary entry.
num_rows, num_cols = len(sentence.split()), len(vocab)
onehotwords = np.zeros((num_rows, num_cols), int)
for row, token in enumerate(sentence.split()):
    # Set the single 1 in this row at the column of the token's vocab slot.
    column = vocab.index(token)
    onehotwords[row, column] = 1

print(' '.join(vocab))
onehotwords

In [None]:
# Binary bag of words: every token that occurs in the sentence maps to 1.
sentence_bow = {token: 1 for token in sentence.split()}
sorted(sentence_bow.items())

In [None]:
import pandas as pd

# Same binary bag of words, held as a Series indexed by token ...
token_flags = pd.Series({token: 1 for token in sentence.split()})
# ... then laid out as a single row labelled 'sent' with one column per token.
df = pd.DataFrame(token_flags, columns=['sent']).T
df

In [None]:
sentences = "Construction was done mostly by local masons and carpenters.\n" \
            "He moved into the South Pavilion in 1770.\n" \
            "Turning Monticello into a neoclassical masterpiece was Jefferson's obsession."
# Build one binary bag-of-words dict per sentence, keyed 'sent0' .. 'sent3'.
# sent0 is `sentence` from the earlier cell; sent1..sent3 are the lines above.
# Every sentence goes through the same normalization (leading/trailing
# periods stripped from each token) so that a sentence-final word such as
# "carpenters." lands in the same column as "carpenters" would elsewhere.
corpus = {}
for i, sent in enumerate([sentence] + sentences.split('\n')):
    corpus['sent{}'.format(i)] = dict((tok.strip('.'), 1) for tok in sent.split())
# Tokens missing from a sentence come out as NaN; turn them into 0 and
# transpose so that each row is a sentence and each column a token.
df = pd.DataFrame.from_records(corpus).fillna(0).astype(int).T
df[df.columns[:7]]  # show just the first 7 tokens (columns)

In [None]:
# Transpose so that each sentence becomes a column (df.sent0, df.sent1, ...).
# NOTE: this rebinds df, so running this cell a second time flips the frame
# back and the attribute access below will no longer find `sent0`.
df = df.T
# Dot product of two 0/1 columns = number of tokens the sentences share.
df.sent0.dot(df.sent1)

In [None]:
# Number of tokens shared by sent0 and sent2 (dot product of 0/1 columns).
df.sent0.dot(df.sent2)

In [None]:
# Number of tokens shared by sent0 and sent3 (dot product of 0/1 columns).
df.sent0.dot(df.sent3)

In [None]:
# List the tokens present in both sent0 and sent3: element-wise AND of the two
# integer 0/1 columns, keeping only the entries whose result is non-zero.
[(k, v) for (k, v) in (df.sent0 & df.sent3).items() if v]

# A Token Improvement

In [None]:
import re

sentence = "Thomas Jefferson began building Monticello at the age of 26."
# Split on runs of separator characters. Because the separator is a capturing
# group, the last character of each run is kept in the result list as well.
tokens = re.split(r"([-\s.,;!?])+", sentence)
# Keep only real words: the substring test rejects separator tokens and also
# the empty string (which is a substring of every string).
list(filter(lambda x: x not in '- \t\n.,;!?', tokens))

In [None]:
# Drop empty strings and separator tokens from the split result.
# `x not in '...'` is a substring test against the separator characters.
[x for x in tokens if x != '' and x not in '- \t\n.,;!?']

In [None]:
# Compile the separator pattern once; `pattern` is reused by later cells.
pattern = re.compile(r"([-\s.,;!?])+")
# The split result still contains the captured separators and empty strings.
tokens = pattern.split(sentence)
tokens[-10:]  # just the last 10 tokens

In [None]:
# Reset the example sentence, split it with the compiled `pattern`, then
# filter out the empty strings and separator tokens.
sentence = "Thomas Jefferson began building Monticello at the age of 26."
tokens = pattern.split(sentence)
[x for x in tokens if x != '' and x not in '- \t\n.,;!?']

In [None]:
from nltk.tokenize import RegexpTokenizer
# Token pattern, alternatives tried left to right:
#   \w+         a run of word characters
#   \$[0-9.]+   a currency amount such as "$3.88"
#   \S+         any other run of non-whitespace characters
# The dollar sign has to be escaped: a bare `$` is the end-of-string anchor,
# so `$[0-9.]+` could never match anything.
tokenizer = RegexpTokenizer(r'\w+|\$[0-9.]+|\S+')
tokenizer.tokenize(sentence)

In [None]:
from nltk.tokenize import TreebankWordTokenizer
# New example sentence containing a contraction ("wasn't").
# Note that this rebinds both `sentence` and `tokenizer` from earlier cells.
sentence = "Monticello wasn't designated as UNESCO World Heritage Site until 1987."
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(sentence)

# Tokenizing Informal Text

In [None]:
from nltk.tokenize.casual import casual_tokenize
# Informal, tweet-style text: an "RT" marker, an @handle, words with long
# repeated-letter runs, and an emoticon.
message = "RT @TJMonticello Best day everrrrrrr at Monticello. Awesommmmmmeeeeeeee day :*)"
casual_tokenize(message)

In [None]:
# Same message with two normalization options switched on. Per the NLTK docs,
# reduce_len shortens long repeated-character runs and strip_handles removes
# @handles -- confirm against the installed NLTK version.
casual_tokenize(message, reduce_len=True, strip_handles=True)

# n-grams

In [None]:
sentence = "Thomas Jefferson began building Monticello at the age of 26."
pattern = re.compile(r"([-\s.,;!?])+")
# Split on separator runs and, in the same pass, discard the empty strings
# and captured separator characters so that only the words remain.
tokens = [
    piece
    for piece in pattern.split(sentence)
    if piece != '' and piece not in '- \t\n.,;!?'
]
print(tokens)

In [None]:
from nltk.util import ngrams
# 2-grams over the token list; list() materializes the result for display.
list(ngrams(tokens, 2))

In [None]:
# 3-grams over the same token list.
list(ngrams(tokens, 3))

In [None]:
# Materialize the 2-grams, then render each one as a space-separated string.
two_grams = list(ngrams(tokens, 2))
list(map(" ".join, two_grams))

# Stopwords

In [None]:
# Small hand-picked stop word list and a toy token sequence to filter.
stop_words = ['a', 'an', 'the', 'on', 'of', 'off', 'this', 'is']
tokens = ['the', 'house', 'is', 'on', 'fire']
# Keep only the tokens that are not stop words, preserving their order.
tokens_without_stopwords = list(filter(lambda tok: tok not in stop_words, tokens))
print(tokens_without_stopwords)


In [None]:
import nltk
# Download NLTK's 'stopwords' corpus data, then load the English word list.
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
# Show the size of the list together with its first seven entries.
(len(stopwords), stopwords[:7])

# Case Normalization

In [None]:
tokens = ['House', 'Visitor', 'Center']
# Case normalization: lowercase every token.
normalized_tokens = list(map(str.lower, tokens))
print(normalized_tokens)

# Stemming

In [None]:
def stem(phrase):
    """Return *phrase* lowercased with a naive plural/possessive stem per word.

    Each whitespace-separated word loses one trailing "s" unless it ends in
    "ss", and any leading or trailing apostrophes are then removed. The
    stemmed words are joined back together with single spaces.
    """
    stems = []
    for word in phrase.lower().split():
        # Group 1 is the stem; group 2 is the optional trailing "s" we drop.
        # The first alternative keeps words ending in "ss" intact.
        root, _trailing_s = re.findall('^(.*ss|.*?)(s)?$', word)[0]
        stems.append(root.strip("'"))
    return ' '.join(stems)

stem('houses')

In [None]:
# Mixed-case phrase with a possessive: stem() lowercases the input and strips
# the apostrophe left behind once the trailing "s" is removed.
stem("Doctor House's calls")

In [None]:
from nltk.stem.porter import PorterStemmer
# `stemmer` is reused by a later cell.
stemmer = PorterStemmer()
# Stem each whitespace-separated word, strip leading/trailing apostrophes from
# the result, and join the stems back into one string.
' '.join([stemmer.stem(w).strip("'") for w in "dish washer's washed dishes".split()])

# Lemmatization

In [None]:
# Download the WordNet data (`nltk` itself is imported in an earlier cell).
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# No `pos` argument here, so the lemmatizer's default part of speech applies
# (noun, per the NLTK docs -- confirm against the installed version).
lemmatizer.lemmatize("better")

In [None]:
# Same word, this time explicitly tagged as an adjective (pos='a').
lemmatizer.lemmatize("better", pos='a')

In [None]:
# "good" tagged as an adjective.
lemmatizer.lemmatize("good", pos='a')

In [None]:
# "goods" tagged as an adjective; compare with the noun tagging of the same
# word in the next cell.
lemmatizer.lemmatize("goods", pos='a')

In [None]:
# "goods" tagged as a noun (pos='n').
lemmatizer.lemmatize("goods", pos='n')

In [None]:
# "goodness" tagged as a noun; the last cell runs the same word through the
# Porter stemmer for comparison.
lemmatizer.lemmatize("goodness", pos='n')

In [None]:
# Superlative "best" tagged as an adjective.
lemmatizer.lemmatize("best", pos='a')

In [None]:
# Porter stemmer (the `stemmer` created in an earlier cell) on "goodness", for
# comparison with the lemmatizer result on the same word.
stemmer.stem("goodness")