### Tokenization

A particular kind of document segmentation.
vocabulary --> lexicon

In [2]:
sentence = "Thomas Jefferson began building Monticello at the age of 26."

In [34]:
# Naive whitespace tokenization: punctuation stays attached to its word (note '26.' in the output).
token = sentence.split()
token

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

In [26]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [33]:
# Compute the label encoding here so this cell works on a fresh kernel instead of
# relying on a later cell having been executed first.
token_lb = LabelEncoder().fit_transform(token)
token_lb

array([3, 1, 6, 7, 2, 5, 9, 4, 8, 0])

In [37]:
# Step 1: map every token string to an integer class id.
lb = LabelEncoder()
lb.fit(token)
token_lb = lb.transform(token)

# Step 2: one-hot encode the ids; the encoder expects a 2-D column, hence the reshape.
oh = OneHotEncoder()
token_column = token_lb.reshape(-1, 1)
token_one_hot = oh.fit(token_column).transform(token_column)

In [38]:
token_one_hot.toarray()

array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

### Bag of word vector
Word frequency vector

In [39]:
# Binary bag-of-words: every token present in the sentence maps to 1.
# The loop variable is `tok` (not `token`) so that it does not overwrite the
# `token` list created by the whitespace-split cell earlier in the notebook.
sentence_bow = {}
for tok in sentence.split():
    sentence_bow[tok] = 1
sorted(sentence_bow.items())

[('26.', 1),
 ('Jefferson', 1),
 ('Monticello', 1),
 ('Thomas', 1),
 ('age', 1),
 ('at', 1),
 ('began', 1),
 ('building', 1),
 ('of', 1),
 ('the', 1)]

In [43]:
import pandas as pd
# One flag per token, turned into a single-row frame labelled 'sent'.
sentence_flags = {tok: 1 for tok in sentence.split()}
df = pd.DataFrame(pd.Series(sentence_flags), columns=['sent']).T
df

Unnamed: 0,Thomas,Jefferson,began,building,Monticello,at,the,age,of,26.
sent,1,1,1,1,1,1,1,1,1,1


In [45]:
# Three more sentences, one per line. Adjacent string literals inside the
# parentheses are concatenated by Python, so neither backslash continuations
# nor pasted "..." REPL prompts (valid only because IPython strips them) are needed.
sentences = (
    "Construction was done mostly by local masons and carpenters.\n"
    "He moved into the South Pavilion in 1770.\n"
    "Turning Monticello into a neoclassical masterpiece was Jefferson's obsession."
)

In [56]:
# One binary bag-of-words dict per sentence, keyed 'sent0', 'sent1', ...
# Tokens keep their punctuation (e.g. '26.').
corpus = {'sent0': dict.fromkeys(sentence.split(), 1)}
for i, sent in enumerate(sentences.split('\n')):
    key = 'sent{}'.format(i + 1)
    corpus[key] = dict.fromkeys(sent.split(' '), 1)
# After the transpose: rows are sentences, columns are tokens; missing tokens become 0.
records = pd.DataFrame.from_records(corpus)
df = records.fillna(0).astype(int).T
df[df.columns[:7]]

Unnamed: 0,1770.,26.,Construction,He,Jefferson,Jefferson's,Monticello
sent0,0,1,0,0,1,0,1
sent1,0,0,1,0,0,0,0
sent2,1,0,0,1,0,0,0
sent3,0,0,0,0,0,1,1


In [59]:
# Find similarity between two sentences using dot product
# Transpose only while the sentences are still the rows, so re-running this
# cell does not flip the frame back and break the `df.sent0` attribute access.
if 'sent0' in df.index:
    df = df.T
print(df.sent0.dot(df.sent1), df.sent0.dot(df.sent2), df.sent0.dot(df.sent3))

0 1 1


In [60]:
# Identify the words shared by sent0 and sent3
shared = df.sent0 & df.sent3
[(word, flag) for word, flag in shared.items() if flag]

[('Monticello', 1)]

In [68]:
# Eliminate trailing period
import re
tokens = re.split(r"([-\s.,;!?])+", sentence)
# The split keeps the captured separator characters and empty strings; drop both.
[tok for tok in tokens if tok not in '- \t\n.,;!?']

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26']

In [90]:
# Compile the separator pattern once so it can be reused.
pattern = re.compile(r"([-\s.;,?!])+")
tokens = re.split(pattern, sentence)

In [93]:
# Drop empty strings and the separator characters captured by the split.
# The membership string lists the same punctuation as `pattern`, including '-'
# and without the stray ']' that would have discarded a legitimate "]" token.
[x for x in tokens if x and x not in '- \t\n.,;!?']

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26']

In [100]:
from nltk.tokenize import RegexpTokenizer

In [101]:
# Alternatives, tried in order: a run of word characters, a dollar amount, or
# any other run of non-whitespace. The `$` must be escaped: a bare `$` is the
# end-of-string anchor, so the middle alternative could never match "$3.88".
tokenizer = RegexpTokenizer(r'\w+|\$[0-9.]+|\S+')

In [103]:
# In the output the contraction is cut at the apostrophe: "wasn't" -> 'wasn' + "'t".
sentence = "Monticello wasn't designated as UNESCO World Heritage Site until 1987."
tokenizer.tokenize(sentence)

['Monticello',
 'wasn',
 "'t",
 'designated',
 'as',
 'UNESCO',
 'World',
 'Heritage',
 'Site',
 'until',
 '1987',
 '.']

In [104]:
from nltk.tokenize import TreebankWordTokenizer
sentence = "Monticello wasn't designated as UNESCO World Heritage Site until 1987."
# Rebinds `tokenizer`; with this tokenizer the output splits "wasn't" into 'was' + "n't".
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(sentence)

['Monticello',
 'was',
 "n't",
 'designated',
 'as',
 'UNESCO',
 'World',
 'Heritage',
 'Site',
 'until',
 '1987',
 '.']

For some applications, e.g., grammar-based NLP models that rely on a syntax tree, it is important to separate "was" and "n't" so that the syntax tree parser receives a consistent, predictable set of tokens.

### How to deal with casual conversation

In [119]:
from nltk.tokenize.casual import casual_tokenize
message = "Wasn't RT @TJMonticello Best day everrrrrrr at Monticello. Awesommmmmmeeeeeeee day :*)"
# With reduce_len=True and strip_handles=True the output shows the runs of repeated
# letters shortened ('everrr', 'Awesommmeee') and the @TJMonticello handle removed.
message_tok1 = casual_tokenize(message, reduce_len=True, strip_handles=True)
message_tok1

["Wasn't",
 'RT',
 'Best',
 'day',
 'everrr',
 'at',
 'Monticello',
 '.',
 'Awesommmeee',
 'day',
 ':*)']

In [120]:
message2 = ' '.join(message_tok1)
# NOTE(review): `tokenizer` is not defined in this cell; it is whichever tokenizer an
# earlier cell last bound to that name. The output splits "Wasn't" into 'Was' + "n't",
# so presumably it is the TreebankWordTokenizer instance — confirm.
tokenizer.tokenize(message2)

['Was',
 "n't",
 'RT',
 'Best',
 'day',
 'everrr',
 'at',
 'Monticello',
 '.',
 'Awesommmeee',
 'day',
 ':',
 '*',
 ')']

### n-grams
An n-gram is a sequence containing up to n elements that have been extracted from a sequence of those elements, usually a string. N-grams are one of the ways to maintain context information as data passes through our pipeline.

In [133]:
sentence = "Thomas Jefferson began building Monticello at the age of 26."
pattern = re.compile(r"([-\s.,;!?])+")
# Split on runs of separators, then keep only real words: the split also yields
# the captured separator characters and empty strings.
tokens = [
    piece
    for piece in pattern.split(sentence)
    if piece != '' and piece not in '- \t\n.,;!?'
]

In [134]:
from nltk.util import ngrams
list(ngrams(tokens, 2))

[('Thomas', 'Jefferson'),
 ('Jefferson', 'began'),
 ('began', 'building'),
 ('building', 'Monticello'),
 ('Monticello', 'at'),
 ('at', 'the'),
 ('the', 'age'),
 ('age', 'of'),
 ('of', '26')]

In [135]:
list(ngrams(tokens, 3))

[('Thomas', 'Jefferson', 'began'),
 ('Jefferson', 'began', 'building'),
 ('began', 'building', 'Monticello'),
 ('building', 'Monticello', 'at'),
 ('Monticello', 'at', 'the'),
 ('at', 'the', 'age'),
 ('the', 'age', 'of'),
 ('age', 'of', '26')]

In [137]:
# Materialise the bigram tuples, then render each one as a space-separated string.
two_grams = list(ngrams(tokens, 2))
list(map(" ".join, two_grams))

['Thomas Jefferson',
 'Jefferson began',
 'began building',
 'building Monticello',
 'Monticello at',
 'at the',
 'the age',
 'age of',
 'of 26']

### Remove infrequent, too frequent words after n-grams

Stop words rarely occur on their own, and they influence the meaning of the n-grams they appear in, so stop words are normally preserved in the n-grams.

In [139]:
import nltk
# Download the 'stopwords' package, then load its English word list.
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /Users/sli/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [142]:
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

### Normalizing vocabulary
Tokens that mean similar things are combined into a single normalized form.

#### Case normalization
Although case normalization might lead to a loss of information (e.g., doctor vs. Doctor), the reduction in vocabulary size is significant enough that it is usually still worth doing. We can limit the loss of information by taking the token's position in the sentence into account. In practice, the strategy should be to try the pipeline both with and without case normalization.

In [143]:
tokens = ['House', 'Visitor', 'Center']
# Case-fold every token so that 'House' and 'house' share one vocabulary entry.
normalized_tokens = list(map(str.lower, tokens))
print(normalized_tokens)

['house', 'visitor', 'center']


#### Stemming

In [151]:
def stem(phrase):
    """Crude suffix stripper: lower-case `phrase` and drop one trailing "s" per word.

    Words ending in "ss" are kept whole, and apostrophes at either end of a
    stem are removed. Returns the stems joined by single spaces.
    """
    stems = []
    for word in phrase.lower().split():
        # Group 1 is the whole word when it ends in "ss", otherwise the
        # shortest prefix that leaves at most one trailing "s" (group 2).
        root, _plural = re.findall('^(.*ss|.*?)(s)?$', word)[0]
        stems.append(root.strip("''"))
    return ' '.join(stems)
                    
stem('houses')

'house'

In [160]:
stem("Doctor house's calls")

'doctor house call'

In [165]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
# Stem each whitespace-separated word and trim any apostrophe left at its edges.
' '.join(stemmer.stem(w).strip("'") for w in "dish washer's washed dishes".split())

'dish washer wash dish'

### Lemmatization

In [166]:
# The WordNet data must be available before the lemmatizer can be used.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# NOTE(review): no `pos` is passed and the output is 'better' unchanged — presumably the
# default part of speech is noun; confirm against the NLTK documentation.
lemmatizer.lemmatize("better")

[nltk_data] Downloading package wordnet to /Users/sli/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


'better'

In [173]:
lemmatizer.lemmatize("best", pos="a")

'best'

Stemming and lemmatization both increase recall at the cost of accuracy, so they are not recommended for chatbots, where accuracy is more important.

### Sentiment
-- Rule based sentiment analyzer: VADER

-- Machine learning based sentiment analyzer

In [174]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [188]:
sa = SentimentIntensityAnalyzer()
# sa.lexicon

In [176]:
[(tok, score) for tok, score in sa.lexicon.items() if " " in tok]

[("( '}{' )", 1.6),
 ("can't stand", -2.0),
 ('fed up', -1.8),
 ('screwed up', -1.5)]

In [177]:
sa.polarity_scores("I want to kill myself")

{'neg': 0.588, 'neu': 0.25, 'pos': 0.163, 'compound': -0.6597}

In [181]:
sa.polarity_scores("I cut myself.. This is not cute")

{'neg': 0.534, 'neu': 0.466, 'pos': 0.0, 'compound': -0.5544}

In [191]:
corpus = [
    "Absolutely perfect! Love it! :)))))",
    "Horrible! Completely useless.:(",
    "It was ok. Some good and some bad things",
]

# Print the signed compound score next to each document.
for doc in corpus:
    compound = sa.polarity_scores(doc)['compound']
    print('{:+}: {}'.format(compound, doc))

+0.8766: Absolutely perfect! Love it! :)))))
-0.5848: Horrible! Completely useless.:(
+0.1531: It was ok. Some good and some bad things


In [194]:
from nlpia.data.loaders import get_data
movies = get_data('hutto_movies')

In [195]:
movies.head().round(2)

Unnamed: 0_level_0,sentiment,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2.27,The Rock is destined to be the 21st Century's ...
2,3.53,The gorgeously elaborate continuation of ''The...
3,-0.6,Effective but too tepid biopic
4,1.47,If you sometimes like to go to the movies to h...
5,1.73,"Emerges as something rare, an issue movie that..."


In [197]:
movies.describe().round(2)

Unnamed: 0,sentiment
count,10605.0
mean,0.0
std,1.92
min,-3.88
25%,-1.77
50%,-0.08
75%,1.83
max,3.94


In [198]:
pd.set_option('display.width', 160)

In [199]:
from nltk.tokenize import casual_tokenize

In [214]:
movies.text.iloc[0]

"The Rock is destined to be the 21st Century's new ''Conan'' and that he's going to make a splash even greater than Arnold Schwarzenegger, Jean Claud Van Damme or Steven Segal."

In [200]:
from collections import Counter
# One Counter (token -> count) per review text.
bags_of_words = [Counter(casual_tokenize(text)) for text in movies.text]

# Tokens absent from a review come out as NaN; store them as integer zero counts.
df_bows = pd.DataFrame.from_records(bags_of_words).fillna(0).astype(int)

In [219]:
df_bows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10605 entries, 0 to 10604
Columns: 20756 entries, ! to ’
dtypes: int64(20756)
memory usage: 1.6 GB


In [203]:
df_bows.head()

Unnamed: 0,!,"""",#,$,%,&,',(,),*,...,zips,zombie,zombies,zone,zoning,zzzzzzzzz,½,élan,–,’
0,0,0,0,0,0,0,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [205]:
df_bows[list(bags_of_words[0].keys())].head()

Unnamed: 0,The,Rock,is,destined,to,be,the,21st,Century's,new,...,Schwarzenegger,",",Jean,Claud,Van,Damme,or,Steven,Segal,.
0,1,1,1,1,2,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,2,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,4
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,4,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [223]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
# Binary target: True when the labelled sentiment score is positive.
nb = nb.fit(df_bows, movies.sentiment > 0)
# Rescale the predicted class (True/False) to +4/-4 so it is comparable with the sentiment score.
movies['predicted_sentiment'] = nb.predict(df_bows) * 8 - 4
# Mean absolute error, measured on the same rows the model was fitted on.
movies['error'] = (movies.predicted_sentiment - movies.sentiment).abs()
movies.error.mean().round(1)

2.4

In [242]:
# In-sample accuracy: fraction of reviews whose predicted sign matches the labelled sign
# (predicted_sentiment was computed on the same data the model was fitted on).
import numpy as np
movies['target_direction'] = movies['sentiment'] > 0
movies['predicted_direction'] = movies['predicted_sentiment'] > 0
np.mean(movies['target_direction'] == movies['predicted_direction'])

0.9344648750589345

In [254]:
# Split the dataset into separate training and testing sets: 20% of the reviews
# are held out. The fixed random_state makes the split (and therefore the
# accuracy measured on it) reproducible across kernel restarts.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    np.array(df_bows), np.array(movies.sentiment), test_size=0.2, random_state=42)

In [255]:
# Refit on the training split only, then predict the held-out test split
nb.fit(x_train, y_train>0)
prediction = nb.predict(x_test)

In [260]:
np.mean(prediction == (y_test > 0))

0.7887788778877888

In [261]:
products = get_data('hutto_products')

In [None]:
# convert products to bags of words
