### Bags of words

In [1]:
from nltk.tokenize import TreebankWordTokenizer

In [53]:
sentence = "The faster Harry got to the store, the faster Harry, the faster, would get home."
tokenizer = TreebankWordTokenizer()
tokens = tokernizer.tokenize(sentence.lower())
tokens = list(filter(lambda x: x if x not in '- \t\n.,;!?' else None, tokens))

In [54]:
from collections import Counter
bag_of_words = Counter(tokens)
bag_of_words

Counter({'the': 4,
         'faster': 3,
         'harry': 2,
         'got': 1,
         'to': 1,
         'store': 1,
         'would': 1,
         'get': 1,
         'home': 1})

In [55]:
bag_of_words.most_common(4)

[('the', 4), ('faster', 3), ('harry', 2), ('got', 1)]

The number of times a word occurs in a given document is called the term frequency (TF)

In [56]:
times_harry_appears = bag_of_words['harry']
num_unique_words = len(tokens)
tf = round(times_harry_appears/num_unique_words, 4)

In [57]:
tf

0.1333

### Use term frequency to infer content of paragraph

In [58]:
from nlpia.data.loaders import kite_text
tokens = tokenizer.tokenize(kite_text.lower())
token_counts = Counter(tokens)

In [59]:
import nltk
nltk.download('stopwords', quiet=True)

True

In [60]:
stopwords = nltk.corpus.stopwords.words('english')
tokens = [x for x in tokens if x not in stopwords and x not in "(){},.;:'"]

In [61]:
kite_counts = Counter(tokens)
kite_counts.most_common(10)

[('kite', 16),
 ('kites', 8),
 ('wing', 5),
 ('lift', 4),
 ('may', 4),
 ('also', 3),
 ('kiting', 3),
 ('flown', 3),
 ('tethered', 2),
 ('craft', 2)]

In [62]:
document_vector = []
doc_length = len(tokens)
for key, value in kite_counts.most_common():
    document_vector.append(value / doc_length)
    
print(document_vector)

[0.07960199004975124, 0.03980099502487562, 0.024875621890547265, 0.01990049751243781, 0.01990049751243781, 0.014925373134328358, 0.014925373134328358, 0.014925373134328358, 0.009950248756218905, 0.009950248756218905, 0.009950248756218905, 0.009950248756218905, 0.009950248756218905, 0.009950248756218905, 0.009950248756218905, 0.009950248756218905, 0.009950248756218905, 0.009950248756218905, 0.009950248756218905, 0.009950248756218905, 0.009950248756218905, 0.009950248756218905, 0.009950248756218905, 0.009950248756218905, 0.009950248756218905, 0.009950248756218905, 0.009950248756218905, 0.009950248756218905, 0.009950248756218905, 0.004975124378109453, 0.004975124378109453, 0.004975124378109453, 0.004975124378109453, 0.004975124378109453, 0.004975124378109453, 0.004975124378109453, 0.004975124378109453, 0.004975124378109453, 0.004975124378109453, 0.004975124378109453, 0.004975124378109453, 0.004975124378109453, 0.004975124378109453, 0.004975124378109453, 0.004975124378109453, 0.00497512437

In [119]:
from nlpia.data.loaders import harry_docs as docs

In [64]:
doc_tokens = []
for doc in docs:
    doc_tokens += [sorted(tokenizer.tokenize(doc.lower()))]
len(doc_tokens[0])

17

In [65]:
all_doc_tokens = sum(doc_tokens, [])
len(all_doc_tokens)

33

In [66]:
lexicon = sorted(set(all_doc_tokens))
len(lexicon)

18

In [67]:
lexicon

[',',
 '.',
 'and',
 'as',
 'faster',
 'get',
 'got',
 'hairy',
 'harry',
 'home',
 'is',
 'jill',
 'not',
 'store',
 'than',
 'the',
 'to',
 'would']

In [68]:
from collections import OrderedDict
zero_vector = OrderedDict((token, 0) for token in lexicon)
zero_vector

OrderedDict([(',', 0),
             ('.', 0),
             ('and', 0),
             ('as', 0),
             ('faster', 0),
             ('get', 0),
             ('got', 0),
             ('hairy', 0),
             ('harry', 0),
             ('home', 0),
             ('is', 0),
             ('jill', 0),
             ('not', 0),
             ('store', 0),
             ('than', 0),
             ('the', 0),
             ('to', 0),
             ('would', 0)])

In [70]:
import copy
doc_vectors = []
for doc in docs:
    vec = copy.copy(zero_vector)
    tokens = tokenizer.tokenize(doc.lower())
    token_counts = Counter(tokens)
    for key, value in token_counts.items():
        vec[key] = value / len(lexicon)
    doc_vectors.append(vec)

In [71]:
doc_vectors

[OrderedDict([(',', 0.05555555555555555),
              ('.', 0.05555555555555555),
              ('and', 0.05555555555555555),
              ('as', 0),
              ('faster', 0.16666666666666666),
              ('get', 0.05555555555555555),
              ('got', 0.05555555555555555),
              ('hairy', 0),
              ('harry', 0.1111111111111111),
              ('home', 0.05555555555555555),
              ('is', 0),
              ('jill', 0),
              ('not', 0),
              ('store', 0.05555555555555555),
              ('than', 0),
              ('the', 0.16666666666666666),
              ('to', 0.05555555555555555),
              ('would', 0.05555555555555555)]),
 OrderedDict([(',', 0),
              ('.', 0.05555555555555555),
              ('and', 0.05555555555555555),
              ('as', 0),
              ('faster', 0.05555555555555555),
              ('get', 0),
              ('got', 0),
              ('hairy', 0.05555555555555555),
              ('harry', 0.05

### Use cosine similarity to evaluate similarity

In [79]:
import math
import numpy as np
def cosine_sim(vec1, vec2):
    vec1 = [val for val in vec1.values()]
    vec2 = [val for val in vec2.values()]
    
    dot_prod = np.dot(vec1, vec2)
    
    mag1 = np.sqrt(np.sum(np.square(vec1)))
    mag2 = np.sqrt(np.sum(np.square(vec2)))
    
    return dot_prod/(mag1 * mag2)

In [81]:
cosine_sim(doc_vectors[0], doc_vectors[1])

0.4445004445006667

### Zipf's law
Given some corpus of natural language utterances, the frequency of any word is inversely proportional to its rank in the frequency table.

In [83]:
import matplotlib.pyplot as plt
% matplotlib inline

In [123]:
nltk.download('brown')

[nltk_data] Downloading package brown to /Users/sli/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [124]:
from nltk.corpus import brown

In [126]:
len(brown.words())

1161192

In [127]:
from collections import Counter
puncs = [',', '.', '--', '-', '!', '?', ':', ';', '``', "''", '(', ')', '[', ']']
word_list = [x.lower() for x in brown.words() if x not in puncs]
token_counts = Counter(word_list)
token_counts.most_common(20)

[('the', 69971),
 ('of', 36412),
 ('and', 28853),
 ('to', 26158),
 ('a', 23195),
 ('in', 21337),
 ('that', 10594),
 ('is', 10109),
 ('was', 9815),
 ('he', 9548),
 ('for', 9489),
 ('it', 8760),
 ('with', 7289),
 ('as', 7253),
 ('his', 6996),
 ('on', 6741),
 ('be', 6377),
 ('at', 5372),
 ('by', 5306),
 ('i', 5164)]

### Topic modeling
Inverse Document Frequency, IDF

In [133]:
from nlpia.data.loaders import kite_text, kite_history
kite_intro = kite_text.lower()
intro_tokens = tokenizer.tokenize(kite_intro)
intro_tokens = [x for x in intro_tokens if x not in puncs]
kite_history = kite_history.lower()
history_tokens = tokenizer.tokenize(kite_history)
history_tokens = [x for x in history_tokens if x not in puncs]
intro_total = len(intro_tokens)

In [134]:
intro_total

342

In [135]:
hist_total = len(history_tokens)
hist_total

264

In [137]:
intro_tf = {}
hist_tf = {}
intro_counts = Counter(intro_tokens)
intro_tf['kite'] = intro_counts['kite']/intro_total
hist_counts = Counter(intro_tokens)
hist_tf['kite'] = hist_counts['kite']/hist_total

In [139]:
print('Term Frequency of "kite" in intro is: {:.4f}'.format(intro_tf['kite']))
print('Term Frequency of "kite" in history is: {:.4f}'.format(hist_tf['kite']))

Term Frequency of "kite" in intro is: 0.0468
Term Frequency of "kite" in history is: 0.0606


One way to understand inverse document frequency is that if a term appear frequently in a document but occurs rarely in other documents, we could assume that this term is important to that document.

#### IDF of a term is the ratio of the total number of documents to the number of documents that the term appears in.

In [146]:
def idf_occ(target_str):
    num_docs_containing = 0
    for doc in [intro_tokens, history_tokens]:
        if target_str in doc:
            num_docs_containing += 1
    return num_docs_containing

In [150]:
intro_tf['china'] = intro_counts['china'] / intro_total
hist_tf['china'] = hist_counts['china'] / hist_total
intro_tf['and'] = intro_counts['and'] / intro_total
hist_tf['and'] = hist_counts['and'] / hist_total

In [144]:
intro_tf

{'kite': 0.04678362573099415, 'china': 0.0}

In [145]:
hist_tf

{'kite': 0.06060606060606061, 'china': 0.0}

In [148]:
idf = {}
num_doc = 2
idf['china'] = num_doc/idf_occ('china')
idf['kite'] = num_doc/idf_occ('kite')
idf['and'] = num_doc/idf_occ('and')

In [149]:
idf

{'china': 2.0, 'kite': 1.0, 'and': 1.0}

In [153]:
intro_tfidf = {}
hist_tfidf = {}
intro_tfidf['china'] = intro_tf['china'] * idf['china']
hist_tfidf['china'] = hist_tf['china'] * idf['china']
intro_tfidf['kite'] = intro_tf['kite'] * idf['kite']
hist_tfidf['kite'] = hist_tf['kite'] * idf['kite']
intro_tfidf['and'] = intro_tf['and'] * idf['and']
hist_tfidf['and'] = hist_tf['and'] * idf['and']

In [154]:
intro_tfidf, hist_tfidf

({'china': 0.0, 'kite': 0.04678362573099415, 'and': 0.029239766081871343},
 {'china': 0.0, 'kite': 0.06060606060606061, 'and': 0.03787878787878788})

Consider the idf different, we use it in log scale

### The definition becomes: 

### $tf(t, d) = \frac{num-of-occruance(t)}{total-num-words(d)}$

### $idf(t, D) = log(\frac{total-number-documents}{number-of-documents-containing t})$

### $ tfidf(t, d, D) = tf(t, d) * idf(t, D)$

The more times that a word appear in documents, the higher tfidf is going to be, while having the term, the document and documents corpus we are studying.

### Represent document with tfidf vectors

In [48]:
from nlpia.data.loaders import harry_docs as docs

In [49]:
docs

['The faster Harry got to the store, the faster and faster Harry would get home.',
 'Harry is hairy and faster than Jill.',
 'Jill is not as hairy as Harry.',
 'How long does it take to get to the store?']

In [50]:
from nltk.tokenize import RegexpTokenizer
from collections import Counter
tokenizer = RegexpTokenizer(r'\w+|$[0-9.]+|\S+')
document_tfidf_vectors = []
# Extract corpus of tokens
def tokenize_doc(doc):
    tokens = tokenizer.tokenize(doc)
    tokens = [word for word in tokens if word not in '.,!:;""?']
    return tokens

In [51]:
token_doc = []
for doc in docs:
    token_doc.append(tokenize_doc(doc))

In [52]:
token_counts = []
for doc in token_doc:
    token_count = {}
    token_count = Counter(doc)
    token_counts.append(token_count)

In [53]:
sum(token_count.values())

10

In [54]:
import numpy as np
def tfidf(d, D):
    """
    d: Counter of a single document
    D: list of all the documents
    """
    tf_idf = {}
    tt_word = sum(d.values())
    for word, count in d.items():
        tf_d_word = count/tt_word
        occ_word = 0
        occ_word = sum([1 if word in doc else 0 for doc in D ])
#         for single_doc in D:
#             if word in single_doc:
#                 occ_word+=1
        idf_word = np.log(len(D)/occ_word)
        tf_idf[word] = tf_d_word * idf_word
    return tf_idf

In [55]:
tf_idf = []
for i in range(len(token_counts)):
    tf_idf.append(tfidf(token_counts[i], docs))

In [56]:
Counter(tfidf(token_counts[0], docs)).most_common()

[('faster', 0.13862943611198905),
 ('The', 0.09241962407465937),
 ('got', 0.09241962407465937),
 ('the', 0.09241962407465937),
 ('would', 0.09241962407465937),
 ('home', 0.09241962407465937),
 ('to', 0.046209812037329684),
 ('store', 0.046209812037329684),
 ('and', 0.046209812037329684),
 ('get', 0.046209812037329684),
 ('Harry', 0.03835760966023745)]

In [57]:
Counter(tfidf(token_counts[1], docs)).most_common()

[('than', 0.19804205158855578),
 ('is', 0.09902102579427789),
 ('hairy', 0.09902102579427789),
 ('and', 0.09902102579427789),
 ('faster', 0.09902102579427789),
 ('Jill', 0.09902102579427789),
 ('Harry', 0.04109743892168297)]

In [58]:
Counter(tfidf(token_counts[2], docs)).most_common()

[('not', 0.19804205158855578),
 ('Jill', 0.09902102579427789),
 ('is', 0.09902102579427789),
 ('hairy', 0.09902102579427789),
 ('as', 0.08219487784336595),
 ('Harry', 0.04109743892168297)]

In [59]:
corpus = tokenize_doc(' '.join(docs))

In [60]:
import pandas as pd

df_tfidf = pd.DataFrame(data=None, columns=corpus)

In [61]:
df_tfidf

Unnamed: 0,The,faster,Harry,got,to,the,store,the.1,faster.1,and,...,How,long,does,it,take,to.1,get,to.2,the.2,store.1


In [62]:
for i in range(len(token_counts)):
    single_entry = pd.Series(tf_idf[i])
    df_tfidf = df_tfidf.append(single_entry, ignore_index=True)
df_tfidf.fillna(0, inplace=True)

In [63]:
df_tfidf

Unnamed: 0,The,faster,Harry,got,to,the,store,the.1,faster.1,and,...,How,long,does,it,take,to.1,get,to.2,the.2,store.1
0,0.09242,0.138629,0.038358,0.09242,0.04621,0.09242,0.04621,0.09242,0.138629,0.04621,...,0.0,0.0,0.0,0.0,0.0,0.04621,0.04621,0.04621,0.09242,0.04621
1,0.0,0.099021,0.041097,0.0,0.0,0.0,0.0,0.0,0.099021,0.099021,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.041097,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.138629,0.069315,0.069315,0.069315,0.0,0.0,...,0.138629,0.138629,0.138629,0.138629,0.138629,0.138629,0.069315,0.138629,0.069315,0.069315


In [64]:
query = "How long does it take to get to the store?"
token_query = tokenizer.tokenize(query)
token_query = [i for i in token_query if i not in ".,;?"]
token_query = Counter(token_query)

In [65]:
import copy
docs2 = copy.copy(docs)
docs2.append(query)
query_tfidf = tfidf(token_query, docs2)

In [66]:
query_tfidf

{'How': 0.09162907318741552,
 'long': 0.09162907318741552,
 'does': 0.09162907318741552,
 'it': 0.09162907318741552,
 'take': 0.09162907318741552,
 'to': 0.10216512475319815,
 'get': 0.051082562376599076,
 'the': 0.051082562376599076,
 'store': 0.051082562376599076}

In [67]:
corpus = tokenize_doc(' '.join(docs2))
df_tfidf2 = pd.DataFrame(data=None, columns=corpus)
for i in range(len(token_counts)):
    single_entry = pd.Series(tf_idf[i])
    df_tfidf2 = df_tfidf2.append(single_entry, ignore_index=True)

In [68]:
single_entry = pd.Series(query_tfidf)
df_tfidf2 = df_tfidf2.append(single_entry, ignore_index=True)
df_tfidf2.fillna(0, inplace=True)

In [69]:
df_tfidf2

Unnamed: 0,The,faster,Harry,got,to,the,store,the.1,faster.1,and,...,How,long,does,it,take,to.1,get,to.2,the.2,store.1
0,0.09242,0.138629,0.038358,0.09242,0.04621,0.09242,0.04621,0.09242,0.138629,0.04621,...,0.0,0.0,0.0,0.0,0.0,0.04621,0.04621,0.04621,0.09242,0.04621
1,0.0,0.099021,0.041097,0.0,0.0,0.0,0.0,0.0,0.099021,0.099021,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.041097,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.138629,0.069315,0.069315,0.069315,0.0,0.0,...,0.138629,0.138629,0.138629,0.138629,0.138629,0.138629,0.069315,0.138629,0.069315,0.069315
4,0.0,0.0,0.0,0.0,0.102165,0.051083,0.051083,0.051083,0.0,0.0,...,0.091629,0.091629,0.091629,0.091629,0.091629,0.102165,0.051083,0.102165,0.051083,0.051083


In [70]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(df_tfidf2)

array([[1.        , 0.41131784, 0.04334664, 0.3134397 , 0.3325787 ],
       [0.41131784, 1.        , 0.47135306, 0.        , 0.        ],
       [0.04334664, 0.47135306, 1.        , 0.        , 0.        ],
       [0.3134397 , 0.        , 0.        , 1.        , 0.99853275],
       [0.3325787 , 0.        , 0.        , 0.99853275, 1.        ]])

### Use build in libraries

In [71]:
import scipy
import sklearn

In [74]:
docs

['The faster Harry got to the store, the faster and faster Harry would get home.',
 'Harry is hairy and faster than Jill.',
 'Jill is not as hairy as Harry.',
 'How long does it take to get to the store?']

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = docs
vectorizer = TfidfVectorizer(min_df=1)
model = vectorizer.fit_transform(corpus)
print(model.todense().round(2))

[[0.18 0.   0.   0.55 0.18 0.23 0.   0.3  0.23 0.   0.   0.   0.   0.
  0.   0.18 0.   0.   0.55 0.18 0.23]
 [0.37 0.   0.   0.37 0.   0.   0.37 0.3  0.   0.   0.37 0.   0.37 0.
  0.   0.   0.   0.47 0.   0.   0.  ]
 [0.   0.74 0.   0.   0.   0.   0.29 0.24 0.   0.   0.29 0.   0.29 0.
  0.37 0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.33 0.   0.26 0.   0.   0.   0.   0.33 0.   0.33 0.   0.33
  0.   0.26 0.33 0.   0.26 0.52 0.  ]]


Limitaion of content-based search (tf-idf cosine similarity) is the contraining of using exactly the token work. More specific analysis with small corpus should based on semantics.