In [10]:
# from google.colab import drive
# drive.mount('/content/drive')

# One-hot Encoding

In [11]:
sentences = [
    "It was the best of times",
    "it was the worst of times",
    "it was the age of wisdom",
    "it was the age of foolishness"
]

Tokenize all sentences.

In [12]:
tokenized_sentences = [sentence.split() for sentence in sentences]
tokenized_sentences

[['It', 'was', 'the', 'best', 'of', 'times'],
 ['it', 'was', 'the', 'worst', 'of', 'times'],
 ['it', 'was', 'the', 'age', 'of', 'wisdom'],
 ['it', 'was', 'the', 'age', 'of', 'foolishness']]

Create a vocabulary containing unique words from all sentences.

In [13]:
vocabulary = set([w for s in tokenized_sentences for w in s])
vocabulary

{'It',
 'age',
 'best',
 'foolishness',
 'it',
 'of',
 'the',
 'times',
 'was',
 'wisdom',
 'worst'}

Encode each token in a sentence by assigning 1 if the token is present in a sentence, else assigning 0.

In [14]:
def onehot_encoder(tokenized_sentence):
    return [1 if w in tokenized_sentence else 0 for w in vocabulary]

In [15]:
onehot = [onehot_encoder(tokenized_sentence)
          for tokenized_sentence in tokenized_sentences]

for (sentence, oh) in zip(sentences, onehot):
    print("%s: %s" % (oh, sentence))

[1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1]: It was the best of times
[1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0]: it was the worst of times
[1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0]: it was the age of wisdom
[1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0]: it was the age of foolishness


### Out-of-vocabulary documents

#### All tokens are known

In [16]:
onehot_encoder("the age of wisdom is the best of times".split())

[1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1]

#### Some tokens are not known

This could be a problem...

In [17]:
onehot_encoder("John likes to watch movies".split())

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

# Bag-of-Words Models

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()

In [19]:
more_sentences = sentences + \
                 ["John likes to watch movies. Mary likes movies too.",
                  "Mary also like to watch football games."]
more_sentences

['It was the best of times',
 'it was the worst of times',
 'it was the age of wisdom',
 'it was the age of foolishness',
 'John likes to watch movies. Mary likes movies too.',
 'Mary also like to watch football games.']

In [20]:
cv.fit(more_sentences)
print(cv.get_feature_names_out())

['age' 'also' 'best' 'foolishness' 'football' 'games' 'it' 'john' 'like'
 'likes' 'mary' 'movies' 'of' 'the' 'times' 'to' 'too' 'was' 'watch'
 'wisdom' 'worst']


In [21]:
dt = cv.transform(more_sentences)
dt

<6x21 sparse matrix of type '<class 'numpy.int64'>'
	with 38 stored elements in Compressed Sparse Row format>

In [22]:
import pandas as pd
pd.DataFrame(dt.toarray(), columns=cv.get_feature_names_out())

Unnamed: 0,age,also,best,foolishness,football,games,it,john,like,likes,...,movies,of,the,times,to,too,was,watch,wisdom,worst
0,0,0,1,0,0,0,1,0,0,0,...,0,1,1,1,0,0,1,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,1,1,1,0,0,1,0,0,1
2,1,0,0,0,0,0,1,0,0,0,...,0,1,1,0,0,0,1,0,1,0
3,1,0,0,1,0,0,1,0,0,0,...,0,1,1,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,1,0,2,...,2,0,0,0,1,1,0,1,0,0
5,0,1,0,0,1,1,0,0,1,0,...,0,0,0,0,1,0,0,1,0,0


Another example:

“Oh, honey, I would walk through fire for you”

“Just let me adore you”

“Like it is the only thing I will ever do”


In [23]:
another_example = ["Oh, honey, I would walk through fire for you",
                   "Just let me adore you",
                   "Like it is the only thing I will ever do"]

Creating a vocabulary.

In [24]:
tokenized2 = [[t for t in sentence.split()] for sentence in another_example]

vocabulary2 = set([w.lower() for s in tokenized2 for w in s])
vocabulary2

{'adore',
 'do',
 'ever',
 'fire',
 'for',
 'honey,',
 'i',
 'is',
 'it',
 'just',
 'let',
 'like',
 'me',
 'oh,',
 'only',
 'the',
 'thing',
 'through',
 'walk',
 'will',
 'would',
 'you'}

In [25]:
cv = CountVectorizer(stop_words=[], vocabulary=vocabulary2).fit(another_example)
print(cv.get_feature_names_out())

['adore' 'do' 'ever' 'fire' 'for' 'honey,' 'i' 'is' 'it' 'just' 'let'
 'like' 'me' 'oh,' 'only' 'the' 'thing' 'through' 'walk' 'will' 'would'
 'you']


In [26]:
len(cv.get_feature_names_out())

22

In [27]:
dt_full = cv.transform(another_example)
pd.DataFrame(dt_full.toarray(), columns=cv.get_feature_names_out())

Unnamed: 0,adore,do,ever,fire,for,"honey,",i,is,it,just,...,me,"oh,",only,the,thing,through,walk,will,would,you
0,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,1,0,1,1
1,1,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
2,0,1,1,0,0,0,0,1,1,0,...,0,0,1,1,1,0,0,1,0,0


Let's also apply stopwords removal.

In [28]:
cv = CountVectorizer(stop_words='english').fit(another_example)
print(cv.get_feature_names_out())

['adore' 'honey' 'just' 'let' 'like' 'oh' 'thing' 'walk']


In [29]:
len(cv.get_feature_names_out())

8

In [30]:
dt = cv.transform(another_example)
pd.DataFrame(dt.toarray(), columns=cv.get_feature_names_out())

Unnamed: 0,adore,honey,just,let,like,oh,thing,walk
0,0,1,0,0,0,1,0,1
1,1,0,1,1,0,0,0,0
2,0,0,0,0,1,0,1,0


# TF-IDF

In [31]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer()
tfidf_dt = tfidf.fit_transform(dt)
pd.DataFrame(tfidf_dt.toarray(), columns=cv.get_feature_names_out())

Unnamed: 0,adore,honey,just,let,like,oh,thing,walk
0,0.0,0.57735,0.0,0.0,0.0,0.57735,0.0,0.57735
1,0.57735,0.0,0.57735,0.57735,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.707107,0.0,0.707107,0.0


# Cosine Similarity

In [32]:
from sklearn.metrics.pairwise import cosine_similarity

pd.DataFrame(cosine_similarity(dt))

Unnamed: 0,0,1,2
0,1.0,0.0,0.0
1,0.0,1.0,0.0
2,0.0,0.0,1.0
