# NOTES

# IMPORTS

In [22]:
import numpy as np
import pandas as pd

# TOKENIZATION

## Variables

In [23]:
# Sample sentence used by the tokenization and vectorization examples below.
sentence = (
    'Issa started to take a baby step in natural language processing '
    'and deep learning applied to computer vision at age 25.'
)

## Splitting

In [24]:
# Tokenize on whitespace, then display the tokens re-joined with single spaces.
token_sequence = sentence.split()
' '.join(token_sequence)

'Issa started to take a baby step in natural language processing and deep learning applied to computer vision at age 25.'

In [25]:
# Build the vocabulary: the distinct tokens, de-duplicated with set().
# Sorting strings is lexicographic, so digits come before letters and
# uppercase letters come before lowercase ones.
vocabulary = list(set(token_sequence))
vocabulary.sort()
' '.join(vocabulary)

'25. Issa a age and applied at baby computer deep in language learning natural processing started step take to vision'

# VECTORIZATION

## Variables

In [26]:
# Allocate the one-hot matrix: one row per token occurrence in the sentence,
# one column per vocabulary entry, all zeros to start with.
num_tokens_rows, vocab_size_column = len(token_sequence), len(vocabulary)
onehot_vectors = np.zeros(shape=(num_tokens_rows, vocab_size_column), dtype=int)

## Vectorize

### One hot encoding

In [27]:
# Map each vocabulary word to its column once, instead of scanning the list
# with vocabulary.index() for every token in the sentence.
column_of = {vocab_word: col for col, vocab_word in enumerate(vocabulary)}

# Row i gets a single 1 in the column of the i-th token.
for (i, word) in enumerate(token_sequence):
    onehot_vectors[i, column_of[word]] = 1

In [28]:
# Show the sorted vocabulary again: its order is the column order of onehot_vectors.
' '.join(vocabulary)

'25. Issa a age and applied at baby computer deep in language learning natural processing started step take to vision'

In [29]:
# Inspect the one-hot matrix: row i marks the vocabulary column of the i-th token.
onehot_vectors

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0,

In [30]:
# Label the one-hot matrix with the vocabulary so each column is named after its word.
df = pd.DataFrame(data=onehot_vectors, columns=vocabulary)
df

Unnamed: 0,25.,Issa,a,age,and,applied,at,baby,computer,deep,in,language,learning,natural,processing,started,step,take,to,vision
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
7,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [31]:
# Even better visualization: blank out the zeros so only the hot positions stand out.
# Build a separate display frame instead of assigning into `df`, so `df` keeps its
# integer 0/1 values and this cell gives the same result every time it is run.
df_sparse_view = df.replace(0, '')
df_sparse_view

Unnamed: 0,25.,Issa,a,age,and,applied,at,baby,computer,deep,in,language,learning,natural,processing,started,step,take,to,vision
0,,1.0,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,1.0,,,,
2,,,,,,,,,,,,,,,,,,,1.0,
3,,,,,,,,,,,,,,,,,,1.0,,
4,,,1.0,,,,,,,,,,,,,,,,,
5,,,,,,,,1.0,,,,,,,,,,,,
6,,,,,,,,,,,,,,,,,1.0,,,
7,,,,,,,,,,,1.0,,,,,,,,,
8,,,,,,,,,,,,,,1.0,,,,,,
9,,,,,,,,,,,,1.0,,,,,,,,


### Binary bag of words (binary vector)

In [32]:
# Binary bag of words: every distinct token maps to 1, with keys in sorted order.
sentence_bow = dict.fromkeys(sorted(token_sequence), 1)
sentence_bow

{'25.': 1,
 'Issa': 1,
 'a': 1,
 'age': 1,
 'and': 1,
 'applied': 1,
 'at': 1,
 'baby': 1,
 'computer': 1,
 'deep': 1,
 'in': 1,
 'language': 1,
 'learning': 1,
 'natural': 1,
 'processing': 1,
 'started': 1,
 'step': 1,
 'take': 1,
 'to': 1,
 'vision': 1}

In [33]:
# Show the tokens in their original sentence order.
' '.join(token_sequence)

'Issa started to take a baby step in natural language processing and deep learning applied to computer vision at age 25.'

In [34]:
# Single-row table of the bag of words: the row is labelled 'sent',
# and there is one column per token.
bow_series = pd.Series(dict(sentence_bow), name='sent')
df1 = bow_series.to_frame().T
df1

Unnamed: 0,25.,Issa,a,age,and,applied,at,baby,computer,deep,in,language,learning,natural,processing,started,step,take,to,vision
sent,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [35]:
# Terminate the first sentence with a newline so further sentences can follow.
# Rebuild from the first line rather than appending with `+=`, so re-running
# this cell never stacks extra newlines onto `sentence`.
sentence = sentence.partition('\n')[0] + '\n'
sentence

'Issa started to take a baby step in natural language processing and deep learning applied to computer vision at age 25.\n'

In [36]:
# Extend the text to three newline-separated sentences.
# Start again from the first line instead of appending with `+=`, so running
# this cell more than once does not duplicate the added sentences.
first_sentence = sentence.partition('\n')[0]
sentence = '\n'.join([
    first_sentence,
    "Issa then went deep in computer science field by starting a textlab as well as imagelab.",
    'Both imagelab and textlab make use of python as their base programming language.',
])
sentence

'Issa started to take a baby step in natural language processing and deep learning applied to computer vision at age 25.\nIssa then went deep in computer science field by starting a textlab as well as imagelab.\nBoth imagelab and textlab make use of python as their base programming language.'

In [37]:
# corpus: one binary bag-of-words dict per newline-separated sentence
corpus = {}

for (i, sent) in enumerate(sentence.split('\n')):
    key = 'sentence{}'.format(i)
    corpus[key] = dict.fromkeys(sent.split(), 1)

# After the transpose, rows are sentences and columns are tokens; a token
# missing from a sentence is filled with 0.
# NOTE(review): whitespace splitting keeps trailing punctuation, so 'language'
# and 'language.' end up as separate columns.
df2 = pd.DataFrame.from_records(corpus).fillna(0).astype(int).T

# Preview only the first 25 token columns.
df2.iloc[:, :25]

Unnamed: 0,Issa,started,to,take,a,baby,step,in,natural,language,...,computer,vision,at,age,25.,then,went,science,field,by
sentence0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,0,0,0,0
sentence1,1,0,0,0,1,0,0,1,0,0,...,1,0,0,0,0,1,1,1,1,1
sentence2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Measuring bag of words overlap

In [38]:
# Full sentence-by-token table (rows: sentences, columns: tokens).
df2

Unnamed: 0,Issa,started,to,take,a,baby,step,in,natural,language,...,Both,imagelab,make,use,of,python,their,base,programming,language.
sentence0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
sentence1,1,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
sentence2,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1


In [39]:
# Flip to tokens-as-rows so that each sentence becomes a column vector.
df3 = df2.transpose()
df3

Unnamed: 0,sentence0,sentence1,sentence2
Issa,1,1,0
started,1,0,0
to,1,0,0
take,1,0,0
a,1,1,0
baby,1,0,0
step,1,0,0
in,1,1,0
natural,1,0,0
language,1,0,0


In [40]:
# Dot product of two 0/1 vectors: the number of tokens sentence0 and sentence1 share.
df3['sentence0'].dot(df3['sentence1'])

5

In [41]:
# Dot product of two 0/1 vectors: the number of tokens sentence0 and sentence2 share.
df3['sentence0'].dot(df3['sentence2'])

1

In [46]:
# Find the shared words: AND the two 0/1 columns element-wise, then keep the
# (token, flag) pairs whose flag is non-zero.
overlap = df3['sentence0'] & df3['sentence2']
shared_words = [(token, flag) for token, flag in overlap.items() if flag]
shared_words

[('and', 1)]