In [1]:
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported below — confirm that silencing all of them is intended.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [3]:
import keras
from keras.preprocessing.text import one_hot,Tokenizer
from keras.preprocessing.sequence import pad_sequences #converts all documents onto same length
from keras.models import Sequential
from keras.layers import Dense,Flatten,Input,Embedding
from keras.models import Model



Using TensorFlow backend.


In [4]:
# Three short sample documents used as the toy corpus for the rest of the notebook.
sample_txt1, sample_txt2, sample_txt3 = (
    "bitty bought a bit of butter",
    "but the bit of butter was a bit bitter",
    "so she bought some better butter to make the bitter butter better",
)

In [5]:
# Gather the three sample documents into one list (the corpus) and show it.
corpus = [sample_txt1,sample_txt2,sample_txt3]
print(corpus)

['bitty bought a bit of butter', 'but the bit of butter was a bit bitter', 'so she bought some better butter to make the bitter butter better']


In [6]:
# Number of documents in the corpus.
len(corpus)

3

# one_hot encoding for documents

In [13]:
# Size of the hashing space handed to one_hot(); the Embedding layer further
# down reads the same variable as its input_dim, so both must stay in sync.
# (The misspelled name `voacb_size` is kept because later cells refer to it.)
voacb_size = 50

# Integer-encode every document. Keras' one_hot() hashes each word to an index
# below the given size, so two different words may share an index.
encod_corpus = []
for i, doc in enumerate(corpus):
    # Encode once and reuse the result for both storing and printing, and use
    # voacb_size instead of repeating the literal 50.
    encoded_doc = one_hot(doc, voacb_size)
    encod_corpus.append(encoded_doc)
    print(" The encoding for document",i+1,"is",encoded_doc)

 The encoding for document 1 is [13, 22, 40, 30, 31, 5]
 The encoding for document 2 is [27, 43, 30, 31, 5, 20, 40, 30, 45]
 The encoding for document 3 is [14, 32, 22, 4, 1, 5, 36, 33, 43, 45, 5, 1]


In [14]:
# Display the integer-encoded documents (one list of word indices per document).
encod_corpus

[[13, 22, 40, 30, 31, 5],
 [27, 43, 30, 31, 5, 20, 40, 30, 45],
 [14, 32, 22, 4, 1, 5, 36, 33, 43, 45, 5, 1]]

# PADDING THE DOCS (to make every doc the same length)

In [16]:
import nltk
from nltk import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
# English stop words as a set. Note that this rebinds `stopwords`: from here on
# the name refers to the set, not to the corpus reader imported just above.
stopwords = set(stopwords.words("english"))

In [18]:
# Length of the longest encoded document.
# It is measured on encod_corpus — the very sequences that get padded — rather
# than on a separate NLTK tokenization: the two tokenizers can split text
# differently (e.g. around punctuation), and a maxlen that is too small would
# make pad_sequences truncate documents.
# `default=0` keeps an empty corpus from raising.
maxlen = max((len(encoded_doc) for encoded_doc in encod_corpus), default=0)
    

In [20]:
# Report the length of the longest document; every document is padded to this length.
print("The max no of words in a document is : ",maxlen)

The max no of words in a document is :  12


In [23]:
# The Embedding layer needs every document to have the same length, so the
# shorter ones are filled up with zeros at the end ("post" padding) until they
# reach `maxlen` entries.
pad_corp = pad_sequences(
    encod_corpus,
    maxlen=maxlen,
    padding="post",
    value=0.0,
)
print("No of padded documents : ", len(pad_corp))

No of padded documents :  3


In [25]:
# Display the padded corpus: one row per document, zero-padded on the right.
pad_corp

array([[13, 22, 40, 30, 31,  5,  0,  0,  0,  0,  0,  0],
       [27, 43, 30, 31,  5, 20, 40, 30, 45,  0,  0,  0],
       [14, 32, 22,  4,  1,  5, 36, 33, 43, 45,  5,  1]])

In [28]:
# Show each padded row next to its 1-based document number.
for doc_no, padded_doc in enumerate(pad_corp, start=1):
    print("The padded encoding for document ", doc_no, " is : ", padded_doc)

The padded encoding for document  1  is :  [13 22 40 30 31  5  0  0  0  0  0  0]
The padded encoding for document  2  is :  [27 43 30 31  5 20 40 30 45  0  0  0]
The padded encoding for document  3  is :  [14 32 22  4  1  5 36 33 43 45  5  1]


# CREATING THE EMBEDDINGS using KERAS EMBEDDING LAYER

In [36]:
# All documents now have the same length (after padding), so the embeddings can be created.
# Each word will be embedded into a vector of 8 dimensions.

no_docs = len(corpus)
print(no_docs)

# Each padded document holds `maxlen` word indices (12 for this corpus).

# Bound to `corpus_input` instead of `input` so that Python's built-in input()
# is not shadowed for the rest of the session.
# NOTE(review): this tensor is not used by the model built below (that one is
# built from `word_input`); Keras' `shape` normally excludes the batch axis, so
# including no_docs here looks unintended — confirm whether this line is needed.
corpus_input = Input(shape = (no_docs,maxlen),dtype = 'float64')

3


In [35]:
# Model input: one padded document per sample, i.e. `maxlen` word indices.
# The indices are integers used to look up rows of the Embedding matrix, so the
# input is declared as int32 rather than float64.
word_input = Input(shape = (maxlen,),dtype = 'int32')

In [40]:
# Embedding lookup table: one 8-dimensional vector per index in the hashing
# space (voacb_size rows), applied to documents of `maxlen` indices each.
embedding_layer = Embedding(
    input_dim=voacb_size,
    output_dim=8,
    input_length=maxlen,
)
word_embedding = embedding_layer(word_input)

In [47]:
# Inspect the symbolic output of the Embedding layer (its type, then the tensor itself).
print(type(word_embedding))
print(word_embedding)

<class 'tensorflow.python.framework.ops.Tensor'>
Tensor("embedding_2/embedding_lookup/Identity:0", shape=(?, 12, 8), dtype=float32)


In [48]:
# Flatten the per-word embedding vectors of a document into one long vector
# per document.
flatten_layer = Flatten()
word_vec = flatten_layer(word_embedding)
print(word_vec)

Tensor("flatten_2/Reshape:0", shape=(?, ?), dtype=float32)


In [49]:
# Wrap input -> embedding -> flatten into a single Keras model.
embed_model = Model(inputs=[word_input], outputs=word_vec)
print(embed_model)

<keras.engine.training.Model object at 0x000001CDCF348188>


In [50]:
# Compile the model with an Adam optimizer (learning rate 1e-3),
# binary cross-entropy loss and accuracy as the reported metric.
adam_optimizer = keras.optimizers.Adam(lr=1e-3)
embed_model.compile(
    optimizer=adam_optimizer,
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [51]:
# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
embed_model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 12)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 12, 8)             400       
_________________________________________________________________
flatten_2 (Flatten)          (None, 96)                0         
Total params: 400
Trainable params: 400
Non-trainable params: 0
_________________________________________________________________
None


In [55]:
# Run the padded corpus through the model to obtain the (flattened) embeddings.
embeddings = embed_model.predict(pad_corp)
print("shape of embeddings",embeddings.shape)
print(embeddings)

# 3 rows: one per document.
# Each row holds 12 word positions x 8 dimensions = 96 values (one flattened vector per document).


shape of embeddings (3, 96)
[[ 0.00422787 -0.03559115 -0.03359684 -0.03900168 -0.0229318  -0.00882538
  -0.0030782   0.01571907  0.04736512 -0.00540179 -0.02613487 -0.03620505
   0.0182913  -0.01811811 -0.03259911  0.01811392 -0.04593556  0.00836105
  -0.00622425  0.04533401 -0.04631129 -0.00973624 -0.02185645  0.01667226
   0.04427114 -0.02009405  0.01680804  0.04458213 -0.00809828  0.00193854
   0.04426659  0.0203527   0.03442018 -0.01509865  0.01738982  0.00976884
  -0.03131445  0.04668872 -0.01860714 -0.02274306  0.03158749  0.00219008
   0.00839042 -0.03041352  0.03101322 -0.04996213 -0.04922906  0.04721412
   0.00263207  0.00624193 -0.01942141 -0.02046084  0.04805979  0.00558069
   0.03871623 -0.01093327  0.00263207  0.00624193 -0.01942141 -0.02046084
   0.04805979  0.00558069  0.03871623 -0.01093327  0.00263207  0.00624193
  -0.01942141 -0.02046084  0.04805979  0.00558069  0.03871623 -0.01093327
   0.00263207  0.00624193 -0.01942141 -0.02046084  0.04805979  0.00558069
   0.03871

# After reshaping (next cell), the resulting shape is (3, 12, 8).

3---> no of documents

12---> each document is made of 12 words which was our maximum length of any document.

& 8---> each word is 8 dimensional.

In [56]:
# Undo the flattening: regroup every document row into `maxlen` word vectors
# of 8 values each, giving (documents, words, dimensions).
embeddings = embeddings.reshape((-1, maxlen, 8))
print("shape", embeddings.shape)  # 3 documents, 12 words, 8 dimensions
print(embeddings)

shape (3, 12, 8)
[[[ 0.00422787 -0.03559115 -0.03359684 -0.03900168 -0.0229318
   -0.00882538 -0.0030782   0.01571907]
  [ 0.04736512 -0.00540179 -0.02613487 -0.03620505  0.0182913
   -0.01811811 -0.03259911  0.01811392]
  [-0.04593556  0.00836105 -0.00622425  0.04533401 -0.04631129
   -0.00973624 -0.02185645  0.01667226]
  [ 0.04427114 -0.02009405  0.01680804  0.04458213 -0.00809828
    0.00193854  0.04426659  0.0203527 ]
  [ 0.03442018 -0.01509865  0.01738982  0.00976884 -0.03131445
    0.04668872 -0.01860714 -0.02274306]
  [ 0.03158749  0.00219008  0.00839042 -0.03041352  0.03101322
   -0.04996213 -0.04922906  0.04721412]
  [ 0.00263207  0.00624193 -0.01942141 -0.02046084  0.04805979
    0.00558069  0.03871623 -0.01093327]
  [ 0.00263207  0.00624193 -0.01942141 -0.02046084  0.04805979
    0.00558069  0.03871623 -0.01093327]
  [ 0.00263207  0.00624193 -0.01942141 -0.02046084  0.04805979
    0.00558069  0.03871623 -0.01093327]
  [ 0.00263207  0.00624193 -0.01942141 -0.02046084  0.0480

# GETTING ENCODING FOR A PARTICULAR WORD IN A SPECIFIC DOCUMENT

In [60]:
# Walk through every document and every word position (both numbered from 1)
# and print the 8-dimensional vector found there.
for doc_no, doc_vectors in enumerate(embeddings, start=1):
    for word_no, word_vector in enumerate(doc_vectors, start=1):
        print("The encoding for ", word_no, "th word in ", doc_no, "document ", word_vector)

The encoding for  1 th word in  1 document  [ 0.00422787 -0.03559115 -0.03359684 -0.03900168 -0.0229318  -0.00882538
 -0.0030782   0.01571907]
The encoding for  2 th word in  1 document  [ 0.04736512 -0.00540179 -0.02613487 -0.03620505  0.0182913  -0.01811811
 -0.03259911  0.01811392]
The encoding for  3 th word in  1 document  [-0.04593556  0.00836105 -0.00622425  0.04533401 -0.04631129 -0.00973624
 -0.02185645  0.01667226]
The encoding for  4 th word in  1 document  [ 0.04427114 -0.02009405  0.01680804  0.04458213 -0.00809828  0.00193854
  0.04426659  0.0203527 ]
The encoding for  5 th word in  1 document  [ 0.03442018 -0.01509865  0.01738982  0.00976884 -0.03131445  0.04668872
 -0.01860714 -0.02274306]
The encoding for  6 th word in  1 document  [ 0.03158749  0.00219008  0.00839042 -0.03041352  0.03101322 -0.04996213
 -0.04922906  0.04721412]
The encoding for  7 th word in  1 document  [ 0.00263207  0.00624193 -0.01942141 -0.02046084  0.04805979  0.00558069
  0.03871623 -0.01093327]

# practice

In [12]:
# Practice: enumerate() yields (index, item) pairs.
# With a single loop variable, that variable is the whole (index, item) tuple.
for indexed_doc in enumerate(corpus):
    print(indexed_doc)
# Unpacking into two loop variables gives the index and the document separately.
for position, text in enumerate(corpus):
    print(position, text)

(0, 'bitty bought a bit of butter')
(1, 'but the bit of butter was a bit bitter')
(2, 'so she bought some better butter to make the bitter butter better')
0 bitty bought a bit of butter
1 but the bit of butter was a bit bitter
2 so she bought some better butter to make the bitter butter better
