In [1]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences 

In [2]:
# Toy sentiment corpus: each entry pairs a short text with its label
# (1 = positive, 0 = negative). Keeping text and label side by side makes it
# harder for the two lists to drift out of sync.
labeled_sentences = [
    ('nice great best amazing', 1),
    ('stop lies', 0),
    ('pitiful nerd', 0),
    ('excellent work', 1),
    ('supreme quality', 1),
    ('bad', 0),
    ('highly respectable', 1),
]
sentences = [text for text, _ in labeled_sentences]
y_train = [label for _, label in labeled_sentences]  # positive: 1  Negative: 0

In [3]:
# Build the word -> integer index mapping from the training sentences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# Word indices start at 1, so one extra slot is added for index 0
# (the value later used for padding).
vocab_size = 1 + len(tokenizer.word_index)
print("vocab size: ", vocab_size)

vocab size:  16


In [4]:
# Replace every word of every sentence with its integer index from the
# fitted tokenizer; the result is a list of variable-length integer lists.
X_encoded = tokenizer.texts_to_sequences(
    sentences
)
print("Integer Encoding : ", X_encoded)

Integer Encoding :  [[1, 2, 3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13], [14, 15]]


In [5]:
# Length of the longest encoded sentence; every sequence is padded to this
# length in the next step.
max_len = max(map(len, X_encoded))
print("Maximum length: ", max_len)

Maximum length:  4


In [6]:
# Labels as a NumPy array so they can be fed to Keras alongside X_train.
y_train = np.array(y_train)

# Right-pad ('post') every encoded sentence with zeros up to max_len so the
# result is a rectangular integer matrix.
X_train = pad_sequences(X_encoded, padding='post', maxlen=max_len)

# One print call; sep="\n" reproduces the three separate lines of output.
print("Padded train_X data: ", X_train, X_train.shape, sep="\n")

Padded train_X data: 
[[ 1  2  3  4]
 [ 5  6  0  0]
 [ 7  8  0  0]
 [ 9 10  0  0]
 [11 12  0  0]
 [13  0  0  0]
 [14 15  0  0]]
(7, 4)


In [8]:
#from urllib.request import urlretrieve, urlopen
#import gzip
#import zipfile

In [12]:
# Load the pre-trained GloVe vectors into a {word: float32 vector} dictionary.
#
# Opening, reading and closing happen in a single cell inside a `with` block so
# the file handle is always released (even if parsing fails part-way) and the
# cell can be re-run on its own without depending on a handle left open by
# another cell.
embedding_dict = {}

with open("DataSet/glove.6B.100d.txt", encoding="utf8") as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ... <vN>" separated by whitespace.
        word_vector = line.split()
        if not word_vector:
            # Skip blank lines (e.g. a trailing newline at end of file).
            continue
        word = word_vector[0]

        word_vector_arr = np.asarray(word_vector[1:], dtype='float32')
        embedding_dict[word] = word_vector_arr

print(f"Count of embedding vector: {len(embedding_dict)}")

Count of embedding vector: 400000


In [20]:
# Inspect one pre-trained vector and confirm its dimensionality.
# The lookup is done once and reused; the label typo ("Ddimension") is fixed.
respectable_vector = embedding_dict["respectable"]
print(respectable_vector)
print("Dimension of vector: ", len(respectable_vector))

[-0.049773   0.19903    0.10585    0.1391    -0.32395    0.44053
  0.3947    -0.22805   -0.25793    0.49768    0.15384   -0.08831
  0.0782    -0.8299    -0.037788   0.16772   -0.45197   -0.17085
  0.74756    0.98256    0.81872    0.28507    0.16178   -0.48626
 -0.006265  -0.92469   -0.30625   -0.067318  -0.046762  -0.76291
 -0.0025264 -0.018795   0.12882   -0.52457    0.3586     0.43119
 -0.89477   -0.057421  -0.53724    0.25587    0.55195    0.44698
 -0.24252    0.29946    0.25776   -0.8717     0.68426   -0.05688
 -0.1848    -0.59352   -0.11227   -0.57692   -0.013593   0.18488
 -0.32507   -0.90171    0.17672    0.075601   0.54896   -0.21488
 -0.54018   -0.45882   -0.79536    0.26331    0.18879   -0.16363
  0.3975     0.1099     0.1164    -0.083499   0.50159    0.35802
  0.25677    0.088546   0.42108    0.28674   -0.71285   -0.82915
  0.15297   -0.82712    0.022112   1.067     -0.31776    0.1211
 -0.069755  -0.61327    0.27308   -0.42638   -0.085084  -0.17694
 -0.0090944  0.1109     0.

In [22]:
# Allocate the embedding lookup table: one row per vocabulary index.
# The vector width is read from the loaded embeddings instead of being
# hard-coded to 100, so swapping in a GloVe file of another dimensionality
# does not cause a shape mismatch when rows are filled in later.
embedding_dim = len(next(iter(embedding_dict.values())))
embedding_matrix = np.zeros((vocab_size, embedding_dim))
print("Embedding matrix's shape: ",np.shape(embedding_matrix))

Embedding matrix's shape:  (16, 100)


In [23]:
# Show every (word, index) pair learned by the tokenizer.
print(tokenizer.word_index.items())

dict_items([('nice', 1), ('great', 2), ('best', 3), ('amazing', 4), ('stop', 5), ('lies', 6), ('pitiful', 7), ('nerd', 8), ('excellent', 9), ('work', 10), ('supreme', 11), ('quality', 12), ('bad', 13), ('highly', 14), ('respectable', 15)])


In [28]:
# Look up the integer index the tokenizer assigned to a single word.
print(tokenizer.word_index['great'])

2


In [30]:
# Two equivalent lookups for a word that is present in the dictionary.
# Subscript access raises KeyError for a missing word, while .get() returns
# None instead.
print(embedding_dict['great'])
print(embedding_dict.get("great"))

[-0.013786   0.38216    0.53236    0.15261   -0.29694   -0.20558
 -0.41846   -0.58437   -0.77355   -0.87866   -0.37858   -0.18516
 -0.128     -0.20584   -0.22925   -0.42599    0.3725     0.26077
 -1.0702     0.62916   -0.091469   0.70348   -0.4973    -0.77691
  0.66045    0.09465   -0.44893    0.018917   0.33146   -0.35022
 -0.35789    0.030313   0.22253   -0.23236   -0.19719   -0.0053125
 -0.25848    0.58081   -0.10705   -0.17845   -0.16206    0.087086
  0.63029   -0.76649    0.51619    0.14073    1.019     -0.43136
  0.46138   -0.43585   -0.47568    0.19226    0.36065    0.78987
  0.088945  -2.7814    -0.15366    0.01015    1.1798     0.15168
 -0.050112   1.2626    -0.77527    0.36031    0.95761   -0.11385
  0.28035   -0.02591    0.31246   -0.15424    0.3778    -0.13599
  0.2946    -0.31579    0.42943    0.086969   0.019169  -0.27242
 -0.31696    0.37327    0.61997    0.13889    0.17188    0.30363
 -1.2776     0.044423  -0.52736   -0.88536   -0.19428   -0.61947
 -0.10146   -0.26301  

In [34]:
# Copy the pre-trained vector of every vocabulary word into the row of
# embedding_matrix given by that word's tokenizer index.
for token, row_idx in tokenizer.word_index.items():
    pretrained_vector = embedding_dict.get(token)

    if pretrained_vector is None:
        # No pre-trained vector for this word: its row is left unchanged.
        continue

    embedding_matrix[row_idx] = pretrained_vector

In [44]:
# Display the filled matrix (NumPy abbreviates large arrays with "...").
# No word maps to index 0, so row 0 keeps its initial zeros.
print(embedding_matrix)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [-0.18554001  0.047152    0.34867001 ...  0.095473   -0.1142
   0.32743001]
 [-0.013786    0.38216001  0.53236002 ... -1.04260004  0.28854999
   0.63055998]
 ...
 [ 0.39456001 -0.24717     1.03190005 ...  0.0064973   0.13793001
  -0.06832   ]
 [-0.90626001  0.11363    -0.050354   ... -0.89670998 -0.059254
  -0.058493  ]
 [-0.049773    0.19903     0.10585    ... -0.19187    -0.032502
   0.38025001]]


In [47]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten

# Width of each embedding vector, taken from the pre-trained matrix so the
# layer always matches the loaded vectors instead of a hard-coded 100.
output_dim = embedding_matrix.shape[1]

# `shape` must be a tuple: `(4)` is just the integer 4, which some Keras
# versions reject. Using max_len also keeps the input length in sync with the
# padded training data instead of repeating the literal 4.
inputs = Input(shape=(max_len,))

# Frozen embedding layer initialised with the pre-trained vectors.
# `input_length` is omitted: the sequence length is already fixed by `inputs`,
# and the argument is deprecated in recent Keras releases.
e = Embedding(vocab_size, output_dim, weights=[embedding_matrix], trainable=False)(inputs)

# Flatten the (max_len, output_dim) embeddings into one feature vector and
# classify it with a single sigmoid unit (binary sentiment).
e = Flatten()(e)
output = Dense(1, activation='sigmoid')(e)
model = Model(inputs=inputs, outputs=output)

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss for the
# two-class labels, and accuracy ('acc') reported as the metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
mod