In [23]:
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.callbacks import ModelCheckpoint
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
# define documents
docs = ["beyonc giselle knowles was born in houston texas",
		"beyonc attended st. mary elementary school in fredericksburg"]
# define class labels
labels = array([[0,0,0,1,1,0,0,0],[0,1,1,1,0,0,0,0]])
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(docs)
print(encoded_docs)
# pad documents to a max length of 4 words
max_length = 8
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)



Mounted at /content/gdrive
[[1, 2], [1, 3]]
[[1 2]
 [1 3]]


In [5]:
# load the whole embedding into memory
embeddings_index = dict()
file = '/content/gdrive/My Drive/BTech Project/glove.42B.300d.txt'
f = open(file)
for line in f:
	values = line.split()
	word = values[0]
	coefs = asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 1917494 word vectors.


In [0]:
# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, 300))
for word, i in t.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

In [25]:
print(embedding_matrix)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [-0.28489     0.60303003 -0.52146    ...  0.50771999  0.55796999
   0.097686  ]
 [ 0.45537999 -0.21951    -0.61341    ... -0.12565     0.39353999
   0.04288   ]
 [ 0.13523     0.20027     0.23433    ...  0.47446999 -0.04458
   0.098734  ]]


In [11]:
# define model
model = Sequential()
e = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=8, trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(8, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 8, 300)            4500      
_________________________________________________________________
flatten_3 (Flatten)          (None, 2400)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 8)                 19208     
Total params: 23,708
Trainable params: 19,208
Non-trainable params: 4,500
_________________________________________________________________
None
<keras.callbacks.ModelCheckpoint object at 0x7f9bb9d273c8>
Accuracy: 100.000000


In [19]:
texts ="beyonc was in born"
t.fit_on_texts(texts)
index_list = t.texts_to_sequences(texts)
x_train = pad_sequences(index_list, maxlen=max_length, padding='post')
preds = model.predict_classes(x_train)
print(preds)

[5 6 6 3 3 3 3 5 3 1 3 6 3 3 5 3 3 3]
