In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.utils import to_categorical


In [4]:
def load_glove_embedding(file_path):
    """Read a GloVe-style text file into a word -> vector dictionary.

    Every non-blank line is split on whitespace; the first field is used as
    the word and the remaining fields are parsed as its coefficients.

    Args:
        file_path: Path of the UTF-8 encoded embedding file.

    Returns:
        dict mapping each word (str) to a 1-D float32 numpy array holding
        the coefficients found on that word's line.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a coefficient field cannot be parsed as a float.
    """
    embedding_index = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            # Blank lines (e.g. a trailing empty line at the end of the file)
            # split into an empty list; indexing values[0] would raise
            # IndexError, so skip them.
            if not values:
                continue
            word = values[0]
            embedding_index[word] = np.asarray(values[1:], dtype='float32')
    return embedding_index

In [7]:
# Settings shared by the tokenizer, the padding step and the model.
num_classes = 2    # number of target classes (one-hot width and Dense output size)
embd_dim = 100     # width of each row of the embedding matrix
max_length = 100   # maxlen handed to pad_sequences
max_words = 1000   # num_words handed to the Tokenizer

In [8]:
# Toy corpus: two documents and one class id per document, in the same order.
texts = [
    'Sample text data for good text classification',
    'bad sample text data part two',
]
labels = [0, 1]

In [9]:
# Turn the raw texts into fixed-length integer sequences and the class ids
# into one-hot rows.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
word_sequence = tokenizer.texts_to_sequences(texts)
num_sequence = pad_sequences(word_sequence, maxlen=max_length)
# `labels` is rebound to its one-hot form below, so only encode while it is
# still the 1-D collection of class ids. Without the guard, re-running this
# cell would one-hot encode the already encoded array and add an extra axis,
# which no longer matches the model's output shape.
if np.ndim(labels) == 1:
    labels = to_categorical(labels, num_classes=num_classes)

In [10]:
# Path of the pre-trained GloVe vectors (relative to the working directory).
file_path = "Dataset/glove.6B.100d.txt"
# Parse the file into a word -> coefficient-vector lookup.
embedding_index = load_glove_embedding(file_path)

In [14]:
# Build the weight matrix for the Embedding layer: row i receives the GloVe
# vector of the word whose tokenizer index is i. Rows for words that GloVe
# does not cover stay all-zero, as does row 0 (Keras word indices start at 1).
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index)+1, embd_dim), dtype=float)
for word, i in word_index.items():
    # The tokenizer was created with num_words=max_words, so indices at or
    # beyond that cap never occur in the sequences it produces; skip the
    # lookup and leave those rows as zeros. (A bound of len(word_index) can
    # never trigger, because every index is at most len(word_index).)
    if i >= max_words:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [18]:
# Frozen pre-trained embedding -> one SimpleRNN layer -> softmax over classes.
model = Sequential([
    Embedding(
        input_dim=len(word_index)+1,
        output_dim=embd_dim,
        weights=[embedding_matrix],
        input_length=max_length,
        trainable=False,
    ),
    SimpleRNN(128),
    Dense(num_classes, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [19]:
# Train on the padded sequences against the one-hot encoded labels.
model.fit(x=num_sequence, y=labels, batch_size=2, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x227dfc72f10>

In [20]:
# Push a new sentence through the same tokenizer + padding pipeline used for
# training, then print the index of the highest-scoring class per input row.
test_tokens = ['testing example']
test_word_sequence = tokenizer.texts_to_sequences(test_tokens)
test_num_sequence = pad_sequences(test_word_sequence, maxlen=max_length)

prediction = model.predict(test_num_sequence)
predicted_classes = np.argmax(prediction, axis=1)
print(predicted_classes)

[0]
