In [1]:
import numpy as np

In [2]:
samples = ['The cat sat on the mat.', 'the dog ate my fish?']

token_index = {}
for sample in samples:
  for word in sample.split():
    if word not in token_index:
      token_index[word] = len(token_index) + 1

max_length = 10

results = np.zeros(shape=(len(samples), max_length, max(token_index.values()) + 1))

In [3]:
results

array([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0

In [4]:
for i, sample in enumerate(samples):
  for j, word in list(enumerate(sample.split()))[:max_length]:
    index = token_index.get(word)
    results[i, j, index] = 1

In [5]:
results

array([[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0

#### Character Level One-hot encoding

In [6]:
import string
characters = string.printable # All printable ASCII characters

In [7]:
token_index = dict(zip(range(1, len(characters) + 1), characters))
max_length = 50

In [8]:
results = np.zeros((len(samples), max_length, max(token_index.keys()) + 1))
for i, sample in enumerate(samples):
  for j, character in enumerate(sample):
    index = token_index.get(character)
    results[i, j, index] = 1

In [9]:
results

array([[[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])

### Using Keras for word-level one-hot encoding

In [10]:
from keras.preprocessing.text import Tokenizer

In [11]:
tokenizer = Tokenizer(num_words=1000) # Configured to take the 1000 most common words into account
tokenizer.fit_on_texts(samples) # builds the word index

In [12]:
sequences = tokenizer.texts_to_sequences(samples) # turns words into list of integer indices
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

In [13]:
sequences

[[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]

In [14]:
one_hot_results

array([[0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [15]:
word_index = tokenizer.word_index
word_index

{'the': 1,
 'cat': 2,
 'sat': 3,
 'on': 4,
 'mat': 5,
 'dog': 6,
 'ate': 7,
 'my': 8,
 'fish': 9}

### Word-level encoding with hashing trick

In [16]:
dimensionality = 1000
max_length = 10

results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
  for j, word in list(enumerate(sample.split()))[:max_length]:
    index = abs(hash(word)) % dimensionality
    results[i, j, index] = 1

## Learning Word Embedding with the Embedding Layer

In [17]:
from keras.layers import Embedding
embedding_layer = Embedding(1000, 64)

In [18]:
from keras.datasets import imdb
from keras.utils import pad_sequences

In [19]:
max_features = 100000
maxlen = 200

In [20]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(
  num_words=max_features
)

In [21]:
x_train = pad_sequences(x_train, maxlen)
x_test = pad_sequences(x_test, maxlen)

In [22]:
x_train

array([[    5,    25,   100, ...,    19,   178,    32],
       [    0,     0,     0, ...,    16,   145,    95],
       [    0,     0,     0, ...,     7,   129,   113],
       ...,
       [    0,     0,     0, ...,     4,  3586, 22459],
       [    0,     0,     0, ...,    12,     9,    23],
       [    0,     0,     0, ...,   204,   131,     9]])

In [23]:
from keras.models import Sequential
from keras.layers import Flatten, Dense

model = Sequential()
model.add(Embedding(100000, 8, input_length=maxlen))
model.add(Flatten())
model.add(Dense(1, activation="sigmoid"))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 8)            800000    
                                                                 
 flatten (Flatten)           (None, 1600)              0         
                                                                 
 dense (Dense)               (None, 1)                 1601      
                                                                 
Total params: 801,601
Trainable params: 801,601
Non-trainable params: 0
_________________________________________________________________


In [24]:
history = model.fit(
  x_train, y_train,
  epochs=10,
  batch_size=32,
  validation_split=0.2
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
import os

imdb_dir = "./aclImdb/aclImdb"
train_dir = os.path.join(imdb_dir, 'train')

labels = []
texts = []

for label in ['neg', 'pos']:
  dir_url = os.path.join(train_dir, label)
  for fname in os.listdir(dir_url):
    if fname.endswith(".txt"):
      f = open(os.path.join(dir_url, fname), encoding="utf-8")
      texts.append(f.read())
      f.close()

    if label == 'neg':
      labels.append(0)
    else:
      labels.append(1)

In [26]:
labels

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [27]:
texts

["Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.",
 "Airport '77 starts as a brand new luxury 747 plane is loaded up with valuable paintings & such belonging to rich businessman Philip Stevens (James Stewart) who is flying them & a bunch of VIP's to his estate in preparation of it being opened to the public as a museum, also on board is Stevens daughter Julie (Kathleen Quinlan) & her son.

In [28]:
maxlen = 1000
training_samples = 20000
validation_samples = 10000
max_words = 100000

In [29]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [30]:
word_index = tokenizer.word_index

In [31]:
print(f"Found {len(word_index)} unique indices")

Found 88582 unique indices


In [32]:
data = pad_sequences(sequences, maxlen)

In [33]:
labels = np.asarray(labels)

In [34]:
data.shape

(25000, 1000)

In [35]:
labels.shape

(25000,)

In [36]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

In [37]:
x_train = data[:training_samples]
x_test = data[training_samples:training_samples + validation_samples]

y_train = labels[:training_samples]
y_test = labels[training_samples:training_samples + validation_samples]

In [38]:
glove_path = './glove.6B'

embeddings_index = {}

with open(os.path.join(glove_path, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.array(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [39]:
print(f"Found {len(embeddings_index)} word vectors")

Found 400000 word vectors


In [40]:
embeddings_index

{'the': array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
        -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
         0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
        -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
         0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
        -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
         0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
         0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
        -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
        -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
        -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
        -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
        -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
        -1.2526  ,  0.071624,  

#### Preparing the GloVe Word Embeddings Matrix

In [41]:
embedding_dim = 100

In [42]:
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
  if i < max_words:
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
      embedding_matrix[i] = embedding_vector

In [43]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.038194  , -0.24487001,  0.72812003, ..., -0.1459    ,
         0.82779998,  0.27061999],
       [-0.071953  ,  0.23127   ,  0.023731  , ..., -0.71894997,
         0.86894   ,  0.19539   ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [44]:
model = Sequential([
  Embedding(max_words, embedding_dim, input_length=maxlen),
  Flatten(),
  Dense(32, activation='relu'),
  Dense(1, activation='sigmoid')
])

In [45]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 1000, 100)         10000000  
                                                                 
 flatten_1 (Flatten)         (None, 100000)            0         
                                                                 
 dense_1 (Dense)             (None, 32)                3200032   
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 13,200,065
Trainable params: 13,200,065
Non-trainable params: 0
_________________________________________________________________


In [46]:
model.layers[0].set_weights([embedding_matrix])
model.layers[0].set_trainable = False

In [47]:
model.compile(
  optimizer='rmsprop',
  loss='binary_crossentropy',
  metrics=['acc']
)

In [48]:
history = model.fit(
  x_train, y_train,
  epochs=10, batch_size=32,
  validation_data=(x_test, y_test)
)

model.save_weights("pre_trained_glove_model.h5")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [49]:
model = Sequential([
  Embedding(max_words, embedding_dim, input_length=maxlen),
  Flatten(),
  Dense(32, activation='relu'),
  Dense(1, activation='sigmoid')
])

model.compile(
  optimizer='rmsprop',
  loss='binary_crossentropy',
  metrics=['acc']
)

history = model.fit(
  x_train, y_train,
  epochs=10, batch_size=32,
  validation_data=(x_test, y_test)
)

model.save_weights("model_1_0_0.h5")

Epoch 1/10
Epoch 2/10
Epoch 3/10

KeyboardInterrupt: 

In [50]:
test_dir = os.path.join(imdb_dir, 'test')

labels = []
texts = []

for label in ['neg', 'pos']:
  dir_name = os.path.join(test_dir, label)
  for fname in sorted(os.listdir(dir_name)):
    if fname[-4:] == ".txt":
      with open(os.path.join(dir_name, fname), encoding="utf-8") as f:
        texts.append(f.read())
      if label == "neg":
        labels.append(0)
      else:
        labels.append(1)


In [None]:
sequences = tokenizer.texts_to_sequences(texts)
x_test = pad_sequences(sequences=sequences, maxlen=maxlen)
y_test = np.asarray(labels)

Pre-trained Model

In [None]:
model.load_weights('pre_trained_glove_model.h5')
model.evaluate(x_test, y_test)



[2.456700086593628, 0.49908000230789185]

Bespoke model

In [None]:
model.load_weights("model_1_0_0.h5")
model.evaluate(x_test, y_test)



[3.95867657661438, 0.5056399703025818]

Pseudo RNN

In [None]:
timesteps = 100
input_features = 32
output_features = 64

inputs = np.random.random((timesteps, input_features))

state_t = np.zeros((output_features,))

W = np.random.random((output_features, input_features))
U = np.random.random((output_features, output_features))
b = np.random.random((output_features,))

successive_outputs = []
for input_t in inputs:
  output_t = np.tanh(np.dot(W, input_t) + np.dot(U, state_t) + b)

  successive_outputs.append(output_t)

  state_t = output_t

final_output_sequence = np.concatenate(successive_outputs, axis=0)

In [None]:
successive_outputs

[array([0.99999782, 0.99999994, 0.99999998, 1.        , 0.99999974,
        1.        , 0.99999997, 1.        , 0.99999983, 0.99999965,
        0.99999997, 1.        , 0.99999998, 0.99999968, 0.99999724,
        0.99999998, 0.99999976, 0.99999997, 1.        , 0.99999994,
        0.99999946, 0.99999994, 0.99999995, 0.99999952, 0.99999992,
        1.        , 0.99999995, 0.99999985, 0.99999998, 1.        ,
        0.99999992, 0.99999999, 0.99999999, 1.        , 0.9999996 ,
        0.99999999, 0.99999985, 0.99999984, 0.99999999, 0.99999998,
        0.99999997, 0.99999995, 1.        , 0.99999999, 0.99999897,
        0.99999998, 0.99999999, 1.        , 0.99999997, 0.99999964,
        1.        , 0.99999951, 0.99999999, 0.99999995, 0.99999894,
        0.99999885, 0.9999999 , 0.99999971, 0.99999998, 1.        ,
        1.        , 0.99999995, 1.        , 0.99999997]),
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.

In [None]:
from keras.layers import SimpleRNN

In [None]:
model = Sequential([
  Embedding(max_words, embedding_dim),
  SimpleRNN(embedding_dim, return_sequences=True),
  SimpleRNN(embedding_dim, return_sequences=True),
  SimpleRNN(embedding_dim, return_sequences=True),
  SimpleRNN(embedding_dim),
])

In [None]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, None, 100)         10000000  
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, None, 100)         20100     
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, None, 100)         20100     
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, None, 100)         20100     
                                                                 
 simple_rnn_4 (SimpleRNN)    (None, 100)               20100     
                                                                 
Total params: 10,080,400
Trainable params: 10,080,400
Non-trainable params: 0
_________________________________________________________________


In [None]:
max_features = 10000
maxlen = 5000
batch_size = 32

In [None]:
(input_train, y_train), (input_test, y_test) = imdb.load_data(num_words=max_features)

In [None]:
len(input_train)

25000

In [None]:
len(input_test)

25000

In [None]:
x_train = pad_sequences(input_train, maxlen=maxlen)
x_test = pad_sequences(input_test, maxlen=maxlen)

In [None]:
model = Sequential([
  Embedding(max_features, 32),
  SimpleRNN(32),
  Dense(1, activation='sigmoid')
])

In [None]:
model.compile(
  optimizer="rmsprop",
  loss='binary_crossentropy',
  metrics=['acc']
)

In [None]:
history = model.fit(
  x_train, y_train,
  epochs=10,
  batch_size=128,
  validation_split=0.2
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
from keras.layers import LSTM

In [None]:
model = Sequential([
  Embedding(max_features, 32),
  LSTM(32),
  Dense(1, activation="sigmoid")
])

In [None]:
model.compile(
  optimizer="rmsprop",
  loss = 'binary_crossentropy',
  metrics=['acc']
)

In [None]:
history = model.fit(
  x_train, y_train,
  epochs=10,
  batch_size=128,
  validation_split=0.2
)

Epoch 1/10

In [None]:
def generator(
  data, lookback, delay, min_index, max_index,
  shuffle=False, batch_size=128, step=6):

  if max_index is None:
    max_index = len(data) - delay - 1
  i = min_index + lookback
  while 1:
    if shuffle:
      rows = np.random.randomint(
        min_index + lookback, max_index, size=batch_size
      )
    else:
      if i + batch_size >= max_index:
        i = min_index + lookback
      rows = np.arange(i, min(i + batch_size, max_index))
      i += len(rows)

    samples = np.zeros((
      len(rows), lookback // step,
      data.shape[-1])
    )

    targets = np.zeros((
      len(rows),
    ))

    for j, row in enumerate(rows):
      indices = range(rows[j] - lookback, rows[j], step)
      samples[j] = data[indices]
      targets[j] = data[rows[j] + delay][1]
  
  yield samples, targets

In [None]:
lookback = 1440
step = 6
delay = 144
batch_size = 128

In [None]:
train_gen = generator(
  float_data,
  lookback,
  delay,
  0, 200000,
  True,
  step, batch_size
)