# 2 types/techniques of converting tokens to tensors:
1. One hot encoding.
2. Word embedding.

## One hot encoding

### Word level one hot encoding

In [1]:
# create a vector of n-word vector space, each word maps to a vector with one value corresponding to that word as one and others as 0.
import numpy as np
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
token_index = {} # Tokenizes input via split method
for sample in samples:
    for word in sample.split():
        if word not in token_index:
            token_index[word]=len(token_index)+1
max_len=10;
results=np.zeros(shape=(len(samples), max_len, len(token_index)+1))
# results is matrix representation of samples.
# so first dimension is no. of sample, 2nd dimension is max length of each sample that will b econsidered(max no. of words ina sample)
# and 3rd dimension is vector size of vector representation of each word.

for i,sample in enumerate(samples):
    for j, word in enumerate(sample.split()[:max_len]):
        results[i,j,token_index[word]]=1
results.shape

(2, 10, 11)

### character level hot encoding

In [2]:
# each character represented by a vector
import string 
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
characters = string.printable # all printable ascci characters, has nothing to do with samples
token_index = dict(zip(range(1, len(characters) + 1), characters))
max_len=100

results=np.zeros(shape=(len(samples), max_len, len(token_index)+1))

for i, sample in enumerate(samples):
    for j, character in enumerate(sample):
        index=token_index.get(character)
        results[i, j, index] = 1
        
results

array([[[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])

### using keras for word level one hot encoding

In [3]:

from keras.preprocessing.text import Tokenizer
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
tokenizer=Tokenizer(num_words=1000) # create a tokenizer with 1000 most frequently used words only.

tokenizer.fit_on_texts(samples) # fit the tokenizer of text samples (this part is where it finds things like 1000 most common words)
sequences=tokenizer.texts_to_sequences(samples) # convert list of texts to list of integer indeces, where each index corresponds to a word
# sequences will be [[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]

one_hot_results=tokenizer.texts_to_matrix(samples, mode='binary') # directly converts list of texts to one-hot encoded matrix
#one_hot_results.shape = (2,1000)
# each sentence is a vector, with 1 at locations corresponding to words that exist in sentence, 0 at other locations.
# this is different form one_hot_encoding in 1.1.1, which contained info on placement of each word in the sentence as well.
one_hot_results
# modes can be binary, tfidf, count, freq -> last 2 will give count and frequency of each word in sentence.
# frequency of a word is= count of word in sentence/total words in sentence


word_index = tokenizer.word_index # recover the word index that was copied
print('Found %s unique tokens.' % len(word_index), word_index)

Using TensorFlow backend.


Found 9 unique tokens. {'the': 1, 'cat': 2, 'sat': 3, 'on': 4, 'mat': 5, 'dog': 6, 'ate': 7, 'my': 8, 'homework': 9}


## one-hot hashing

In [4]:
from keras.preprocessing.text import Tokenizer
import numpy as np

In [5]:
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

tokenizer=Tokenizer(num_words=1000)

tokenizer.fit_on_texts(samples)

# sentence as sequence on numbers
sequences=tokenizer.texts_to_sequences(samples)
print(sequences)

# directly one hot encode the sentences(gives 2*1000 ndarray)
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
one_hot_results[1]

#getting the word_index
word_index=tokenizer.word_index
word_index

[[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]


{'the': 1,
 'cat': 2,
 'sat': 3,
 'on': 4,
 'mat': 5,
 'dog': 6,
 'ate': 7,
 'my': 8,
 'homework': 9}

## one-hot hashing trick

A variant of one-hot encoding is the so-called one-hot hashing trick, which you can use
when the number of unique tokens in your vocabulary is too large to handle explicitly.
Instead of explicitly assigning an index to each word and keeping a reference of these
indices in a dictionary, you can hash words into vectors of fixed size. This is typically
done with a very lightweight hashing function. The main advantage of this method is
that it does away with maintaining an explicit word index, which saves memory and
allows online encoding of the data (you can generate token vectors right away, before
you’ve seen all of the available data). The one drawback of this approach is that it’s
susceptible to hash collisions: two different words may end up with the same hash, and
subsequently any machine-learning model looking at these hashes won’t be able to tell
the difference between these words. The likelihood of hash collisions decreases when
the dimensionality of the hashing space is much larger than the total number of
unique tokens being hashed.

In [6]:
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
dimensionality=1000
max_length=10
results=np.zeros((len(samples), max_length, dimensionality ))
for i,sentence in enumerate(samples):
    for j, word in enumerate(sentence[:max_length]):
        word_hash=abs(hash(word))%dimensionality
        results[i, j, word_hash] = 1.
results

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])

## using word embeddings

Another popular and powerful way to associate a vector with a word is the use of dense
word vectors, also called word embeddings. Whereas the vectors obtained through one-hot
encoding are binary, sparse (mostly made of zeros), and very high-dimensional (same
dimensionality as the number of words in the vocabulary), word embeddings are lowdimensional
floating-point vectors (that is, dense vectors, as opposed to sparse vectors);
see figure 6.2. Unlike the word vectors obtained via one-hot encoding, word
embeddings are learned from data. It’s common to see word embeddings that are
256-dimensional, 512-dimensional, or 1,024-dimensional when dealing with very large
vocabularies. On the other hand, one-hot encoding words generally leads to vectors
that are 20,000-dimensional or greater (capturing a vocabulary of 20,000 tokens, in
this case). So, word embeddings pack more information into far fewer dimensions.

### Learn word embeddings jointly with the main task
Learn word embeddings jointly with the main task you care about (such as document
classification or sentiment prediction). In this setup, you start with random
word vectors and then learn word vectors in the same way you learn the
weights of a neural network.

#### Instantiating an Embedding layer
The Embedding layer is best understood as a dictionary that maps integer indices
(which stand for specific words) to dense vectors. It takes integers as input, it looks up
these integers in an internal dictionary, and it returns the associated vectors. It’s effectively
a dictionary lookup (see figure 6.4).

In [7]:
from keras.layers import Embedding
"""
The Embedding layer takes at least two
arguments: the number of possible tokens
(here, 1,000: 1 + maximum word index)
and the dimensionality of the embeddings
(here, 64).
"""
embedding_layer = Embedding(1000, 64)
embedding_layer


<keras.layers.embeddings.Embedding at 0x642946198>

The Embedding layer takes as input a 2D tensor of integers, of shape (samples,
sequence_length), where each entry is a sequence of integers. It can embed
sequences of variable lengths: for instance, you could feed into the Embedding layer in
the previous example batches with shapes (32, 10) (batch of 32 sequences of length
10) or (64, 15) (batch of 64 sequences of length 15). All sequences in a batch must
have the same length, though (because you need to pack them into a single tensor),
so sequences that are shorter than others should be padded with zeros, and sequences
that are longer should be truncated.
This layer returns a 3D floating-point tensor of shape (samples, sequence_
length, embedding_dimensionality). Such a 3D tensor can then be processed by
an RNN layer or a 1D convolution layer.

When you instantiate an Embedding layer, its weights (its internal dictionary of
token vectors) are initially random, just as with any other layer. During training, these
word vectors are gradually adjusted via backpropagation

#### Loading the IMDB data for use with an Embedding layer
restrict the
movie reviews to the top 10,000 most common words (as you did the first time you
worked with this dataset) and cut off the reviews after only 20 words. The network will
learn 8-dimensional embeddings for each of the 10,000 words turn the input integer
sequences (2D integer tensor) into embedded sequences (3D float tensor), flatten the
tensor to 2D, and train a single Dense layer on top for classification

In [8]:
from keras.datasets import imdb
from keras import preprocessing

max_features=10000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
# here each sample in x_train is list of integers(with length equal to no. of words in it.(only words in top 10k are included though))
#x_train

In [9]:
x_train

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
       list([1, 194, 1153, 194, 8255, 78, 228,

In [10]:
max_len=20
x_train=preprocessing.sequence.pad_sequences(x_train, maxlen=max_len)
x_test=preprocessing.sequence.pad_sequences(x_test, maxlen=max_len)

# making each review of length 20, by picking first 20 words only and if less that 20 words, adding padding chars.

x_train

array([[  65,   16,   38, ...,   19,  178,   32],
       [  23,    4, 1690, ...,   16,  145,   95],
       [1352,   13,  191, ...,    7,  129,  113],
       ...,
       [  11, 1818, 7561, ...,    4, 3586,    2],
       [  92,  401,  728, ...,   12,    9,   23],
       [ 764,   40,    4, ...,  204,  131,    9]], dtype=int32)

#### Using an Embedding layer and classifier on the IMDB data

In [11]:
from keras.models import Sequential
from keras.layers import Embedding, Dense, Flatten

model=Sequential()
model.add(Embedding(10000, output_dim=8, input_length=max_len)) # turns 2d tensor into a 3d tensor of shape(samples, maxlen, 8). 8 is dimensionality
# specified 10k so that it is convenient to flatten the o/p to 2d matrx later
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 8)             80000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________


In [12]:
history = model.fit(x_train, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [13]:
model.evaluate(x_test, y_test)



[0.5215603371810913, 0.7535600066184998]

### pretrained word embeddings.
Load into your model word embeddings that were precomputed using a different
machine-learning task than the one you’re trying to solve.

You’ll use a model similar to the one we just went over: embedding sentences in
sequences of vectors, flattening them, and training a Dense layer on top. But you’ll do
so using pretrained word embeddings; and instead of using the pretokenized IMDB
data packaged in Keras, you’ll start from scratch by downloading the original text data.

#### Processing the labels of the raw IMDB data


In [32]:
import os
imdb_dir="data/aclImdb"
train_dir=os.path.join(imdb_dir, "train")
test_dir=os.path.join(imdb_dir, "test")

labels=[]
texts=[]

for label_type in ['neg', 'pos']:
    dir_name=os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname.endswith(".txt"):
            with open(os.path.join(dir_name, fname), 'r') as fp:
                texts.append(fp.read())
            if label_type=='neg':
                labels.append(0)
            else:
                labels.append(1)
#labels, texts

#### Tokenizing the text of the raw IMDB data

Let’s vectorize the text and prepare a training and validation split, using the concepts
introduced earlier in this section. Because pretrained word embeddings are meant to
be particularly useful on problems where little training data is available (otherwise,
task-specific embeddings are likely to outperform them), we’ll add the following twist:
restricting the training data to the first 200 samples. So you’ll learn to classify movie
reviews after looking at just 200 examples.

In [33]:
from keras.preprocessing import text
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

max_len=100 # cuts off reviews after 100 words
training_samples=200
validation_samples=10000
max_words=10000

tokenizer=Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)

sequences=tokenizer.texts_to_sequences(texts)

word_index=tokenizer.word_index

print('Found %s unique tokens.' % len(word_index))

Found 88582 unique tokens.


In [34]:
data = pad_sequences(sequences, max_len)
labels=np.asarray(labels)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (25000, 100)
Shape of label tensor: (25000,)


In [35]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

#### Download and parse glove word encodeings
learn embeddings from glove database
it is a database of 2014 wikipedia articles of more that few 100 thousand articles, 400k word vectors and 100 dimensions

In [36]:
glove_dir="data/glove.6B"
embeddings_index = {}
with open(os.path.join(glove_dir, 'glove.6B.100d.txt')) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


#### Preparing the GloVe word-embeddings matrix
build an embedding matrix that you can load into an Embedding layer. It
must be a matrix of shape (max_words, embedding_dim), where each entry i contains
the embedding_dim-dimensional vector for the word of index i in the reference word
index (built during tokenization). Note that index 0 isn’t supposed to stand for any
word or token—it’s a placeholder.

In [19]:
#embeddings_index['the']

In [37]:
embedding_dim=100
embedding_matrix=np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i<max_words:
        try:
            embedding_vector=embeddings_index[word]
        except:
            #print(word)
            continue
        if embedding_vector is not None:
            embedding_matrix[i]=embedding_vector
embedding_matrix
    

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.038194  , -0.24487001,  0.72812003, ..., -0.1459    ,
         0.82779998,  0.27061999],
       [-0.071953  ,  0.23127   ,  0.023731  , ..., -0.71894997,
         0.86894   ,  0.19539   ],
       ...,
       [-0.44036001,  0.31821999,  0.10778   , ..., -1.29849994,
         0.11824   ,  0.64845002],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.54539001, -0.31817999, -0.016281  , ..., -0.44865   ,
         0.067047  ,  0.17975999]])

#### Defining the model

In [38]:
# using same model architecture s before: embedding->flatten->dense
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

model=Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_len))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_4 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_6 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 33        
Total params: 1,320,065
Trainable params: 1,320,065
Non-trainable params: 0
_________________________________________________________________


#### setting weights of embedding layer

The Embedding layer has a single weight matrix: a 2D float matrix where each entry i is
the word vector meant to be associated with index i. Simple enough. Load the GloVe
matrix you prepared into the Embedding layer, the first layer in the model.

In [39]:
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable=False
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_4 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_6 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 33        
Total params: 1,320,065
Trainable params: 320,065
Non-trainable params: 1,000,000
_________________________________________________________________


#### compile and train and see the performance!

In [40]:
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history=model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=10, batch_size=32)
model.save_weights('pre_trained_glove_model.h5')

Train on 200 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
# this is causing kernel to die
"""
import matplotlib.pyplot as plt
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.figure()
plt.show()
"""

"\nimport matplotlib.pyplot as plt\nacc = history.history['acc']\nval_acc = history.history['val_acc']\nloss = history.history['loss']\nval_loss = history.history['val_loss']\nepochs = range(1, len(acc) + 1)\nplt.plot(epochs, acc, 'bo', label='Training acc')\nplt.plot(epochs, val_acc, 'b', label='Validation acc')\nplt.title('Training and validation accuracy')\nplt.legend()\nplt.figure()\nplt.plot(epochs, loss, 'bo', label='Training loss')\nplt.plot(epochs, val_loss, 'b', label='Validation loss')\nplt.title('Training and validation loss')\nplt.legend()\nplt.figure()\nplt.show()\n"

In [25]:
#plt.rcdefaults()


### Training the same model without pretrained word embeddings

This will give much lower validation accuracy if number of sample is still lower, will increase if train samples increase.

In [27]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_len))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
history = model.fit(x_train, y_train,epochs=10,batch_size=32,validation_data=(x_val, y_val))

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_3 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 33        
Total params: 1,320,065
Trainable params: 1,320,065
Non-trainable params: 0
_________________________________________________________________


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 200 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Evaluate model on test data

In [44]:
test_dir = os.path.join(imdb_dir, 'test')
labels = []
texts = []
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(test_dir, label_type)
    for fname in sorted(os.listdir(dir_name)):
        if fname[-4:] == '.txt':
            with open(os.path.join(dir_name, fname), 'r') as f:
                texts.append(f.read())
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)
sequences = tokenizer.texts_to_sequences(texts)
x_test = pad_sequences(sequences, maxlen=max_len)
y_test = np.asarray(labels)

In [46]:
model.load_weights('pre_trained_glove_model.h5')
model.evaluate(x_test, y_test)



[0.7854385869979859, 0.5796800255775452]