# Word embeddings for IMDB data
+ Here we initialise the embedding layer with pretrained GloVe word vectors and then fine-tune them on the IMDB reviews.

In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures directly in the notebook output.
%matplotlib inline

In [None]:
# Load the pre-tokenised IMDB dataset that ships with Keras, restricted to the
# num_of_words most frequent words.
# NOTE(review): X_train / y_train / X_test / y_test are reassigned further down
# by the train_test_split cell and nothing in between reads them, so in the
# visible cells this load has no effect on the trained model.
from keras.datasets import imdb

num_of_words = 1000

(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=num_of_words)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [None]:
import os

In [None]:
# Root of the raw IMDB dump. Raw string so the Windows backslashes can never be
# interpreted as escape sequences.
data_folder = r'D:\Warehouse\imdb'
train_dir = os.path.join(data_folder, 'train')

# Parallel lists: texts[k] is the review body, labels[k] its sentiment.
labels = []
texts = []

# The position in this list is the label: 'neg' -> 0, 'pos' -> 1.
for label_value, label_type in enumerate(['neg', 'pos']):
    # Separate name so data_folder keeps pointing at the dataset root.
    label_dir = os.path.join(train_dir, label_type)

    # os.listdir returns entries in arbitrary order; sorting makes the order of
    # texts/labels identical on every run and every machine.
    for file_name in sorted(os.listdir(label_dir)):
        # Only the .txt files are reviews; skip anything else in the folder.
        if not file_name.endswith('.txt'):
            continue

        file_path = os.path.join(label_dir, file_name)
        with open(file_path, 'r', encoding="utf8") as infile:
            texts.append(infile.read())
        labels.append(label_value)

# One summary line instead of two prints per file.
print('Loaded %d reviews from %s' % (len(texts), train_dir))
                


In [None]:
import numpy as np

# Sanity check on the loaded labels: total count first, then the number of
# reviews per class (position in the bincount result = label value).
label_count = len(labels)
print(label_count)

per_class_counts = np.bincount(labels)
print(per_class_counts)


# Tokenize data

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Vocabulary cap for the tokenizer; the same value is reused later as the
# number of rows of the embedding matrix and of the Embedding layer.
max_words = 10000

# Learn the word -> integer-id vocabulary from the raw review texts.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)

# Encode every review as a list of word ids (one list per review).
sequences = tokenizer.texts_to_sequences(texts)

In [None]:
# Full word -> integer-id dictionary learned by the tokenizer.
word_to_id_mapping = tokenizer.word_index

vocabulary_size = len(word_to_id_mapping)
print('Found %s unique tokens.' % vocabulary_size)

In [None]:
maxlen = 100  # Every review is padded or truncated to exactly this many tokens
# NOTE(review): the next two constants are not referenced by any visible cell;
# the actual split is done by train_test_split and fit(validation_split=...).
training_samples = 200  # unused in the visible cells
validation_samples = 10000  # unused in the visible cells

# Turn the ragged list of id sequences into one 2-D integer array. With the
# pad_sequences defaults, shorter reviews are zero-padded at the front and
# longer ones are truncated from the front (the last maxlen tokens are kept).
data = pad_sequences(sequences, maxlen=maxlen)

In [None]:
# Quick look at the padded matrix: its overall shape, then the first encoded review.
first_review = data[0]
print(data.shape)
print(first_review)

In [None]:
# Convert the Python list of 0/1 labels to a NumPy array so it exposes .shape
# and can be split alongside `data`. (The bare `len(labels)` that used to open
# this cell was a discarded expression and has been dropped.)
labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing.
# random_state pins the shuffle so Restart & Run All reproduces the same split;
# stratify keeps the neg/pos ratio identical in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [None]:
# Report the shape of each split, in the order: train X, train y, test X, test y.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

# Pre-process the embeddings

In [None]:
# Folder holding the GloVe 6B text files. Raw string so the Windows backslashes
# can never be interpreted as escape sequences.
glove_dir = r'D:\Warehouse\Embeddings\glove.6B'

# word -> float32 vector, filled from the 100-dimensional GloVe file.
embeddings_index = {}

# `with` guarantees the file is closed even if a line fails to parse.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), 'r', encoding="utf8") as f:
    for line in f:
        # Each line is: <word> <coef_1> ... <coef_n>, separated by whitespace.
        values = line.split()
        if not values:
            # Tolerate blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')

        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Inspect one entry of the GloVe table. Both values are printed explicitly:
# a bare expression that is not the last line of a cell is silently discarded.
the_vector = embeddings_index['the']
print(type(the_vector))
print(the_vector.shape)

In [None]:
# Width of the GloVe vectors, read off a word that is present in the table.
embedding_dim = embeddings_index['the'].shape[0]

# Row i holds the pretrained vector of the word whose tokenizer id is i.
# Rows stay all-zero for ids without a GloVe vector.
embedding_matrix = np.zeros((max_words, embedding_dim))

# Iterate over the tokenizer vocabulary (word -> id), NOT over the GloVe table:
# the Embedding layer looks rows up by tokenizer id, so filling rows in GloVe
# file order would attach unrelated vectors to every token.
for word, i in word_to_id_mapping.items():
    if i >= max_words:
        # Ids at or beyond max_words have no row in the matrix.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
#embedding_matrix[0]

# Define a model

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

# Same four-layer stack, declared in one go via the list constructor:
# id lookup -> flatten the (maxlen, embedding_dim) block -> small hidden layer
# -> single sigmoid unit for the binary sentiment output.
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

In [None]:
# Load the GloVe embeddings in the model: layers[0] is the Embedding layer,
# and its single weight array is replaced by embedding_matrix.

model.layers[0].set_weights([embedding_matrix])


# Left commented out on purpose: the embedding layer stays trainable so the
# pretrained vectors are fine-tuned. Uncomment to freeze them instead.
#model.layers[0].trainable = False

## Train and evaluate

In [None]:
# Binary classification setup: sigmoid output trained with binary cross-entropy.
# The metric is named 'acc' because the plotting cell reads the history keys
# 'acc' and 'val_acc'.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])

In [None]:
# Train for 10 epochs, with 20% of X_train/y_train set aside by Keras as
# validation data. X_test/y_test are not used here.
# `history` keeps the per-epoch metrics that the plotting cell reads.
history = model.fit(X_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2)

In [None]:
import matplotlib.pyplot as plt

# Per-epoch curves recorded by model.fit.
curves = history.history
acc, val_acc = curves['acc'], curves['val_acc']
loss, val_loss = curves['loss'], curves['val_loss']

# Epochs are numbered from 1 on the x axis.
epochs = range(1, len(acc) + 1)

# Figure 1: accuracy (dots = training, solid line = validation).
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(epochs, acc, 'bo', label='Training acc')
ax_acc.plot(epochs, val_acc, 'b', label='Validation acc')
ax_acc.set_title('Training and validation accuracy')
ax_acc.legend()

# Figure 2: loss, same marker convention.
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(epochs, loss, 'bo', label='Training loss')
ax_loss.plot(epochs, val_loss, 'b', label='Validation loss')
ax_loss.set_title('Training and validation loss')
ax_loss.legend()

plt.show()