In [1]:
# Importing Libraries And Modules
import os
import numpy as np
import tensorflow as tf                                                 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding

##### Processing The Labels of The Raw IMDB Data

In [2]:
# Root directory of the extracted aclImdb dataset. Set the IMDB_DIR environment
# variable to point somewhere else; the original hard-coded location is kept as
# the fallback so existing setups keep working unchanged.
imdb_dir = os.environ.get("IMDB_DIR", "/Users/huzaifa/Documents/Datasets/aclImdb")

In [3]:
# Training reviews are read from the 'train' subdirectory of the dataset root
# (its 'neg' and 'pos' subfolders are walked by the loading loop below).
train_dir = os.path.join(imdb_dir, 'train')

In [4]:
# Parallel accumulators filled by the loading loop: texts[k] holds a raw review
# and labels[k] holds its class id.
texts, labels = [], []

In [5]:
# Read every .txt review under train/neg and train/pos, appending the raw text
# to `texts` and its class to `labels` (0 = 'neg', 1 = 'pos') at the same index.
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    # sorted() makes the load order independent of the filesystem's listing order.
    for fname in sorted(os.listdir(dir_name)):
        if not fname.endswith('.txt'):
            continue
        # Explicit UTF-8 avoids decode errors on platforms whose default encoding
        # differs; the context manager closes the file even if read() raises.
        with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
            texts.append(f.read())
        labels.append(0 if label_type == 'neg' else 1)

##### Tokenizing The Text of The Raw IMDB Data

In [6]:
# Text preprocessing helpers. Both come from tensorflow.keras so they belong to
# the same Keras implementation as the Sequential/Dense/Embedding imports used
# elsewhere in this notebook (mixing `keras` and `tensorflow.keras` can pull in
# two different Keras installations).
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
# Preprocessing and split settings.
max_words = 10_000           # vocabulary cap: Tokenizer num_words and embedding-matrix rows
max_len = 100                # length passed to pad_sequences as maxlen
training_samples = 200       # number of shuffled reviews sliced off for training
validation_samples = 10_000  # number of shuffled reviews sliced off for validation

In [8]:
# Fit a tokenizer on all loaded reviews, then encode each review as a list of
# integer word indices. num_words is set to max_words; per the Keras Tokenizer
# documentation this restricts the encoded sequences to the most frequent words.
tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [9]:
# Word -> integer index mapping learned by the tokenizer. The printed count
# (88582) is larger than max_words, so this mapping is not truncated to the
# num_words cap; the cap is re-applied when the embedding matrix is filled.
words_index = tokenizer.word_index
print('Found %s unique tokens.' % len(words_index))

Found 88582 unique tokens.


In [10]:
# Bring every encoded review to exactly max_len entries, giving one fixed-width
# row per review (the shape printed in the following cell is (25000, 100)).
data = pad_sequences(sequences, maxlen = max_len)

In [11]:
# Convert the Python list of 0/1 labels to a NumPy array so it can be reordered
# with the same index array as `data`, then print both shapes as a sanity check
# that there is one label per review.
labels = np.asarray(labels)
print("Shape of data tensor: ", data.shape)
print("Shape of label tensor: ", labels.shape)

Shape of data tensor:  (25000, 100)
Shape of label tensor:  (25000,)


In [12]:
# Shuffle reviews and labels with one shared permutation. The reviews were
# loaded with every 'neg' file first and every 'pos' file second, so slicing
# without shuffling would produce single-class splits.
# A fixed seed keeps the train/validation split identical across
# Restart Kernel -> Run All; change the seed to draw a different split.
rng = np.random.default_rng(42)
indices = rng.permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

In [13]:
# The first `training_samples` shuffled rows form the training set; the next
# `validation_samples` rows form the validation set. Rows beyond that are unused
# by this cell.
x_train, y_train = data[:training_samples], labels[:training_samples]
x_val, y_val = (
    data[training_samples: training_samples + validation_samples],
    labels[training_samples: training_samples + validation_samples],
)

##### Preparing The GloVe Word Embeddings File

In [15]:
# Directory containing the GloVe vector files. Set the GLOVE_DIR environment
# variable to point somewhere else; the original hard-coded location is kept as
# the fallback so existing setups keep working unchanged.
glove_dir = os.environ.get('GLOVE_DIR', '/Users/huzaifa/Documents/Datasets/glove')

In [16]:
# Parse glove.6B.100d.txt into a {word: float32 coefficient vector} dictionary.
# Each line holds a token followed by its whitespace-separated coefficients.
embeddings_index = {}
# Explicit UTF-8 because the vocabulary contains non-ASCII tokens and the
# platform default encoding may differ; the context manager closes the file
# even if parsing a line raises.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype = 'float32')
        embeddings_index[word] = coefs

In [17]:
# Report how many distinct words received a pretrained vector.
print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


##### Preparing The GloVe Word Embeddings Matrix

In [19]:
# Width of each embedding vector; matches the 100d GloVe file loaded above and
# is used as the column count of the embedding matrix and the Embedding layer.
embedding_dim = 100

In [21]:
# One row per word index below max_words, one column per embedding dimension.
# Rows start as zeros and stay zero for any index that the fill loop does not
# assign a pretrained vector to.
embedding_matrix = np.zeros((max_words, embedding_dim))

In [27]:
# Copy each word's pretrained vector into the matrix row given by its tokenizer
# index. Words whose index falls outside the max_words cap, or that have no
# entry in embeddings_index, are skipped and keep their all-zero row.
for word, i in words_index.items():
    if i >= max_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

##### Model Definition

In [28]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

In [30]:
# Binary sentiment classifier: embed each of the max_len word indices, flatten
# the embedded sequence, pass it through a 32-unit ReLU layer, and emit a single
# sigmoid probability.
# NOTE(review): embedding_matrix is not loaded into the Embedding layer in this
# cell, and the summary reports every parameter as trainable — confirm the
# pretrained weights are set (and frozen, if intended) in a later cell.
model = Sequential([
    Embedding(max_words, embedding_dim, input_length = max_len),
    Flatten(),
    Dense(32, activation = 'relu'),
    Dense(1, activation = 'sigmoid'),
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          1000000   
                                                                 
 flatten (Flatten)           (None, 10000)             0         
                                                                 
 dense (Dense)               (None, 32)                320032    
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1,320,065
Trainable params: 1,320,065
Non-trainable params: 0
_________________________________________________________________
