## Data used: http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz

---
The dataset comprises 1,000 positive and 1,000 negative movie reviews.


In [None]:
# Mount Google Drive into the Colab runtime at /content/gdrive; every dataset,
# pickle and model path used later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Download the NLTK stop-word corpus; clean_doc reads it through
# stopwords.words('english') when filtering tokens.
import nltk
nltk.download('stopwords')

In [22]:
import re
import string
from collections import Counter
from os import listdir
from pickle import dump, load

from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers import Flatten
from keras.layers import Input
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import plot_model
from nltk.corpus import stopwords
from numpy import array

# Store the Vocabulary

---


In [None]:
# load doc into memory
def load_doc(filename):
  """Read the text file at `filename` and return its full contents as a string.

  The file is opened read-only in text mode with the platform default encoding.
  Any error raised by `open` or `read` propagates to the caller.
  """
  # the context manager closes the handle even if read() raises
  with open(filename, 'r') as file:
    return file.read()


# turn a doc into clean tokens
def clean_doc(doc):
  """Split `doc` on whitespace and return the list of tokens that survive cleaning.

  Each token has every `string.punctuation` character removed. It is kept only
  if what remains is purely alphabetic, is not an NLTK English stop word and is
  longer than one character. Token order is preserved.
  """
  punctuation = re.compile('[%s]' % re.escape(string.punctuation))
  english_stop_words = set(stopwords.words('english'))

  cleaned = []
  for raw in doc.split():
    word = punctuation.sub('', raw)
    # drop tokens that are empty or carry digits/symbols after stripping
    if not word.isalpha():
      continue
    if word in english_stop_words:
      continue
    # single letters carry no signal
    if len(word) <= 1:
      continue
    cleaned.append(word)
  return cleaned


# load all docs in a directory
def process_docs(directory, is_train):
  """Load and clean the reviews in `directory` that belong to one split.

  Files whose name starts with 'cv9' form the test split. When `is_train` is
  truthy those files are skipped; otherwise only those files are kept.
  Returns a list with one token list (from `clean_doc`) per selected file, in
  the order `listdir` yields the file names.
  """
  cleaned_docs = []
  for name in listdir(directory):
    in_test_split = name.startswith('cv9')
    # training keeps everything except cv9*, testing keeps only cv9*
    if bool(is_train) == in_test_split:
      continue
    raw_text = load_doc(directory + '/' + name)
    cleaned_docs.append(clean_doc(raw_text))
  return cleaned_docs


# load and clean a dataset
def load_clean_dataset(is_train):
  """Return `(docs, labels)` for the split selected by `is_train`.

  `docs` holds the cleaned negative reviews followed by the cleaned positive
  reviews; `labels` is the matching list of 0 (negative) and 1 (positive).
  """
  negative_docs = process_docs("/content/gdrive/My Drive/NLP_Projects/4/txt_sentoken/neg", is_train)
  positive_docs = process_docs("/content/gdrive/My Drive/NLP_Projects/4/txt_sentoken/pos", is_train)

  # labels follow the same neg-then-pos order as the concatenated documents
  labels = [0] * len(negative_docs) + [1] * len(positive_docs)
  return negative_docs + positive_docs, labels


# save a dataset to file
def save_dataset(dataset, filename):
  """Pickle `dataset` to `filename` and print a confirmation line.

  The file is opened in binary write mode, so an existing file is overwritten.
  """
  # the context manager flushes and closes the file before success is reported
  with open(filename, 'wb') as file:
    dump(dataset, file)
  print('Saved: %s' % filename)


# Clean every review once and cache both splits as pickles so the later
# cells can reload them without re-reading the raw text files.
train_docs, ytrain = load_clean_dataset(is_train=True)
test_docs, ytest = load_clean_dataset(is_train=False)

# write each split as a [documents, labels] pair
for split, target in (
    ([train_docs, ytrain], "/content/gdrive/My Drive/NLP_Projects/4/train.pkl"),
    ([test_docs, ytest], "/content/gdrive/My Drive/NLP_Projects/4/test.pkl"),
):
  save_dataset(split, target)

Saved: /content/gdrive/My Drive/NLP_Projects/4/train.pkl
Saved: /content/gdrive/My Drive/NLP_Projects/4/test.pkl


In [None]:
# save a list of strings to file, one item per line
def save_list(lines, filename):
  """Write the strings in `lines` to `filename`, separated by newlines."""
  with open(filename, 'w') as file:
    file.write('\n'.join(lines))


# define vocab
vocab = Counter()

# add all docs to vocab
# process_docs(directory, is_train) returns the token lists of one split at a
# time, so both splits are requested to count every review in each directory
for directory in ("/content/gdrive/My Drive/NLP_Projects/4/txt_sentoken/neg",
                  "/content/gdrive/My Drive/NLP_Projects/4/txt_sentoken/pos"):
  for is_train in (True, False):
    for doc_tokens in process_docs(directory, is_train):
      vocab.update(doc_tokens)

# print the size of the vocab
print(len(vocab))

# print the top words in the vocab
print(vocab.most_common(50))

# keep tokens that occur at least min_occurence times
min_occurence = 2
tokens = [k for k,c in vocab.items() if c >= min_occurence]
print(len(tokens))

# save tokens to a vocabulary file
save_list(tokens, "/content/gdrive/My Drive/NLP_Projects/4/vocab.txt")

46557
[('film', 8860), ('one', 5521), ('movie', 5440), ('like', 3553), ('even', 2555), ('good', 2320), ('time', 2283), ('story', 2118), ('films', 2102), ('would', 2042), ('much', 2024), ('also', 1965), ('characters', 1947), ('get', 1921), ('character', 1906), ('two', 1825), ('first', 1768), ('see', 1730), ('well', 1694), ('way', 1668), ('make', 1590), ('really', 1563), ('little', 1491), ('life', 1472), ('plot', 1451), ('people', 1420), ('movies', 1416), ('could', 1395), ('bad', 1374), ('scene', 1373), ('never', 1364), ('best', 1301), ('new', 1277), ('many', 1268), ('doesnt', 1267), ('man', 1266), ('scenes', 1265), ('dont', 1210), ('know', 1207), ('hes', 1150), ('great', 1141), ('another', 1111), ('love', 1089), ('action', 1078), ('go', 1075), ('us', 1065), ('director', 1056), ('something', 1048), ('end', 1047), ('still', 1038)]
27139


# Training the multi-channel model

---



In [23]:
# load a clean dataset
def load_dataset(filename):
  """Unpickle and return the object stored in `filename`.

  NOTE(review): pickle can execute arbitrary code while loading, so this must
  only be pointed at files this notebook wrote itself.
  """
  # the context manager closes the handle once the object has been read
  with open(filename, 'rb') as file:
    return load(file)

# fit a tokenizer
def create_tokenizer(lines):
  """Return a new Keras `Tokenizer` fitted on `lines`."""
  fitted = Tokenizer()
  fitted.fit_on_texts(lines)
  return fitted

# calculate the maximum document length
def max_length(lines):
  """Return the largest `len()` found among the items of `lines`.

  Raises ValueError when `lines` is empty.
  """
  return max(map(len, lines))

# encode a list of lines
def encode_text(tokenizer, lines, length):
  """Integer-encode `lines` with `tokenizer` and pad the result to `length`.

  Padding is added at the end of each sequence (`padding='post'`); the padded
  result of `pad_sequences` is returned.
  """
  return pad_sequences(
      tokenizer.texts_to_sequences(lines),
      maxlen=length,
      padding= 'post' ,
  )

# build one channel: embedding -> 1D convolution -> dropout -> max pooling -> flatten
def _build_channel(length, vocab_size, kernel_size, embedding_dim, filters, dropout_rate):
  """Return `(input_tensor, flattened_features)` for a single channel."""
  inputs = Input(shape=(length,))
  embedding = Embedding(vocab_size, embedding_dim)(inputs)
  conv = Conv1D(filters=filters, kernel_size=kernel_size, activation= 'relu' )(embedding)
  drop = Dropout(dropout_rate)(conv)
  pool = MaxPooling1D(pool_size=2)(drop)
  flat = Flatten()(pool)
  return inputs, flat


# define the model
def define_model(length, vocab_size, kernel_sizes=(4, 6, 8), embedding_dim=100,
                 filters=32, dropout_rate=0.5):
  """Build, compile and summarise the multichannel CNN classifier.

  One channel is created per entry in `kernel_sizes`. Each channel has its own
  input and embedding, so the model expects one input array per channel. The
  channel outputs are concatenated and passed through a 10-unit ReLU layer and
  a single sigmoid unit. The defaults reproduce the original three-channel
  network (kernel sizes 4, 6 and 8).

  Args:
    length: number of token ids in each input sequence.
    vocab_size: size of the embedding vocabulary.
    kernel_sizes: convolution kernel size for each channel.
    embedding_dim: embedding vector size used by every channel.
    filters: number of convolution filters per channel.
    dropout_rate: dropout rate applied after each convolution.

  Returns:
    The compiled Keras `Model` (binary cross-entropy loss, Adam optimizer,
    accuracy metric). Its summary is printed as a side effect.

  Raises:
    ValueError: if `kernel_sizes` is empty.
  """
  kernel_sizes = tuple(kernel_sizes)
  if not kernel_sizes:
    raise ValueError('kernel_sizes must contain at least one kernel size')

  # channels are created in order so layers are built in the same sequence as
  # the hand-written three-channel version
  channels = [
      _build_channel(length, vocab_size, size, embedding_dim, filters, dropout_rate)
      for size in kernel_sizes
  ]
  channel_inputs = [channel_input for channel_input, _ in channels]
  channel_features = [features for _, features in channels]

  # merge (a single channel is used directly rather than concatenated)
  if len(channel_features) > 1:
    merged = concatenate(channel_features)
  else:
    merged = channel_features[0]

  # interpretation
  dense1 = Dense(10, activation= 'relu' )(merged)
  outputs = Dense(1, activation= 'sigmoid' )(dense1)
  model = Model(inputs=channel_inputs, outputs=outputs)
  # compile
  model.compile(loss= 'binary_crossentropy' , optimizer= 'adam' , metrics=[ 'accuracy' ])
  # summarize
  model.summary()
  return model

In [24]:
# load training dataset
trainLines, trainLabels = load_dataset("/content/gdrive/My Drive/NLP_Projects/4/train.pkl")
# labels are stored as a plain Python list; Keras expects an array-like target,
# so convert once here
trainLabels = array(trainLabels)
# create tokenizer
tokenizer = create_tokenizer(trainLines)
# calculate max document length
length = max_length(trainLines)
print( ' Max document length: %d ' % length)
# calculate vocabulary size (+1 because index 0 is reserved for padding)
vocab_size = len(tokenizer.word_index) + 1
print( ' Vocabulary size: %d ' % vocab_size)
# encode data
trainX = encode_text(tokenizer, trainLines, length)
# define model
model = define_model(length, vocab_size)
# fit model: the same encoded reviews feed each of the three input channels
model.fit([trainX,trainX,trainX], trainLabels, epochs=7, batch_size=16)
# save the model
model.save("/content/gdrive/My Drive/NLP_Projects/4/model1.h5")

 Max document length: 1380 
 Vocabulary size: 44277 
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 1380)         0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 1380)         0                                            
__________________________________________________________________________________________________
input_7 (InputLayer)            (None, 1380)         0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 1380, 100)    4427700     input_5[0][0]                    
_______________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


# Evaluating the model

---


In [26]:
from keras.models import load_model

trainLines, trainLabels = load_dataset("/content/gdrive/My Drive/NLP_Projects/4/train.pkl")
testLines, testLabels = load_dataset("/content/gdrive/My Drive/NLP_Projects/4/test.pkl")
# labels are stored as plain Python lists; Keras expects array-like targets
trainLabels = array(trainLabels)
testLabels = array(testLabels)
# create tokenizer (refitted on the training reviews only, as during training,
# so the word indices match the ones the saved model was trained with)
tokenizer = create_tokenizer(trainLines)
# calculate max document length
length = max_length(trainLines)
print( ' Max document length: %d ' % length)
# calculate vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print( ' Vocabulary size: %d ' % vocab_size)
# encode data
trainX = encode_text(tokenizer, trainLines, length)
testX = encode_text(tokenizer, testLines, length)
# load the model
model = load_model("/content/gdrive/My Drive/NLP_Projects/4/model1.h5")
# evaluate model on training dataset
_, acc = model.evaluate([trainX,trainX,trainX], trainLabels, verbose=0)
print( ' Train Accuracy: %.2f ' % (acc*100))
# evaluate model on test dataset
_, acc = model.evaluate([testX,testX,testX], testLabels, verbose=0)
print( ' Test Accuracy: %.2f ' % (acc*100))

 Max document length: 1380 
 Vocabulary size: 44277 


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


 Train Accuracy: 100.00 
 Test Accuracy: 87.00 
