## Data used: http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz

---
The dataset comprises 1,000 positive and 1,000 negative movie reviews.


In [0]:
# Mount Google Drive at /content/gdrive; the dataset, vocabulary file and saved
# model used by later cells are all addressed through paths under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
# Download NLTK's 'stopwords' resource; the vocabulary-building clean_doc calls
# stopwords.words('english') to filter tokens.
import nltk
nltk.download('stopwords')

In [0]:
import re
import string
from collections import Counter
from os import listdir

from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import plot_model
from nltk.corpus import stopwords
from numpy import array

# Store the Vocabulary

---


In [0]:
# load doc into memory
def load_doc(filename):
  """Read a whole text file and return its contents as a single string.

  Args:
    filename: path of the file to read (opened in text mode, read only).

  Returns:
    The complete text of the file.
  """
  # the context manager closes the handle even when read() raises
  with open(filename, 'r') as file:
    return file.read()


# turn a doc into clean tokens
def clean_doc(doc):
  """Tokenise a raw document into a filtered list of words.

  The text is split on whitespace and punctuation characters are stripped
  from every token. A token survives only if it is purely alphabetic, is not
  in the list returned by stopwords.words('english'), and is longer than one
  character. Matching is case-sensitive; no lower-casing is applied.

  Args:
    doc: raw document text.

  Returns:
    List of surviving tokens, in document order.
  """
  raw_tokens = doc.split()
  punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
  english_stops = set(stopwords.words('english'))

  cleaned = []
  for raw in raw_tokens:
    word = punctuation_re.sub('', raw)
    # drop numbers / mixed tokens, stop words and single letters
    if not word.isalpha():
      continue
    if word in english_stops:
      continue
    if len(word) <= 1:
      continue
    cleaned.append(word)
  return cleaned


# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
  """Count the cleaned tokens of one file into ``vocab``.

  Args:
    filename: path of the document to load.
    vocab: object with an ``update`` method (a Counter in this notebook);
      it receives the token list and is modified in place.
  """
  # load -> clean -> count, in one pass
  vocab.update(clean_doc(load_doc(filename)))


# load all docs in a directory
def process_docs(directory, vocab):
  """Add every training document found in ``directory`` to ``vocab``.

  Files whose names start with 'cv9' form the held-out test set and are left
  out of the vocabulary.

  Args:
    directory: folder containing the review files.
    vocab: counter passed through to add_doc_to_vocab for every kept file.
  """
  training_files = (name for name in listdir(directory) if not name.startswith('cv9'))
  for name in training_files:
    # build "<directory>/<file>" and count its tokens
    add_doc_to_vocab('/'.join((directory, name)), vocab)


# save list to file
def save_list(lines, filename):
  """Write ``lines`` to ``filename``, one item per line.

  Items are joined with '\\n'; no trailing newline is appended. An existing
  file is overwritten.

  Args:
    lines: iterable of strings to store.
    filename: destination path.
  """
  data = '\n'.join(lines)
  # the context manager closes the handle even when write() raises
  with open(filename, 'w') as file:
    file.write(data)

In [0]:
# NOTE: run this cell before the later cell that redefines process_docs and
# clean_doc with extra arguments; after that redefinition the two-argument
# calls below no longer match.

# folder holding the 'neg' and 'pos' review sub-directories
REVIEWS_DIR = "/content/gdrive/My Drive/NLP_Projects/4/txt_sentoken"

# define vocab
vocab = Counter()

# add all training docs to vocab (process_docs skips the 'cv9*' test reviews)
for polarity in ("neg", "pos"):
  process_docs(REVIEWS_DIR + "/" + polarity, vocab)

# print the size of the vocab
print(len(vocab))

# print the top words in the vocab
print(vocab.most_common(50))

# keep tokens that occur at least min_occurence times
min_occurence = 2
tokens = [k for k,c in vocab.items() if c >= min_occurence]
print(len(tokens))

# save tokens to a vocabulary file
save_list(tokens, "/content/gdrive/My Drive/NLP_Projects/4/vocab.txt")

46557
[('film', 8860), ('one', 5521), ('movie', 5440), ('like', 3553), ('even', 2555), ('good', 2320), ('time', 2283), ('story', 2118), ('films', 2102), ('would', 2042), ('much', 2024), ('also', 1965), ('characters', 1947), ('get', 1921), ('character', 1906), ('two', 1825), ('first', 1768), ('see', 1730), ('well', 1694), ('way', 1668), ('make', 1590), ('really', 1563), ('little', 1491), ('life', 1472), ('plot', 1451), ('people', 1420), ('movies', 1416), ('could', 1395), ('bad', 1374), ('scene', 1373), ('never', 1364), ('best', 1301), ('new', 1277), ('many', 1268), ('doesnt', 1267), ('man', 1266), ('scenes', 1265), ('dont', 1210), ('know', 1207), ('hes', 1150), ('great', 1141), ('another', 1111), ('love', 1089), ('action', 1078), ('go', 1075), ('us', 1065), ('director', 1056), ('something', 1048), ('end', 1047), ('still', 1038)]
27139


# Splitting the data into train and test sets

---



In [0]:
# load all docs in a directory
def process_docs(directory, vocab, is_train):
  """Load and clean either the training or the test reviews of a folder.

  Files whose names start with 'cv9' are the test split; everything else is
  the training split. Note that this definition replaces the two-argument
  process_docs defined earlier in the notebook.

  Args:
    directory: folder containing the review files.
    vocab: vocabulary handed to clean_doc for token filtering.
    is_train: truthy to return the training split, falsy for the test split.

  Returns:
    List with one cleaned document (as produced by clean_doc) per kept file.
  """
  # keep a file when its "is a cv9 test file" flag differs from is_train
  selected = [name for name in listdir(directory)
              if name.startswith('cv9') != bool(is_train)]
  return [clean_doc(load_doc('/'.join((directory, name))), vocab)
          for name in selected]


# turn a doc into clean tokens
def clean_doc(doc, vocab):
  """Reduce a document to the words that appear in ``vocab``.

  The text is split on whitespace, punctuation characters are stripped from
  each token, and only tokens found in ``vocab`` are kept. Matching is
  case-sensitive. Note that this definition replaces the one-argument
  clean_doc defined earlier in the notebook.

  Args:
    doc: raw document text.
    vocab: container supporting ``in`` (a set of known words in this notebook).

  Returns:
    The kept tokens joined by single spaces, as one string.
  """
  strip_punctuation = re.compile('[%s]' % re.escape(string.punctuation)).sub
  kept = []
  for raw in doc.split():
    word = strip_punctuation('', raw)
    if word in vocab:
      kept.append(word)
  return ' '.join(kept)


# load and clean a dataset
def load_clean_dataset(vocab, is_train):
  """Load the negative and positive reviews of one split together with labels.

  Args:
    vocab: vocabulary forwarded to process_docs for token filtering.
    is_train: forwarded to process_docs to choose the training or test split.

  Returns:
    (docs, labels): negative documents followed by positive documents, and a
    parallel list holding 0 for every negative and 1 for every positive one.
  """
  negative_docs = process_docs("/content/gdrive/My Drive/NLP_Projects/4/txt_sentoken/neg", vocab, is_train)
  positive_docs = process_docs("/content/gdrive/My Drive/NLP_Projects/4/txt_sentoken/pos", vocab, is_train)

  # labels follow the same neg-then-pos ordering as the documents
  labels = [0] * len(negative_docs) + [1] * len(positive_docs)

  return negative_docs + positive_docs, labels


# fit a tokenizer
def create_tokenizer(lines):
  """Build a Tokenizer with default settings and fit it on ``lines``.

  Args:
    lines: the texts to fit on (the cleaned training documents in this notebook).

  Returns:
    The fitted Tokenizer instance.
  """
  fitted = Tokenizer()
  fitted.fit_on_texts(lines)
  return fitted


# integer encode and pad documents
def encode_docs(tokenizer, max_length, docs):
  """Turn documents into fixed-length integer sequences.

  Args:
    tokenizer: fitted tokenizer providing ``texts_to_sequences``.
    max_length: length passed to pad_sequences as ``maxlen``.
    docs: list of document strings to encode.

  Returns:
    Whatever pad_sequences returns for the encoded documents, padded at the
    end ('post').
  """
  return pad_sequences(
    tokenizer.texts_to_sequences(docs),
    maxlen=max_length,
    padding='post',
  )


# define the model
def define_model(vocab_size, max_length):
  """Build and compile the CNN sentiment classifier.

  Layer stack, in order: Embedding (100 dimensions) -> Conv1D (32 filters,
  kernel size 8, relu) -> MaxPooling1D (pool size 2) -> Flatten ->
  Dense(10, relu) -> Dense(1, sigmoid). The model is compiled with binary
  cross-entropy loss, the adam optimizer and an accuracy metric.

  Side effects: prints the model summary and asks plot_model to write the
  architecture diagram to 'model.png'.

  Args:
    vocab_size: first argument of the Embedding layer.
    max_length: ``input_length`` of the Embedding layer.

  Returns:
    The compiled Sequential model.
  """
  model = Sequential()
  layer_stack = (
    Embedding(vocab_size, 100, input_length=max_length),
    Conv1D(filters=32, kernel_size=8, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
  )
  for layer in layer_stack:
    model.add(layer)

  # compile, then report the architecture as text and as an image
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.summary()
  plot_model(model, to_file='model.png', show_shapes=True)

  return model

In [16]:
# load the vocabulary (whitespace-separated words written by the vocabulary cell)
vocab_filename = "/content/gdrive/My Drive/NLP_Projects/4/vocab.txt"
vocab = set(load_doc(vocab_filename).split())
# load training data
train_docs, ytrain = load_clean_dataset(vocab, True)
# create the tokenizer
tokenizer = create_tokenizer(train_docs)
# define vocabulary size (+1 on top of the number of distinct words in word_index)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' % vocab_size)
# calculate the maximum sequence length (in words) over the training documents
max_length = max(len(s.split()) for s in train_docs)
print('Maximum length: %d' % max_length)
# encode data
Xtrain = encode_docs(tokenizer, max_length, train_docs)
# define model
model = define_model(vocab_size, max_length)
# fit network; labels are handed over as a NumPy array rather than a plain
# Python list, which is the documented target type for fit()
# NOTE(review): no random seed is set in this notebook, so results may vary between runs
model.fit(Xtrain, array(ytrain), epochs=10, verbose=2)
# save the model
model.save("/content/gdrive/My Drive/NLP_Projects/4/model.h5")

Vocabulary size: 26897
Maximum length: 1319
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1319, 100)         2689700   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1312, 32)          25632     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 656, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 20992)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                209930    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 2,925,273
Trainable params: 2,925,273
Non-trainable params: 0


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
 - 7s - loss: 0.6935 - accuracy: 0.4972
Epoch 2/10
 - 0s - loss: 0.6347 - accuracy: 0.6406
Epoch 3/10
 - 0s - loss: 0.2011 - accuracy: 0.9511
Epoch 4/10
 - 0s - loss: 0.0118 - accuracy: 0.9989
Epoch 5/10
 - 0s - loss: 0.0048 - accuracy: 0.9989
Epoch 6/10
 - 0s - loss: 0.0031 - accuracy: 0.9994
Epoch 7/10
 - 0s - loss: 0.0025 - accuracy: 0.9994
Epoch 8/10
 - 0s - loss: 0.0019 - accuracy: 0.9994
Epoch 9/10
 - 0s - loss: 0.0012 - accuracy: 0.9994
Epoch 10/10
 - 0s - loss: 6.5587e-04 - accuracy: 1.0000


# Predicting Sentiment for New Reviews and Evaluating the model

---


In [0]:
from keras.models import load_model

# classify a review as negative or positive
def predict_sentiment(review, vocab, tokenizer, max_length, model):
  """Classify one review and report the confidence of the chosen label.

  The review is cleaned with clean_doc, encoded and padded with encode_docs,
  and scored by ``model``. Element [0, 0] of the prediction is treated as the
  positive-class score.

  Args:
    review: raw review text.
    vocab: vocabulary used to filter the review's tokens.
    tokenizer: fitted tokenizer used for integer encoding.
    max_length: padded sequence length.
    model: object whose ``predict`` returns a 2-D array of scores.

  Returns:
    (score, label): ``(1 - p, 'NEGATIVE')`` when the positive score ``p``
    rounds to 0, otherwise ``(p, 'POSITIVE')``.
  """
  encoded = encode_docs(tokenizer, max_length, [clean_doc(review, vocab)])
  prob_positive = model.predict(encoded, verbose=0)[0,0]

  is_negative = round(prob_positive) == 0
  if is_negative:
    # report confidence in the negative class instead
    return (1-prob_positive), 'NEGATIVE'
  return prob_positive, 'POSITIVE'

In [24]:
# load and encode the held-out test reviews (files starting with 'cv9')
test_docs, ytest = load_clean_dataset(vocab, False)
Xtest = encode_docs(tokenizer, max_length, test_docs)

# load the model
model = load_model("/content/gdrive/My Drive/NLP_Projects/4/model.h5")
# evaluate model on training dataset; labels are passed as NumPy arrays
# rather than plain Python lists
_, acc = model.evaluate(Xtrain, array(ytrain), verbose=0)
print('Train Accuracy: %f' % (acc*100))
# evaluate model on test dataset
_, acc = model.evaluate(Xtest, array(ytest), verbose=0)
print('Test Accuracy: %f' % (acc*100))

# test positive text
text = 'Everyone will enjoy this film. I love it, recommended!'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, max_length, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))
# test negative text
# NOTE: in the output recorded with this notebook, this clearly negative review
# is still labelled POSITIVE, so short ad-hoc reviews are not classified reliably.
text1 = 'This is a bad movie. Do not watch it. It sucks.'
percent, sentiment = predict_sentiment(text1, vocab, tokenizer, max_length, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text1, sentiment, percent*100))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train Accuracy: 100.000000
Test Accuracy: 85.500002
Review: [Everyone will enjoy this film. I love it, recommended!]
Sentiment: POSITIVE (89.759%)
Review: [This is a bad movie. Do not watch it. It sucks.]
Sentiment: POSITIVE (88.574%)
