In [27]:
import string
from os import listdir
import re
from collections import Counter
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense,Flatten, Embedding
from keras.layers import Conv1D, MaxPool1D
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [10]:
def prepare_data(train_docs, test_docs, mode):
    """Encode train and test documents as bag-of-words matrices.

    A ``Tokenizer`` is fitted on ``train_docs`` only and is then used to
    convert both collections with ``texts_to_matrix`` under the given
    scoring ``mode``.

    Returns:
        The pair ``(Xtrain, Xtest)`` of encoded matrices.
    """
    encoder = Tokenizer()
    encoder.fit_on_texts(train_docs)
    Xtrain, Xtest = (
        encoder.texts_to_matrix(docs, mode=mode)
        for docs in (train_docs, test_docs)
    )
    return Xtrain, Xtest

def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read (opened in text mode).

    Returns:
        The full text of the file.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()

def clean_doc(doc, vocab=None):
    """Split a document into cleaned word tokens.

    The text is split on whitespace, punctuation characters are stripped from
    every token, and a token is kept only if it is purely alphabetic, is not
    an English stop word, and is longer than one character.

    Args:
        doc: Raw document text.
        vocab: Accepted so that callers may pass it, but not used for
            filtering here. It is optional because callers in this notebook
            invoke ``clean_doc(doc)`` with a single argument.

    Returns:
        A list of cleaned tokens.

    NOTE(review): a later cell defines another ``clean_doc`` that returns a
    string instead of a list; once that cell runs it replaces this one.
    """
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    stop_words = set(stopwords.words('english'))
    stripped = (re_punc.sub('', w) for w in doc.split())
    return [
        token for token in stripped
        if token.isalpha() and token not in stop_words and len(token) > 1
    ]

def doc_to_line(filename, vocab):
    """Load a document and return its in-vocabulary tokens as one line.

    Args:
        filename: Path of the document to load.
        vocab: Container of allowed tokens; tokens not in it are dropped.

    Returns:
        The surviving tokens joined by single spaces.
    """
    doc = load_doc(filename)
    # clean_doc's signature takes the vocabulary as its second argument;
    # calling it with only the document raised a TypeError.
    tokens = clean_doc(doc, vocab)
    return ' '.join(w for w in tokens if w in vocab)


def process_docs(directory, vocab, isTrain):
    """Convert the documents of one directory into cleaned text lines.

    Files whose name starts with ``cv9`` form the test split; every other
    file belongs to the training split. Only ``.txt`` files are processed.

    Args:
        directory: Directory containing the review files.
        vocab: Container of allowed tokens, forwarded to ``doc_to_line``.
        isTrain: Truthy to return the training split, falsy for the test
            split.

    Returns:
        A list with one cleaned line per selected document.
    """
    lines = []
    for filename in listdir(directory):
        # The original used the bare name `next` here, which is a no-op, so
        # non-.txt files were never actually skipped.
        if not filename.endswith(".txt"):
            continue
        # Skip training requests for cv9* files and test requests for the rest.
        if bool(isTrain) == filename.startswith('cv9'):
            continue
        path = directory + "/" + filename
        lines.append(doc_to_line(path, vocab))
    return lines

def add_doc_to_vocab(filename, vocab):
    """Add the cleaned tokens of one document to ``vocab`` in place.

    Args:
        filename: Path of the document to load.
        vocab: Object exposing ``update(iterable)``; it receives the tokens.
    """
    doc = load_doc(filename)
    # clean_doc's signature takes the vocabulary as its second argument;
    # calling it with only the document raised a TypeError.
    tokens = clean_doc(doc, vocab)
    vocab.update(tokens)

def load_clean_dataset(vocab, isTrain):
    """Load the negative and positive reviews of one split with labels.

    Negative documents come first and are labelled 0; positive documents
    follow and are labelled 1.

    Returns:
        ``(docs, labels)`` where both lists have the same length.
    """
    negative_lines = process_docs('dataset/movie_review/neg', vocab, isTrain)
    positive_lines = process_docs('dataset/movie_review/pos', vocab, isTrain)
    labels = [0] * len(negative_lines) + [1] * len(positive_lines)
    return negative_lines + positive_lines, labels

def create_tokenizer(lines):
    """Return a new ``Tokenizer`` fitted on the given text lines."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [18]:
def define_model(vocab_size, max_length):
    """Build, compile and summarize the 1-D convolutional classifier.

    The stack is: 100-dimensional embedding -> Conv1D (32 filters, kernel
    size 8, ReLU) -> max pooling (size 2) -> flatten -> Dense(10, ReLU) ->
    Dense(1, sigmoid). It is compiled with binary cross-entropy, the Adam
    optimizer and the accuracy metric, and its summary is printed.

    Args:
        vocab_size: Number of rows of the embedding table.
        max_length: Length of each input sequence.

    Returns:
        The compiled model.
    """
    layers = [
        Embedding(vocab_size, 100, input_length=max_length),
        Conv1D(filters=32, kernel_size=8, activation='relu'),
        MaxPool1D(pool_size=2),
        Flatten(),
        Dense(10, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    model.summary()
    return model

def evaluate_mode(Xtrain, ytrain, Xtest, ytest, vocab_size=None, n_repeats=30, epochs=10):
    """Train a fresh model several times and collect its test accuracies.

    Args:
        Xtrain: 2-D array of training inputs; its second dimension is used
            as the sequence length passed to ``define_model``.
        ytrain: Training labels.
        Xtest: 2-D array of test inputs.
        ytest: Test labels.
        vocab_size: Size of the embedding vocabulary. When omitted it is
            inferred as the largest value found in ``Xtrain``/``Xtest`` plus
            one. NOTE(review): the inference assumes the inputs are integer
            word indices such as those produced by ``encode_docs`` - confirm
            against the caller.
        n_repeats: Number of independent train/evaluate runs.
        epochs: Training epochs per run.

    Returns:
        A list with one test accuracy per run.
    """
    # define_model needs both the vocabulary size and the sequence length;
    # the original passed only Xtrain.shape[1] and raised a TypeError.
    max_length = Xtrain.shape[1]
    if vocab_size is None:
        vocab_size = int(max(np.max(Xtrain), np.max(Xtest))) + 1
    scores = []
    for _ in range(n_repeats):
        model = define_model(vocab_size, max_length)
        model.fit(Xtrain, ytrain, epochs=epochs, verbose=2)
        _, acc = model.evaluate(Xtest, ytest, verbose=0)
        scores.append(acc)
    return scores

def encode_docs(tokenizer, max_length, docs):
    """Turn documents into integer sequences padded to ``max_length``.

    Each document is converted with ``tokenizer.texts_to_sequences`` and the
    result is padded at the end (``padding='post'``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(docs),
        maxlen=max_length,
        padding='post',
    )

In [11]:
# Load the saved vocabulary as a set of whitespace-separated tokens.
vocab_filename = 'vocab.txt'
vocab = set(load_doc(vocab_filename).split())

# Build the training split and convert its labels to an array.
train_docs, y_train = load_clean_dataset(vocab, True)
y_train = np.asarray(y_train)

# Fit the tokenizer on the training text; the embedding size is one more
# than the number of distinct words it indexed.
tokenizer = create_tokenizer(train_docs)
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

26897


In [26]:
# Every review is padded to the word count of the longest training document.
max_length = max(len(doc.split()) for doc in train_docs)
Xtrain = encode_docs(tokenizer, max_length, train_docs)

# Train the classifier and persist it for the evaluation cells.
model = define_model(vocab_size, max_length)
model.fit(Xtrain, np.asarray(y_train), epochs=10, verbose=2)
model.save('model.h5')

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 1319, 100)         2689700   
                                                                 
 conv1d_1 (Conv1D)           (None, 1312, 32)          25632     
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 656, 32)           0         
 g1D)                                                            
                                                                 
 flatten_1 (Flatten)         (None, 20992)             0         
                                                                 
 dense_2 (Dense)             (None, 10)                209930    
                                                                 
 dense_3 (Dense)             (None, 1)                 11        
                                                      

  saving_api.save_model(


# Evaluation

In [30]:
# Rebuild both splits; labels are converted to arrays straight away.
train_docs, y_train = load_clean_dataset(vocab, True)
test_docs, y_test = load_clean_dataset(vocab, False)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

# The tokenizer and the padding length are derived from the training split only.
tokenizer = create_tokenizer(train_docs)
vocab_size = len(tokenizer.word_index) + 1
max_length = max(len(doc.split()) for doc in train_docs)

Xtrain = encode_docs(tokenizer, max_length, train_docs)
Xtest = encode_docs(tokenizer, max_length, test_docs)

In [31]:
# Reload the saved model and report accuracy on both splits.
model = load_model('model.h5')

# evaluate() yields the loss followed by the accuracy metric; only the
# accuracy (second value) is reported.
acc = model.evaluate(Xtrain, y_train, verbose=0)[1]
print(f"Train accuracy {acc}")

acc = model.evaluate(Xtest, y_test, verbose=0)[1]
print(f"Test accuracy {acc}")


Train accuracy 1.0
Test accuracy 0.8799999952316284


In [36]:
def predict_sentiment(review, vocab, tokenizer, max_length, model):
    """Classify one review and report the score of the predicted class.

    The review is cleaned with ``clean_doc(review, vocab)`` (the variant that
    returns a single space-joined string), encoded and padded with
    ``encode_docs``, and passed to ``model.predict``.

    Returns:
        ``(score, label)`` where ``label`` is ``"POSITIVE"`` or
        ``"NEGATIVE"``. For a negative prediction the score is one minus the
        model output; otherwise it is the model output itself.
    """
    cleaned = clean_doc(review, vocab)
    batch = encode_docs(tokenizer, max_length, [cleaned])
    percent_pos = model.predict(batch, verbose=0)[0, 0]
    if round(percent_pos) != 0:
        return percent_pos, "POSITIVE"
    return (1 - percent_pos), "NEGATIVE"

def clean_doc(doc, vocab):
    """Return the in-vocabulary words of ``doc`` joined by single spaces.

    The text is split on whitespace, punctuation characters are removed from
    each piece, and only pieces contained in ``vocab`` are kept.

    NOTE(review): this redefines the ``clean_doc`` declared in an earlier
    cell, which returns a list of tokens rather than a string. Once this
    cell has run, the earlier helpers that call ``clean_doc`` pick up this
    version instead.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    kept = []
    for piece in doc.split():
        word = piece.translate(strip_punctuation)
        if word in vocab:
            kept.append(word)
    return ' '.join(kept)


In [37]:
# Score a short sample review with the loaded model.
text = "Really bad movie"
predict_sentiment(
    text,
    vocab=vocab,
    tokenizer=tokenizer,
    max_length=max_length,
    model=model,
)

(0.9917069831863046, 'NEGATIVE')