In [12]:
import string
from os import listdir
import re
from collections import Counter
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense,Flatten, Embedding, Input, Dropout
from keras.layers import Conv1D, MaxPool1D
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.layers import concatenate
from keras.models import Model
import numpy as np

In [9]:
def prepare_data(train_docs, test_docs, mode):
    """Encode train and test documents with a tokenizer fitted on the train set.

    A new ``Tokenizer`` is created and fitted on ``train_docs`` only; both
    document collections are then converted with ``texts_to_matrix``.

    Args:
        train_docs: Documents used to fit the tokenizer; also encoded.
        test_docs: Documents encoded with the train-fitted tokenizer.
        mode: Forwarded unchanged as the ``mode`` keyword of ``texts_to_matrix``.

    Returns:
        Tuple ``(Xtrain, Xtest)`` holding the two encoded results.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(train_docs)
    # Train is encoded before test, matching the original call order.
    Xtrain, Xtest = (
        fitted.texts_to_matrix(docs, mode=mode) for docs in (train_docs, test_docs)
    )
    return Xtrain, Xtest

def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to open in text read mode.

    Returns:
        The full text of the file.

    Raises:
        OSError: If the file cannot be opened or read (for example
            ``FileNotFoundError`` when the path does not exist).
    """
    # The context manager closes the handle even if read() raises, which the
    # previous open()/read()/close() sequence did not guarantee.
    with open(filename, 'r') as file:
        return file.read()

def clean_doc(doc, vocab):
    """Split a document into cleaned word tokens.

    The text is split on whitespace, ASCII punctuation is stripped from each
    piece, and a piece is kept only if it is purely alphabetic, is not an
    English stop word, and is longer than one character.

    Args:
        doc: Raw document text.
        vocab: Not used by this function; accepted so all callers share one
            signature.

    Returns:
        List of surviving tokens in their original order.
    """
    strip_punct = re.compile('[%s]' % re.escape(string.punctuation))
    english_stops = set(stopwords.words('english'))

    cleaned = []
    for raw in doc.split():
        word = strip_punct.sub('', raw)
        if not word.isalpha():
            continue
        # Comparison is case-sensitive: tokens are not lower-cased first.
        if word in english_stops:
            continue
        if len(word) <= 1:
            continue
        cleaned.append(word)
    return cleaned

def doc_to_line(filename, vocab):
    """Load a file, clean it, and return its in-vocabulary tokens as one line.

    Args:
        filename: Path handed to ``load_doc``.
        vocab: Container of allowed tokens; tokens not in it are dropped.

    Returns:
        The retained tokens joined by single spaces (empty string if none).
    """
    kept = []
    for token in clean_doc(load_doc(filename), vocab):
        if token in vocab:
            kept.append(token)
    return ' '.join(kept)


def process_docs(directory,vocab, isTrain):
    """Convert every review file of one split in a directory into a token line.

    Files whose name starts with ``'cv9'`` form the test split; all other
    ``.txt`` files form the training split.

    Args:
        directory: Directory whose entries are listed with ``listdir``.
        vocab: Vocabulary forwarded to ``doc_to_line``.
        isTrain: Truthy to collect the training split, falsy for the test split.

    Returns:
        List with one ``doc_to_line`` result per selected file, in ``listdir``
        order.
    """
    lines = []
    for filename in listdir(directory):
        # The original used a bare `next` here, which is a no-op expression,
        # so non-.txt entries were processed instead of skipped.
        if not filename.endswith(".txt"):
            continue
        is_test_file = filename.startswith('cv9')
        # Keep only files belonging to the requested split.
        if bool(isTrain) == is_test_file:
            continue
        path = directory + "/" + filename
        lines.append(doc_to_line(path, vocab))
    return lines

def add_doc_to_vocab(filename,vocab):
    """Add the cleaned tokens of one file to ``vocab`` in place.

    Args:
        filename: Path handed to ``load_doc``.
        vocab: Object with an ``update`` method that receives the token list
            (presumably a ``collections.Counter`` -- confirm against callers).

    Returns:
        None; ``vocab`` is mutated.
    """
    vocab.update(clean_doc(load_doc(filename), vocab))

def load_clean_dataset(vocab, isTrain):
    """Load the negative and positive review lines together with their labels.

    Negative reviews are read from ``dataset/movie_review/neg`` and positive
    ones from ``dataset/movie_review/pos``; negatives come first in the result.

    Args:
        vocab: Vocabulary forwarded to ``process_docs``.
        isTrain: Split selector forwarded to ``process_docs``.

    Returns:
        Tuple ``(docs, labels)`` where ``labels`` holds ``0`` for each negative
        document followed by ``1`` for each positive document.
    """
    negative_docs = process_docs('dataset/movie_review/neg', vocab, isTrain)
    positive_docs = process_docs('dataset/movie_review/pos', vocab, isTrain)
    labels = [0] * len(negative_docs) + [1] * len(positive_docs)
    return negative_docs + positive_docs, labels

def create_tokenizer(lines):
    """Return a new ``Tokenizer`` fitted on the given text lines.

    Args:
        lines: Texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [16]:
def _conv_channel(vocab_size, max_length, kernel_size):
    """Build one input -> embedding -> conv -> dropout -> max-pool -> flatten branch."""
    channel_input = Input(shape=(max_length,))
    embedded = Embedding(vocab_size, 100)(channel_input)
    convolved = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedded)
    dropped = Dropout(0.5)(convolved)
    pooled = MaxPool1D(pool_size=2)(dropped)
    return channel_input, Flatten()(pooled)


def define_model(vocab_size, max_length, kernel_sizes=(4, 6, 6)):
    """Build and compile the multi-channel 1D-CNN binary classifier.

    One independent channel (own input and own embedding) is created per entry
    of ``kernel_sizes``. The flattened channel outputs are concatenated and fed
    through ``Dense(10, relu)`` and ``Dense(1, sigmoid)``. The model is compiled
    with binary cross-entropy, the Adam optimizer and an accuracy metric, and
    its summary is printed.

    Args:
        vocab_size: First argument of each ``Embedding`` layer.
        max_length: Length of every input sequence (``Input(shape=(max_length,))``).
        kernel_sizes: Convolution kernel size for each channel. The default
            reproduces the three channels this notebook was trained with.

    Returns:
        The compiled ``Model``; it expects one input array per channel.

    Raises:
        ValueError: If ``kernel_sizes`` is empty.
    """
    # NOTE(review): the default repeats kernel size 6 for channels 2 and 3.
    # That looks like a copy-paste slip (a distinct size such as 8 may have been
    # intended), but it is kept so the architecture stays unchanged -- confirm.
    kernel_sizes = tuple(kernel_sizes)
    if not kernel_sizes:
        raise ValueError("kernel_sizes must contain at least one kernel size")

    channels = [_conv_channel(vocab_size, max_length, size) for size in kernel_sizes]
    channel_inputs = [channel_input for channel_input, _ in channels]
    flattened = [flat for _, flat in channels]

    # concatenate() needs at least two tensors; a single channel is used as-is.
    merged = flattened[0] if len(flattened) == 1 else concatenate(flattened)
    dense1 = Dense(10, activation='relu')(merged)
    outputs = Dense(1, activation='sigmoid')(dense1)

    model = Model(inputs=channel_inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model


def evaluate_mode(Xtrain, ytrain, Xtest, ytest, vocab_size=None, n_repeats=30, epochs=10):
    """Train a fresh model several times and collect its test accuracies.

    Args:
        Xtrain: 2-D array of training sequences; ``Xtrain.shape[1]`` is used as
            the model's ``max_length``.
        ytrain: Training labels.
        Xtest: 2-D array of test sequences with the same width as ``Xtrain``.
        ytest: Test labels.
        vocab_size: Embedding vocabulary size passed to ``define_model``. When
            ``None`` it is set to the largest value found in ``Xtrain`` and
            ``Xtest`` plus one (assumes the arrays hold integer word indices --
            confirm against the caller).
        n_repeats: Number of independent train/evaluate rounds.
        epochs: Training epochs per round.

    Returns:
        List with the test accuracy of each round.
    """
    # The original called define_model(n_words) without max_length, which raises
    # TypeError, and passed a single array to a model with several inputs.
    max_length = Xtrain.shape[1]
    if vocab_size is None:
        vocab_size = int(max(Xtrain.max(), Xtest.max())) + 1

    scores = []
    for _ in range(n_repeats):
        model = define_model(vocab_size, max_length)
        # Every channel receives the same encoded documents.
        n_inputs = len(model.inputs)
        model.fit([Xtrain] * n_inputs, ytrain, epochs=epochs, verbose=2)
        _, acc = model.evaluate([Xtest] * n_inputs, ytest, verbose=0)
        scores.append(acc)
    return scores

def encode_docs(tokenizer, max_length, docs):
    """Turn documents into integer sequences padded to a fixed length.

    Args:
        tokenizer: Object providing ``texts_to_sequences``.
        max_length: Forwarded as ``maxlen`` to ``pad_sequences``.
        docs: Documents to encode.

    Returns:
        The result of ``pad_sequences`` called with ``padding='post'``.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(docs), maxlen=max_length, padding='post'
    )

In [10]:
# Load the saved vocabulary file and split its contents on whitespace into a set.
vocab_filename='vocab.txt'
vocab=load_doc(vocab_filename)
vocab=set(vocab.split())
# Build the training documents and labels, then fit a tokenizer on the documents.
train_docs, y_train= load_clean_dataset(vocab, True)
tokenizer= create_tokenizer(train_docs)
# One more than the number of distinct words the tokenizer indexed
# (presumably leaves index 0 free for padding -- confirm against the Keras docs).
vocab_size=len(tokenizer.word_index) + 1
print(vocab_size)
y_train=np.asarray(y_train)

26897


In [17]:
# Longest training document, measured in whitespace-separated tokens.
max_length= max([len(s.split()) for s in train_docs]) 
# NOTE(review): the bare expression below has no visible effect because it is
# not the last line of the cell.
max_length
Xtrain= encode_docs(tokenizer, max_length, train_docs)
model= define_model(vocab_size, max_length)
# The same padded array is supplied to each of the model's three inputs.
model.fit([Xtrain, Xtrain, Xtrain], np.asarray(y_train), epochs=10, verbose=2)
# Written to 'model.h5'; the evaluation cell reloads the model from this path.
model.save('model.h5')


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_10 (InputLayer)       [(None, 1319)]               0         []                            
                                                                                                  
 input_11 (InputLayer)       [(None, 1319)]               0         []                            
                                                                                                  
 input_12 (InputLayer)       [(None, 1319)]               0         []                            
                                                                                                  
 embedding_9 (Embedding)     (None, 1319, 100)            2689700   ['input_10[0][0]']            
                                                                                           

  saving_api.save_model(


# Evaluation

In [18]:
# Rebuild the train split and build the test split. The train-side values
# (train_docs, tokenizer, vocab_size, max_length, Xtrain) repeat what the
# earlier cells computed, so this cell does not rely on them having been run.
train_docs, y_train= load_clean_dataset(vocab, True)
test_docs, y_test= load_clean_dataset(vocab, False)
# The tokenizer is fitted on the training documents only.
tokenizer=create_tokenizer(train_docs)
vocab_size=len(tokenizer.word_index) +1 
# Test documents are padded to the training maximum length as well.
max_length= max([len(s.split()) for s in train_docs]) 
Xtrain= encode_docs(tokenizer, max_length, train_docs)
Xtest= encode_docs(tokenizer, max_length, test_docs)
y_train=np.asarray(y_train)
y_test=np.asarray(y_test)

In [20]:
# Reload the model from 'model.h5' and report accuracy on both splits.
model=load_model('model.h5')
# evaluate() returns (loss, accuracy); the loss is discarded.
_, acc= model.evaluate([Xtrain,Xtrain, Xtrain], y_train, verbose=0)
print(f"Train accuracy {acc}")
_, acc= model.evaluate([Xtest,Xtest,Xtest], y_test, verbose=0)
print(f"Test accuracy {acc}")


Train accuracy 0.9972222447395325
Test accuracy 0.8700000047683716


In [24]:
def predict_sentiment(review, vocab, tokenizer, max_length, model):
    """Classify one review and report the score of the predicted class.

    The review is cleaned with ``clean_doc``, encoded with ``encode_docs`` and
    passed three times (once per model input) to ``model.predict``.

    Args:
        review: Raw review text.
        vocab: Vocabulary forwarded to ``clean_doc``.
        tokenizer: Tokenizer forwarded to ``encode_docs``.
        max_length: Padding length forwarded to ``encode_docs``.
        model: Object whose ``predict`` returns an array indexable as ``[0, 0]``.

    Returns:
        Tuple ``(score, label)``. When the prediction rounds to 0 the label is
        ``"NEGATIVE"`` and the score is ``1 - prediction``; otherwise the label
        is ``"POSITIVE"`` and the score is the prediction itself.
    """
    cleaned = clean_doc(review, vocab)
    encoded = encode_docs(tokenizer, max_length, [cleaned])
    prediction = model.predict([encoded] * 3, verbose=0)
    positive_score = prediction[0, 0]

    # round() decides the class, so its tie-breaking at exactly 0.5 is preserved.
    is_negative = round(positive_score) == 0
    if is_negative:
        return (1 - positive_score), "NEGATIVE"
    return positive_score, "POSITIVE"

def clean_doc(doc, vocab):
    """Strip punctuation from a text and keep only in-vocabulary words.

    Args:
        doc: Raw text; split on whitespace.
        vocab: Container of allowed words.

    Returns:
        A single string with the retained words joined by spaces.
    """
    # NOTE(review): this redefines the clean_doc declared earlier in the
    # notebook, which returns a *list* of tokens and applies stop-word and
    # length filters. Once this cell has run, doc_to_line and add_doc_to_vocab
    # receive a string instead of a list if they are called again.
    punct = re.compile('[%s]' % re.escape(string.punctuation))
    kept = []
    for raw in doc.split():
        word = punct.sub('', raw)
        if word in vocab:
            kept.append(word)
    return ' '.join(kept)


In [25]:
# Example: score a short review with the loaded model; the call is the cell's
# last expression, so its (score, label) tuple is shown as the cell output.
text= "Really bad movie"
predict_sentiment(text, vocab, tokenizer, max_length, model)

(0.6401509642601013, 'NEGATIVE')