# Project By Varun Gehlot

## Objective:

### To develop an MLP neural-network bag-of-words model for sentiment analysis of movie reviews

In [1]:

from os import listdir
import numpy as np
from nltk.corpus import stopwords
from string import punctuation
from collections import Counter
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense

def load_doc(filename):
    """Return the entire contents of the text file ``filename`` as one string.

    The file is opened in read-only text mode and is closed before returning.
    """
    with open(filename, mode='r') as handle:
        return handle.read()

def clean_doc(doc):
    """Split ``doc`` on whitespace and return the list of tokens worth keeping.

    A token is kept only if it
      * consists purely of alphabetic characters,
      * is longer than one character, and
      * is not an NLTK English stopword.

    Tokens are compared as-is (no lower-casing is applied), and their
    original order is preserved.
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # would otherwise be scanned once for every token in the document.
    stop_words = set(stopwords.words('english'))
    return [
        token
        for token in doc.split()
        # isalpha() already rejects every punctuation-only token, so no
        # separate check against string.punctuation is required.
        if token.isalpha() and len(token) > 1 and token not in stop_words
    ]

'''
def add_doc_to_vocab(filename,vocab):
    doc = load_doc(filename)
    tokens = clean_doc(doc)
    # update() updates the Counter dictionary
    vocab.update(tokens)
'''   
# loading, cleaning, filtering out tokens not in the vocabulary, then returning the document as a string of white space separated tokens 
def doc_to_line(filename,vocab):
    """Turn the document at ``filename`` into one space-separated line.

    The file is loaded with ``load_doc`` and tokenised with ``clean_doc``;
    only tokens that are members of ``vocab`` are kept, in document order.
    """
    kept = (token for token in clean_doc(load_doc(filename)) if token in vocab)
    return ' '.join(kept)
    
def proccess_doc(directory,vocab, is_train):
    """Convert the selected files of ``directory`` into vocabulary-filtered lines.

    Files whose name starts with ``'cv9'`` (cv900 to cv999, about 10% of the
    data) form the test split. When ``is_train`` is truthy those files are
    skipped and every other file is processed; when it is falsy only those
    files are processed.

    Returns a list with one ``doc_to_line`` result per selected file, in the
    order produced by ``listdir``.
    """
    def selected(name):
        # Held-out (test) files are identified purely by their name prefix.
        held_out = name.startswith('cv9')
        return not held_out if is_train else held_out

    return [
        doc_to_line(directory + '/' + name, vocab)
        for name in listdir(directory)
        if selected(name)
    ]

# loads documents and labels them 0 and 1
def load_clean_dataset(vocab, is_train):
    """Load the negative and positive reviews and build their class labels.

    ``vocab`` and ``is_train`` are forwarded to ``proccess_doc`` to filter the
    tokens and to choose between the training and the test split.

    Returns ``(docs, labels)`` where ``docs`` holds the negative reviews
    followed by the positive ones, and ``labels`` is a parallel list with
    0 for every negative review and 1 for every positive review.
    """
    # Each variable must read from the directory of the same name: with the
    # directories swapped, positive reviews were labelled 0 and negative
    # reviews 1, inverting the meaning of every prediction.
    neg = proccess_doc('txt_sentoken/neg', vocab, is_train)
    pos = proccess_doc('txt_sentoken/pos', vocab, is_train)
    docs = neg + pos
    labels = [0] * len(neg) + [1] * len(pos)
    return docs, labels
    
def save_file(lines,filename):
    """Write the strings in ``lines`` to ``filename``, one per line.

    Items are separated by a newline; no trailing newline is written. An
    existing file at ``filename`` is overwritten.
    """
    # Build the text first so a bad item fails before the file is truncated.
    joined = '\n'.join(lines)
    with open(filename, mode='w') as out:
        out.write(joined)
    
def create_tokenizer(lines):
    """Return a new Keras ``Tokenizer`` (default settings) fitted on ``lines``."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted
    
    
def define_model(n_words):
    """Build, compile and summarise the MLP classifier.

    ``n_words`` is the length of each input vector. The network is a
    50-unit ReLU hidden layer followed by a single sigmoid output unit,
    compiled with binary cross-entropy loss, the Adam optimizer and an
    accuracy metric. The model summary is printed before the compiled
    model is returned.
    """
    hidden = Dense(50, input_shape = (n_words,), activation = 'relu')
    output = Dense(1, activation = 'sigmoid')
    network = Sequential([hidden, output])
    network.compile(
        loss = 'binary_crossentropy',
        optimizer = 'adam',
        metrics = ['accuracy'],
    )
    network.summary()
    return network


# Read the saved vocabulary file and keep its whitespace-separated tokens
# as a set, so membership tests while filtering documents are fast.
vocab = set(load_doc('Vocabulary.txt').split())

'''
# loading all training reviews
docs, labels = load_clean_dataset(vocab)
'''


# Load the reviews twice: once for the training split (is_train=True) and
# once for the held-out test split (is_train=False).
train_docs, y_train = load_clean_dataset(vocab, True)
test_docs, y_test = load_clean_dataset(vocab, False)

# Convert the label lists to NumPy arrays before passing them to the model.
y_train = np.array(y_train)
y_test = np.array(y_test)

# Fit the tokenizer on the training documents only; the test documents are
# encoded with this same tokenizer below.
tokenizer = create_tokenizer(train_docs)

# Encode every document as a bag-of-words vector in 'binary' mode.
Xtrain = tokenizer.texts_to_matrix(train_docs,mode= 'binary')
Xtest = tokenizer.texts_to_matrix(test_docs, mode= 'binary')

#print(Xtrain.shape, Xtest.shape)

# The number of columns of the encoded matrix is the model's input size.
# NOTE(review): taken from Xtest; presumably identical to Xtrain.shape[1]
# because both matrices come from the same tokenizer - confirm.
n_words = Xtest.shape[1]
model = define_model(n_words)

# Train the network on the training split for 10 epochs.
model.fit(Xtrain, y_train, epochs = 10, verbose = 1)

# Evaluate on the held-out test split and report accuracy as a whole-number
# percentage.
loss, acc = model.evaluate(Xtest, y_test, verbose = 0) 
print(f'Accuracy: {round(acc * 100)} %')

'''
# summarize what we have
print(len(docs),len(labels))
'''

'''
# adding all docs to vocab
proccess_doc('txt_sentoken/pos',vocab)
proccess_doc('txt_sentoken/neg',vocab)

print(len(vocab))

# keeeping tokens with minimum occurances
min_occurances = 2
tokens = [w for w,x in vocab.items() if x > min_occurances]
print(len(tokens))

#save_file(tokens,'Vocabulary.txt')
'''




Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                905700    
                                                                 
 dense_1 (Dense)             (None, 1)                 51        
                                                                 
Total params: 905751 (3.46 MB)
Trainable params: 905751 (3.46 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 94 %


"\n# adding all docs to vocab\nproccess_doc('txt_sentoken/pos',vocab)\nproccess_doc('txt_sentoken/neg',vocab)\n\nprint(len(vocab))\n\n# keeeping tokens with minimum occurances\nmin_occurances = 2\ntokens = [w for w,x in vocab.items() if x > min_occurances]\nprint(len(tokens))\n\n#save_file(tokens,'Vocabulary.txt')\n"