In [1]:
import string
from os import listdir
import re
from collections import Counter
from nltk.corpus import stopwords

## Create Vocabulary

In [2]:
def load_doc(filename):
    file=open(filename,'r')
    text=file.read()
    file.close()
    return text

def clean_doc(doc):
    tokens=doc.split()
    re_punc= re.compile('[%s]' % re.escape(string.punctuation))
    tokens=[re_punc.sub('', w) for w in tokens]
    tokens=[word for word in tokens if word.isalpha()]
    stop_words=set(stopwords.words('english'))
    tokens=[w for w in tokens if not w in stop_words]
    tokens=[word for word in tokens if len(word)> 1]
    return tokens

def add_doc_to_vocab(filename,vocab):
    doc=load_doc(filename)
    tokens=clean_doc(doc)
    vocab.update(tokens)

def process_docs(directory,vocab):
    for filename in listdir(directory):
        if not filename.endswith(".txt"):
            next
        path=directory+"/"+filename
        add_doc_to_vocab(path,vocab)

def save_list(lines, filename):
    data= '\n'.join(lines)
    file=open(filename,'w')
    file.write(data)
    file.close()

In [3]:
vocab=Counter()
process_docs('dataset/movie_review/neg', vocab)
print(len(vocab))
process_docs('dataset/movie_review/pos', vocab)
print(len(vocab))
min_occurance=2
tokens= [ k for k,c in vocab.items() if c >= min_occurance]
print(len(tokens))

save_list(tokens,'vocab.txt')

32010
46557
27139


## Use vocabulary to prepare dataset

In [4]:
def doc_to_line(filename, vocab):
    doc= load_doc(filename)
    tokens=clean_doc(doc)
    tokens=[w for w in tokens if w in vocab]
    return ' '.join(tokens) 

def process_docs(directory,vocab):
    lines=[]
    for filename in listdir(directory):
        if not filename.endswith(".txt"):
            next
        path=directory+"/"+filename
        line=doc_to_line(path, vocab)
        lines.append(line)
    return lines

In [5]:
vocab_filename='vocab.txt'
vocab=load_doc(vocab_filename)
vocab=vocab.split()
vocab=set(vocab)
negative_lines=process_docs('dataset/movie_review/neg', vocab)
save_list(negative_lines, 'negative.txt')
positive_lines=process_docs('dataset/movie_review/pos', vocab)
save_list(positive_lines, 'positive.txt')

## ML part :))

In [6]:
from keras.preprocessing.text import Tokenizer

def load_clean_dataset(vocab, isTrain):
    neg=process_docs('dataset/movie_review/neg', vocab, isTrain)
    pos=process_docs('dataset/movie_review/pos', vocab, isTrain)
    docs=neg+pos
    labels= [0 for _ in range(len(neg))] + [1 for _ in range(len(pos))]
    return docs, labels

def process_docs(directory,vocab, isTrain):
    lines=[]
    for filename in listdir(directory):
        if not filename.endswith(".txt"):
            next
        if isTrain and filename.startswith('cv9'):
            continue
        if not isTrain and not filename.startswith('cv9'):
            continue
        path=directory+"/"+filename
        line=doc_to_line(path, vocab)
        lines.append(line)
    return lines

def create_tokenizer(lines):
    tokenizer=Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer




In [7]:
train_docs, ytrain=load_clean_dataset(vocab, True)
test_docs, ytest=load_clean_dataset(vocab, False)
tokenizer=create_tokenizer(train_docs)
Xtrain=tokenizer.texts_to_matrix(train_docs, mode='freq')
Xtest =tokenizer.texts_to_matrix(test_docs, mode='freq')

In [8]:
# from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense

n_words=Xtest.shape[1]

def define_model(n_words):
    model=Sequential()
    model.add(Dense(50, input_shape=(n_words,), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    # plot_model(model,to_file='model.png',show_shape=True)
    return model

import numpy as np
y_train = np.array(ytrain)
y_test = np.array(ytest)


In [9]:
model=define_model(n_words)
model.fit(Xtrain, y_train, epochs=10, verbose=2)
loss, acc= model.evaluate(Xtest, y_test, verbose=0)
print(f"Test accuracy {acc * 100}")



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                1344900   
                                                                 
 dense_1 (Dense)             (None, 1)                 51        
                                                                 
Total params: 1344951 (5.13 MB)
Trainable params: 1344951 (5.13 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10


57/57 - 1s - loss: 0.6919 - accuracy: 0.5994 - 1s/epoch - 24ms/step
Epoch 2/10
57/57 - 1s - loss: 0.6833 - accuracy: 0.7267 - 618ms/epoch - 11ms/step
Epoch 3/10
57/57 - 1s - loss: 0.6655 - accuracy: 0.8539 - 636ms/epoch - 11ms/step
Epoch 4/10
57/57 - 1s - loss: 0.6372 - accuracy: 0.8994 - 610ms/epoch - 11ms/step
Epoch 5/10
57/57 - 1s - loss: 0.5997 - accuracy: 0.9283 - 620ms/epoch - 11ms/step
Epo

## Compare scoring methods

In [10]:
def prepare_data(train_docs, test_docs, mode):
    tokenizer=Tokenizer()
    tokenizer.fit_on_texts(train_docs)
    Xtrain=tokenizer.texts_to_matrix(train_docs, mode=mode)
    Xtest =tokenizer.texts_to_matrix(test_docs, mode=mode)
    return Xtrain, Xtest

def evaluate_mode(Xtrain, ytrain, Xtest, ytest):
    scores=[]
    n_repeats=30
    n_words=Xtrain.shape[1]
    for i in range(n_repeats):
        model=define_model(n_words)
        model.fit(Xtrain, y_train, epochs=10, verbose=2)
        loss, acc= model.evaluate(Xtest, y_test, verbose=0)
        scores.append(acc)
    return scores

In [11]:
from pandas import DataFrame
modes=['binary','count','tfidf', 'freq']
results=DataFrame()
for mode in modes:
    Xtrain, Xtest= prepare_data(train_docs, test_docs, mode)
    results[mode]=evaluate_mode(Xtrain,y_train, Xtest, y_test)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 50)                1344900   
                                                                 
 dense_3 (Dense)             (None, 1)                 51        
                                                                 
Total params: 1344951 (5.13 MB)
Trainable params: 1344951 (5.13 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
57/57 - 1s - loss: 0.4699 - accuracy: 0.7894 - 1s/epoch - 21ms/step
Epoch 2/10
57/57 - 1s - loss: 0.0645 - accuracy: 0.9911 - 640ms/epoch - 11ms/step
Epoch 3/10
57/57 - 1s - loss: 0.0151 - accuracy: 1.0000 - 654ms/epoch - 11ms/step
Epoch 4/10
57/57 - 1s - loss: 0.0073 - accuracy: 1.0000 - 699ms/epoch - 12ms/step
Epoch 5/10
57/57 - 1s - loss: 0.0042 - accuracy: 1.0000 - 667ms/epoch - 12ms/step
Epoch

In [12]:
print(results.describe())


          binary      count      tfidf      freq
count  30.000000  30.000000  30.000000  30.00000
mean    0.922333   0.894833   0.873333   0.86500
std     0.009714   0.010544   0.011090   0.02205
min     0.900000   0.865000   0.845000   0.75500
25%     0.915000   0.890000   0.866250   0.86500
50%     0.925000   0.895000   0.875000   0.87000
75%     0.930000   0.900000   0.880000   0.87500
max     0.940000   0.915000   0.890000   0.88000


In [14]:
def predict_sentiment(review, vocab, tokenizer, model):
    tokens= clean_doc(review)
    tokens=[w  for w in tokens if w in vocab]
    line=' '.join(tokens)
    encoded=tokenizer.texts_to_matrix([line],mode='binary')
    yhat=model.predict(encoded, verbose=0)
    return yhat
model=define_model(n_words)
Xtrain, Xtest= prepare_data(train_docs, test_docs, 'binary')
model.fit(Xtrain, y_train, epochs=10, verbose=2)


Model: "sequential_122"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_244 (Dense)           (None, 50)                1344900   
                                                                 
 dense_245 (Dense)           (None, 1)                 51        
                                                                 
Total params: 1344951 (5.13 MB)
Trainable params: 1344951 (5.13 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
57/57 - 1s - loss: 0.4755 - accuracy: 0.7833 - 1s/epoch - 20ms/step
Epoch 2/10
57/57 - 1s - loss: 0.0591 - accuracy: 0.9956 - 619ms/epoch - 11ms/step
Epoch 3/10
57/57 - 1s - loss: 0.0154 - accuracy: 1.0000 - 634ms/epoch - 11ms/step
Epoch 4/10
57/57 - 1s - loss: 0.0073 - accuracy: 1.0000 - 657ms/epoch - 12ms/step
Epoch 5/10
57/57 - 1s - loss: 0.0040 - accuracy: 1.0000 - 652ms/epoch - 11ms/step
Epo

<keras.src.callbacks.History at 0x1d547ad2c20>

In [15]:

text=' Best movie ever! Pretty good movie'
predict_sentiment(text, vocab, tokenizer, model)

array([[0.5110105]], dtype=float32)