In [84]:
from os import listdir
import nltk
# nltk.download()
from collections import Counter
from nltk.corpus import stopwords
import string
import re
import tensorflow as tf
import keras
#from keras.preprocessing.text import Tokenizer

Data Preprocessing

In [85]:
# load the doc into memory
def load_doc(filename, encoding=None):
    """Read a whole text file and return its contents as one string.

    Args:
        filename: Path of the file to read.
        encoding: Optional text encoding passed to ``open``. The default
            ``None`` keeps the platform default encoding.

    Returns:
        The full text of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r', encoding=encoding) as handle:
        return handle.read()

In [86]:
# turn a doc into clean tokens
def clean_doc(doc):
    """Split ``doc`` on whitespace and return the tokens that survive filtering.

    Every ``string.punctuation`` character is removed from each piece.
    A piece is kept only when the result is purely alphabetic, is not in
    NLTK's English stop-word list, and is longer than one character.
    Case is not changed.

    Args:
        doc: Raw document text.

    Returns:
        List of cleaned tokens in their original order.
    """
    pieces = doc.split()
    punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))
    english_stop_words = set(stopwords.words('english'))

    cleaned = []
    for raw in pieces:
        word = punctuation_pattern.sub('', raw)
        # Drop pieces that are empty or contain digits/symbols after stripping.
        if not word.isalpha():
            continue
        if word in english_stop_words:
            continue
        # Single letters carry no useful signal.
        if len(word) <= 1:
            continue
        cleaned.append(word)
    return cleaned

In [87]:
# save list to a file
def save_list(lines, filename, encoding=None):
    """Write ``lines`` to ``filename``, one item per line.

    Items are joined with ``'\\n'``; no trailing newline is written.
    An existing file is overwritten.

    Args:
        lines: Iterable of strings to write.
        filename: Destination path.
        encoding: Optional text encoding passed to ``open``. The default
            ``None`` keeps the platform default encoding.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    data = '\n'.join(lines)
    # The original wrote `file.close` without parentheses, so the handle was
    # never explicitly closed. `with` flushes and closes it on exit.
    with open(filename, 'w', encoding=encoding) as handle:
        handle.write(data)

In [88]:
# add tokens from file to vocab
def add_doc_to_vocab(filename, vocab):
    """Count the cleaned tokens of one file into ``vocab``.

    Args:
        filename: Path of the document to read with ``load_doc``.
        vocab: Object with an ``update`` method (a ``Counter`` in this
            notebook). It is modified in place.
    """
    vocab.update(clean_doc(load_doc(filename)))

In [89]:
# iterates through every file for updating vocab
def process_docs(directory, vocab):
    """Add every non-test document in ``directory`` to ``vocab``.

    Files whose name starts with ``'cv9'`` are treated as the test set
    and are skipped.

    Args:
        directory: Folder containing the review files.
        vocab: Vocabulary counter, updated in place.
    """
    training_names = [
        name for name in listdir(directory)
        if not name.startswith('cv9')  # skip any reviews in the test set
    ]
    for name in training_names:
        add_doc_to_vocab(directory + '/' + name, vocab)

In [90]:
# Build the vocabulary: count every cleaned token in the training reviews.
vocab = Counter()
# update vocab with both classes (process_docs skips the 'cv9*' test files)
process_docs('review_polarity/txt_sentoken/neg', vocab)
process_docs('review_polarity/txt_sentoken/pos', vocab)

# Keep only tokens seen at least `min_occurance` times.
# NOTE(review): the name is a misspelling of "min_occurrence"; it is left
# unchanged because other cells may reference it.
min_occurance = 2
tokens = [k for k,c in vocab.items() if c >= min_occurance]

# Vocabulary size before and after the frequency filter.
print(len(vocab))
print(len(tokens))
# save tokens in vocab.txt, one token per line
save_list(tokens, 'vocab.txt')

44276
25767


In [91]:
# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
    """Return one document as a single space-separated line of known tokens.

    The file is read with ``load_doc`` and cleaned with ``clean_doc``.
    Tokens that are not in ``vocab`` are dropped.

    Args:
        filename: Path of the document to read.
        vocab: Container supporting ``in`` that holds the accepted tokens.

    Returns:
        The surviving tokens joined by single spaces.
    """
    cleaned = clean_doc(load_doc(filename))
    return ' '.join(token for token in cleaned if token in vocab)

In [92]:
# load all docs in a directory
def process_docs_2(directory, vocab, is_train):
    """Convert the train or test documents of ``directory`` into token lines.

    A file belongs to the test split when its name starts with ``'cv9'``.
    With a truthy ``is_train`` only the other files are processed; with a
    falsy ``is_train`` only the ``'cv9'`` files are processed.

    Args:
        directory: Folder containing the review files.
        vocab: Accepted tokens, forwarded to ``doc_to_line``.
        is_train: Selects the training split when truthy, else the test split.

    Returns:
        List with one token line per selected file, in ``listdir`` order.
    """
    want_train = bool(is_train)
    lines = []
    for filename in listdir(directory):
        # train(0-899)...test(900-999)
        held_out = filename.startswith('cv9')
        # Skip held-out files when training and training files when testing.
        if held_out == want_train:
            continue
        lines.append(doc_to_line(directory + '/' + filename, vocab))
    return lines

In [93]:
def load_clean_dataset(vocab, is_train):
    """Load the cleaned negative and positive reviews with their labels.

    Args:
        vocab: Accepted tokens, forwarded to ``process_docs_2``.
        is_train: Truthy for the training split, falsy for the test split.

    Returns:
        Tuple ``(docs, labels)``. ``docs`` holds the negative lines followed
        by the positive lines. ``labels`` holds ``0`` for each negative
        document and ``1`` for each positive document, in the same order.
    """
    neg_docs = process_docs_2('review_polarity/txt_sentoken/neg', vocab, is_train)
    pos_docs = process_docs_2('review_polarity/txt_sentoken/pos', vocab, is_train)
    # Labels follow the concatenation order: negatives first, then positives.
    labels = [0] * len(neg_docs) + [1] * len(pos_docs)
    return neg_docs + pos_docs, labels

In [96]:
# fit a tokenizer
def create_tokenizer(lines):
    """Create a Keras ``Tokenizer`` and fit it on ``lines``.

    Args:
        lines: Texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted tokenizer.
    """
    fitted = tf.keras.preprocessing.text.Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [98]:
# load vocabulary from the file written by the vocabulary-building cell
# NOTE: `vocab` is rebound here from the earlier Counter to a plain set of
# token strings, so this cell must run after vocab.txt has been saved.
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

# load all reviews: True selects the training split, False the test split
train_docs, ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab, False)

# create the tokenizer; it is fitted on the training documents only
tokenizer = create_tokenizer(train_docs)

# encode data as one row per document using the tokenizer's 'freq' mode
# NOTE(review): the recorded output shows 25768 columns, one more than the
# 25767 saved tokens — presumably the Tokenizer's reserved index 0; confirm.
Xtrain = tokenizer.texts_to_matrix(train_docs, mode='freq')
Xtest = tokenizer.texts_to_matrix(test_docs, mode='freq')
print(Xtrain.shape, Xtest.shape)

(1800, 25768) (200, 25768)
