In [1]:
import string
import re
from os import listdir
from collections import Counter
from nltk.corpus import stopwords

In [2]:
#load doc into memory
def load_doc(filename):
    """Read an entire text file into memory.

    Args:
        filename: Path of the file to read; it is opened in text mode ("r").

    Returns:
        The complete contents of the file as a single string.
    """
    # The context manager closes the handle even when read() raises
    # (for example on a decoding error), which a trailing close() would miss.
    with open(filename,"r") as file:
        return file.read()

In [3]:
#turn a doc into clean tokens
def clean_doc(doc):
    """Turn a raw document string into a list of cleaned tokens.

    The document is split on whitespace. Each piece then has its ASCII
    punctuation characters (``string.punctuation``) stripped, and is kept
    only if it is purely alphabetic, is not an English stopword and is
    longer than one character. Case is left untouched, so the stopword
    comparison is case-sensitive.

    Args:
        doc: The text to tokenize.

    Returns:
        The surviving tokens, in their original order.
    """
    punctuation_pattern=re.compile('[%s]'%re.escape(string.punctuation))
    english_stopwords=set(stopwords.words('english'))
    cleaned=[]
    for raw in doc.split():
        candidate=punctuation_pattern.sub('',raw)
        #drop anything that is empty or contains non-letters
        if not candidate.isalpha():
            continue
        #drop stopwords
        if candidate in english_stopwords:
            continue
        #drop single-character tokens
        if len(candidate)<=1:
            continue
        cleaned.append(candidate)
    return cleaned

In [4]:
#load doc and add to vocab
def add_doc_to_vocab(filename,vocab):
    """Load one document, tokenize it and add its tokens to ``vocab``.

    Args:
        filename: Path of the document, passed to ``load_doc``.
        vocab: Object updated in place through ``vocab.update(tokens)``;
            the notebook passes a ``collections.Counter``.
    """
    #read -> clean -> count, in a single pipeline
    vocab.update(clean_doc(load_doc(filename)))

In [5]:
#load all docs in a directory
def process_docs(directory,vocab):
    """Add every ``.txt`` file found in ``directory`` to the vocabulary.

    Args:
        directory: Folder whose entries are listed with ``listdir``.
        vocab: Vocabulary object handed to ``add_doc_to_vocab`` for each
            file; it is updated in place.
    """
    #walk through all files in the folder
    for filename in listdir(directory):
        #skip files that do not have the right extension; this must be
        #`continue`, because a bare `next` expression does nothing
        if not filename.endswith(".txt"):
            continue
        #create the full path
        path=directory+'/'+filename
        add_doc_to_vocab(path,vocab)

Filtering out rare words (performed further below) reduces the vocabulary from 46,557 to 14,803 words, a huge drop.

In [7]:
def save_list(lines,filename):
    """Write a sequence of strings to a file, one item per line.

    The items are joined with newline characters, so no trailing newline is
    written after the last item. An existing file is overwritten.

    Args:
        lines: Iterable of strings to write.
        filename: Path of the output file, opened in text mode ('w').
    """
    data='\n'.join(lines)
    # The context manager closes the handle even when write() fails,
    # so buffered data is flushed or the error surfaces cleanly.
    with open(filename,'w') as file:
        file.write(data)

In [8]:
#folders holding the negative (neg) and positive (pos) review files
directory1="data/review_polarity/txt_sentoken/neg"
directory2="data/review_polarity/txt_sentoken/pos"
#one shared counter collects the tokens from both folders
vocab=Counter()
for review_dir in (directory1,directory2):
    process_docs(review_dir,vocab)
#number of distinct tokens collected
print(len(vocab))

46557


Running the example creates a vocabulary with all documents in the dataset, including both positive and negative reviews.


In [9]:
#show the 50 most frequent tokens together with their counts
print(vocab.most_common(50))
#the three most common tokens are 'film', 'one' and 'movie'

[('film', 8860), ('one', 5521), ('movie', 5440), ('like', 3553), ('even', 2555), ('good', 2320), ('time', 2283), ('story', 2118), ('films', 2102), ('would', 2042), ('much', 2024), ('also', 1965), ('characters', 1947), ('get', 1921), ('character', 1906), ('two', 1825), ('first', 1768), ('see', 1730), ('well', 1694), ('way', 1668), ('make', 1590), ('really', 1563), ('little', 1491), ('life', 1472), ('plot', 1451), ('people', 1420), ('movies', 1416), ('could', 1395), ('bad', 1374), ('scene', 1373), ('never', 1364), ('best', 1301), ('new', 1277), ('many', 1268), ('doesnt', 1267), ('man', 1266), ('scenes', 1265), ('dont', 1210), ('know', 1207), ('hes', 1150), ('great', 1141), ('another', 1111), ('love', 1089), ('action', 1078), ('go', 1075), ('us', 1065), ('director', 1056), ('something', 1048), ('end', 1047), ('still', 1038)]


Perhaps the least common words, those that only appear once across all reviews, are not predictive. Perhaps some of the most common words are not useful either. These are good questions and really should be tested with a specific predictive model. Generally, words that only appear once or a few times across 2,000 reviews are probably not predictive and can be removed from the vocabulary, greatly cutting down on the tokens we need to model. We can do this by stepping through words and their counts and only keeping those with a count at or above a chosen threshold. Here we will use a minimum of 5 occurrences.


In [10]:
#keep tokens that occur at least min_occurrence times (count >= 5)
min_occurrence=5
tokens=[word for word,count in vocab.items() if count>=min_occurrence]
#size of the reduced vocabulary
print(len(tokens))
#write the kept tokens to disk
save_list(tokens,"vocab.txt")

14803


Running this final snippet after creating the vocabulary will save the chosen words to a file. It is a good idea to take a look at, and even study, your chosen vocabulary in order to get ideas for better preparing this data, or text data in the future.

Next we can look at using the vocabulary to create a prepared version of the movie review dataset.