In [1]:
import nltk
from nltk.corpus import movie_reviews

In [2]:
movie_reviews.categories()

['neg', 'pos']

In [3]:
len(movie_reviews.fileids())

2000

In [4]:
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [5]:
#making a tuple like (word, label)
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
documents

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg'),
 (['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...], 'neg'),
 (['so', 'ask', 'yourself', 'what', '"', '8mm', '"', '(', ...], 'neg'),
 (['that', "'", 's', 'exactly', 'how', 'long', 'the', ...], 'neg'),
 (['call', 'it', 'a', 'road', 'trip', 'for', 'the', ...], 'neg'),
 (['plot', ':', 'a', 'young', 'french', 'boy', 'sees', ...], 'neg'),
 (['best', 'remembered', 'for', 'his', 'understated', ...], 'neg'),
 (['janeane', 'garofalo', 'in', 'a', 'romantic', ...], 'neg'),
 (['and', 'now', 'the', 'high', '-', 'flying', 'hong', ...], 'neg'),
 (['a', 'movie', 'like', 'mortal', 'kombat', ':', ...], 'neg'),
 (['she', 'was', 'the', 'femme', 'in', 

In [6]:
import random
random.shuffle(documents)
documents[0:5]

[(['even', 'though', 'i', 'have', 'the', 'utmost', ...], 'neg'),
 (['*', '*', '*', '*', '*', '*', 'minor', 'plot', ...], 'pos'),
 (['after', 'the', 'huge', 'success', 'of', '"', 'the', ...], 'neg'),
 (['you', 'know', 'something', ',', 'christmas', 'is', ...], 'neg'),
 (['no', 'filmmaker', 'deconstructs', 'a', 'story', 'as', ...], 'pos')]

In [7]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [8]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [9]:
#there's a problem with this function
from nltk import pos_tag
w = "better"
pos_tag(w) #pos_tag expects to get an array

[('b', 'NN'), ('e', 'NN'), ('t', 'NN'), ('t', 'NN'), ('e', 'NN'), ('r', 'NN')]

In [10]:
from nltk.corpus import stopwords
stops = set(stopwords.words('english'))
import string
punctuations = list(string.punctuation)
stops.update(punctuations)

In [11]:
#We need to do cleaning: removing stopwords + lemmatization
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w,
                                 pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [12]:
documents = [(clean_review(document), category) for document, category in documents]

In [13]:
documents[0]

(['even',
  'though',
  'utmost',
  'respect',
  'richard',
  'dreyfus',
  'actor',
  'presence',
  'motion',
  'picture',
  'guarantee',
  'particular',
  'level',
  'quality',
  'like',
  'everyone',
  'else',
  'dreyfus',
  'bill',
  'pay',
  'occasionally',
  'accepts',
  'big',
  'paycheck',
  'prominent',
  'role',
  'bad',
  'movie',
  'consequently',
  'career',
  'highlight',
  'include',
  'jaw',
  'close',
  'encounter',
  'third',
  'kind',
  'stakeout',
  'tin',
  'men',
  'resume',
  'dot',
  'title',
  'like',
  'moon',
  'parador',
  'let',
  'ride',
  'krippendorf',
  'tribe',
  'let',
  'start',
  'say',
  'krippendorf',
  'tribe',
  'occasionally',
  'funny',
  'although',
  'never',
  'riotously',
  'asset',
  'best',
  'word',
  'describe',
  'film',
  'asinine',
  'target',
  'audience',
  'would',
  'appear',
  'recent',
  'nursery',
  'school',
  'graduate',
  'numerous',
  'sexual',
  'innuendo',
  'aim',
  'someone',
  'go',
  'puberty',
  'krippendorf',
  'tr

In [19]:
training_documents = documents[0:1500]
testing_documents = documents[1500:]

In [20]:
#In nltk classification, we need to provide a dictionary
#with all the features i.e. f1 and value corresponding to f1
#and we need to provide the category
#we will basically convert the document into touples of
#dictionary of each feature and their category
all_words = []
for doc in training_documents:
    all_words += doc[0]


In [23]:
import nltk
freq = nltk.FreqDist(all_words)
common = freq.most_common(3000)
features = [i[0] for i in common]

In [27]:
def get_features_dict(words):
    current_features = {}
    words_set = set(words)
    for w in features:
        current_features[w] = w in words_set
    return current_features

In [29]:
get_features_dict(training_documents[0][0])

{'film': True,
 'movie': True,
 'one': True,
 'make': False,
 'like': True,
 'character': True,
 'get': False,
 'see': True,
 'go': True,
 'time': True,
 'well': False,
 'scene': False,
 'even': True,
 'good': False,
 'story': False,
 'take': False,
 'would': True,
 'much': True,
 'come': True,
 'also': False,
 'give': True,
 'life': False,
 'two': True,
 'look': False,
 'way': True,
 'know': True,
 'bad': True,
 'first': True,
 'end': False,
 '--': True,
 'seem': True,
 'year': True,
 'work': True,
 'thing': True,
 'plot': True,
 'play': False,
 'say': True,
 'really': False,
 'people': True,
 'little': True,
 'show': False,
 'could': False,
 'man': False,
 'great': False,
 'star': True,
 'never': True,
 'try': True,
 'best': True,
 'love': True,
 'director': True,
 'new': True,
 'many': False,
 'actor': True,
 'performance': False,
 'big': True,
 'find': False,
 'watch': False,
 'want': True,
 'action': False,
 'u': False,
 'role': True,
 'think': False,
 'another': False,
 'act': Tr

In [31]:
training_data = [(get_features_dict(doc), category) for doc, category in training_documents]
testing_data = [(get_features_dict(doc), category) for doc, category in testing_documents]

In [33]:
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(training_data)

In [35]:
nltk.classify.accuracy(classifier, testing_data)

0.812

In [36]:
classifier.show_most_informative_features(15)

Most Informative Features
             wonderfully = True              pos : neg    =     13.8 : 1.0
               ludicrous = True              neg : pos    =     12.8 : 1.0
                  castle = True              pos : neg    =     11.8 : 1.0
              dreamworks = True              pos : neg    =     11.1 : 1.0
             outstanding = True              pos : neg    =      9.4 : 1.0
                   anger = True              pos : neg    =      8.0 : 1.0
                   mulan = True              pos : neg    =      7.9 : 1.0
               stupidity = True              neg : pos    =      7.3 : 1.0
                  turkey = True              neg : pos    =      6.7 : 1.0
            breathtaking = True              pos : neg    =      6.2 : 1.0
                  anakin = True              pos : neg    =      6.0 : 1.0
                  random = True              neg : pos    =      5.7 : 1.0
                  seagal = True              neg : pos    =      5.7 : 1.0