In [4]:
from nltk.corpus import movie_reviews

In [6]:
 >>> import nltk
>>> nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\blank\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [5]:
movie_reviews.categories()

['neg', 'pos']

In [7]:
movie_reviews.fileids('pos')

['pos/cv000_29590.txt',
 'pos/cv001_18431.txt',
 'pos/cv002_15918.txt',
 'pos/cv003_11664.txt',
 'pos/cv004_11636.txt',
 'pos/cv005_29443.txt',
 'pos/cv006_15448.txt',
 'pos/cv007_4968.txt',
 'pos/cv008_29435.txt',
 'pos/cv009_29592.txt',
 'pos/cv010_29198.txt',
 'pos/cv011_12166.txt',
 'pos/cv012_29576.txt',
 'pos/cv013_10159.txt',
 'pos/cv014_13924.txt',
 'pos/cv015_29439.txt',
 'pos/cv016_4659.txt',
 'pos/cv017_22464.txt',
 'pos/cv018_20137.txt',
 'pos/cv019_14482.txt',
 'pos/cv020_8825.txt',
 'pos/cv021_15838.txt',
 'pos/cv022_12864.txt',
 'pos/cv023_12672.txt',
 'pos/cv024_6778.txt',
 'pos/cv025_3108.txt',
 'pos/cv026_29325.txt',
 'pos/cv027_25219.txt',
 'pos/cv028_26746.txt',
 'pos/cv029_18643.txt',
 'pos/cv030_21593.txt',
 'pos/cv031_18452.txt',
 'pos/cv032_22550.txt',
 'pos/cv033_24444.txt',
 'pos/cv034_29647.txt',
 'pos/cv035_3954.txt',
 'pos/cv036_16831.txt',
 'pos/cv037_18510.txt',
 'pos/cv038_9749.txt',
 'pos/cv039_6170.txt',
 'pos/cv040_8276.txt',
 'pos/cv041_21113.txt',
 

In [8]:
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

# cleaning data 

In [12]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid),category))
        

In [13]:
documents[4]

(['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')

In [14]:
import random
random.shuffle(documents)
documents[0:5]

[(['i', 'must', 'say', 'from', 'the', 'outset', 'that', ...], 'pos'),
 (['_dirty_work_', 'has', 'a', 'premise', 'of', ...], 'neg'),
 (['what', 'do', 'you', 'get', 'when', 'you', 'slap', ...], 'pos'),
 (['attention', 'moviegoers', ':', 'you', 'are', 'about', ...], 'neg'),
 (['i', 'can', "'", 't', 'recall', 'a', 'previous', ...], 'neg')]

In [21]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else :
        return wordnet.NOUN
        

In [17]:
from nltk.corpus import stopwords
import string
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)

In [20]:
from nltk import pos_tag

In [22]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [30]:
def clean_review(words):
    output_words = []
    for w in words:
        if(w.lower() not in stops):
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w,pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words
            
                                                                            

In [31]:
documents = [(clean_review(document),category) for document,category in documents]

In [32]:
documents[4]

(['recall',
  'previous',
  'film',
  'experience',
  'fairly',
  'good',
  'time',
  'turn',
  'sour',
  'quickly',
  'feel',
  'minnesota',
  'forty',
  'minute',
  'lure',
  'loopy',
  'occasionally',
  'wrought',
  'romantic',
  'comedy',
  'even',
  'give',
  'keanu',
  'reef',
  'benefit',
  'doubt',
  'rather',
  'suddenly',
  'clubbed',
  'head',
  'nasty',
  'bit',
  'violence',
  'shortly',
  'thereafter',
  'whack',
  'gut',
  'another',
  'feel',
  'minnesota',
  'film',
  'make',
  'feel',
  'violate',
  'though',
  'trust',
  'writer',
  'director',
  'steven',
  'baigelman',
  'bring',
  'cool',
  'glass',
  'water',
  'instead',
  'threw',
  'acid',
  'face',
  'feel',
  'minnesota',
  'tell',
  'story',
  'topless',
  'dancer',
  'name',
  'freddie',
  'cameron',
  'diaz',
  'find',
  'deep',
  'trouble',
  'bos',
  'red',
  'delroy',
  'lindo',
  'suspect',
  'steal',
  'red',
  'punishment',
  'force',
  'freddie',
  'marry',
  'bookkeeper',
  'sam',
  'clayton',
  '

# Building feature set

In [34]:
training_doc = documents[0:1500]
testing_doc = documents[1500:]

In [36]:
all_words = []
for doc in training_doc:
    all_words += doc[0]

In [38]:
freq = nltk.FreqDist(all_words)

In [43]:
common = freq.most_common(3000)

In [46]:
features = []
for i in common :
    features.append(i[0])

In [47]:
features

['film',
 'movie',
 'one',
 'make',
 'like',
 'character',
 'get',
 'see',
 'go',
 'time',
 'well',
 'scene',
 'even',
 'good',
 'story',
 'take',
 'would',
 'much',
 'bad',
 'come',
 'way',
 'give',
 'life',
 'also',
 'look',
 'know',
 'two',
 'first',
 '--',
 'seem',
 'end',
 'work',
 'thing',
 'year',
 'play',
 'plot',
 'say',
 'really',
 'little',
 'show',
 'people',
 'could',
 'never',
 'man',
 'love',
 'try',
 'great',
 'star',
 'director',
 'best',
 'many',
 'performance',
 'actor',
 'big',
 'action',
 'new',
 'want',
 'u',
 'find',
 'watch',
 'think',
 'another',
 'act',
 'role',
 'turn',
 'back',
 'something',
 'still',
 'world',
 'audience',
 'day',
 'use',
 'set',
 'however',
 'every',
 'though',
 'old',
 'guy',
 'begin',
 'real',
 'feel',
 'enough',
 'comedy',
 'run',
 'part',
 'cast',
 'around',
 'point',
 'interest',
 'name',
 'fact',
 'funny',
 'last',
 'long',
 'young',
 'may',
 'right',
 'actually',
 'lot',
 'effect',
 'script',
 'friend',
 'woman',
 'write',
 'place',

In [48]:
def get_feature_dict(words):
    current_features = {}
    words_set = set(words)
    for w in features:
        current_features[w] = w in words_set
    return current_features

In [57]:
training_data = [(get_feature_dict(doc),category) for doc, category in training_doc]

In [61]:
testing_data = [(get_feature_dict(doc),category) for doc, category in testing_doc]

In [56]:
training_data[0]

{'film': True,
 'movie': True,
 'one': True,
 'make': True,
 'like': True,
 'character': True,
 'get': True,
 'see': True,
 'go': True,
 'time': True,
 'well': False,
 'scene': True,
 'even': False,
 'good': True,
 'story': False,
 'take': True,
 'would': True,
 'much': True,
 'bad': False,
 'come': True,
 'way': False,
 'give': True,
 'life': True,
 'also': False,
 'look': True,
 'know': True,
 'two': False,
 'first': False,
 '--': False,
 'seem': True,
 'end': False,
 'work': True,
 'thing': False,
 'year': False,
 'play': True,
 'plot': True,
 'say': True,
 'really': False,
 'little': True,
 'show': False,
 'people': False,
 'could': True,
 'never': True,
 'man': False,
 'love': False,
 'try': False,
 'great': True,
 'star': False,
 'director': True,
 'best': True,
 'many': False,
 'performance': True,
 'actor': True,
 'big': False,
 'action': True,
 'new': True,
 'want': False,
 'u': False,
 'find': True,
 'watch': False,
 'think': False,
 'another': False,
 'act': False,
 'role': 

# Classification 

In [52]:
from nltk import NaiveBayesClassifier

In [58]:
classifier = NaiveBayesClassifier.train(training_data)

In [63]:
nltk.classify.accuracy(classifier,testing_data)

0.774

In [66]:
classifier.show_most_informative_features(15)

Most Informative Features
               ludicrous = True              neg : pos    =     18.8 : 1.0
                religion = True              pos : neg    =      9.8 : 1.0
              schumacher = True              neg : pos    =      9.3 : 1.0
             outstanding = True              pos : neg    =      9.2 : 1.0
             wonderfully = True              pos : neg    =      9.2 : 1.0
                   jolie = True              neg : pos    =      8.0 : 1.0
                 garbage = True              neg : pos    =      7.5 : 1.0
               marvelous = True              pos : neg    =      7.3 : 1.0
                  seagal = True              neg : pos    =      7.1 : 1.0
                 idiotic = True              neg : pos    =      7.0 : 1.0
                   damon = True              pos : neg    =      6.7 : 1.0
              uninspired = True              neg : pos    =      6.7 : 1.0
                  lonely = True              pos : neg    =      6.6 : 1.0