In [1]:
from nltk.corpus import movie_reviews

In [2]:
import nltk
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\tushant\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [3]:
movie_reviews.categories()

['neg', 'pos']

In [4]:
# movie_reviews.fileids()

## Cleaning the dataset

In [5]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid),category))
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [6]:
# We are shuffling the dataset
import random
random.shuffle(documents)
documents[0:5]

[(['man', ',', 'this', 'was', 'one', 'wierd', 'movie', ...], 'neg'),
 (['it', 'may', 'seem', 'weird', 'to', 'begin', 'a', ...], 'pos'),
 (['charlie', 'sheen', 'stars', 'as', 'zane', ',', 'a', ...], 'pos'),
 (['finding', 'the', 'courage', 'to', 'face', 'life', ...], 'pos'),
 (['director', 'jan', 'de', 'bont', 'certainly', 'knows', ...], 'pos')]

In [7]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


In [8]:
from nltk.corpus import stopwords
import string 
stop = stopwords.words("english")
punctuation = list(string.punctuation)
stop += punctuation

In [9]:
from nltk.corpus import wordnet
from nltk import pos_tag
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADJ
    else:
        return wordnet.NOUN

In [10]:
# Now we have to aplly the Lemmatization to each word and also removing the stop words.
# To do that we are writing a function
def clean(words):
    output_words = []
    for w in words:
        if w.lower() not in stop:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w,pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [11]:
documents_clean = [(clean(document), category) for document, category in documents]

In [12]:
documents_clean[0]

(['man',
  'one',
  'wierd',
  'movie',
  'similar',
  'conspiracy',
  'theory',
  'decide',
  'genre',
  'first',
  'hour',
  'standard',
  'stock',
  'alien',
  'clone',
  'nicely',
  'create',
  'eerie',
  'atmosphere',
  'ship',
  'last',
  'half',
  'hour',
  'maker',
  'blew',
  'script',
  'airlock',
  'decide',
  'screw',
  'let',
  'kill',
  'everybody',
  'forget',
  'sci',
  'fi',
  'movie',
  'becomes',
  '100',
  'horror',
  'really',
  'dissappointed',
  'movie',
  'try',
  'scare',
  'entirely',
  'wrong',
  'way',
  'instead',
  'use',
  'clever',
  'trick',
  'try',
  'build',
  'scare',
  'movie',
  'us',
  'loud',
  'noise',
  'sudden',
  'camera',
  'shift',
  'short',
  'quick',
  'burst',
  'gore',
  'yawn',
  'everyone',
  'see',
  'know',
  'expect',
  'one',
  'thing',
  'do',
  'well',
  'lead',
  'find',
  'happen',
  'previous',
  'crew',
  'skeleton',
  'lie',
  'around',
  'mangle',
  'mash',
  'finally',
  'painfully',
  'restore',
  'new',
  'crew',
  'v

In [13]:
train_documents = documents_clean[0:1500]
test_documents = documents_clean[1500:]

In [14]:
all_words = []
for document in train_documents:
    all_words += document[0] 

In [15]:
freq = nltk.FreqDist(all_words)
common = freq.most_common(3000)
features = [x[0] for x in common]

In [16]:
features

['film',
 'movie',
 'one',
 'make',
 'like',
 'character',
 'get',
 'good',
 'see',
 'go',
 'time',
 'scene',
 'even',
 'story',
 'take',
 'would',
 'much',
 'come',
 'also',
 'give',
 'well',
 'way',
 'two',
 'bad',
 'life',
 'look',
 '--',
 'first',
 'know',
 'seem',
 'end',
 'year',
 'work',
 'thing',
 'play',
 'plot',
 'really',
 'say',
 'little',
 'show',
 'people',
 'star',
 'man',
 'could',
 'love',
 'great',
 'best',
 'never',
 'new',
 'big',
 'try',
 'director',
 'performance',
 'action',
 'many',
 'find',
 'want',
 'actor',
 'think',
 'watch',
 'u',
 'role',
 'another',
 'act',
 'audience',
 'world',
 'still',
 'back',
 'something',
 'turn',
 'day',
 'however',
 'begin',
 'though',
 'set',
 'old',
 'comedy',
 'use',
 'part',
 'feel',
 'enough',
 'cast',
 'every',
 'around',
 'guy',
 'young',
 'last',
 'run',
 'may',
 'real',
 'interest',
 'point',
 'funny',
 'write',
 'fact',
 'long',
 'name',
 'almost',
 'john',
 'actually',
 'minute',
 'right',
 'effect',
 'woman',
 'script

In [17]:
def get_feature_dict(word):
    current_feature = {}
    word_set = set(word)
    for w in features:
        current_feature[w] = w in word_set
    return current_feature

In [18]:
get_feature_dict(train_documents[0][0])

{'film': False,
 'movie': True,
 'one': True,
 'make': False,
 'like': True,
 'character': False,
 'get': False,
 'good': True,
 'see': True,
 'go': False,
 'time': False,
 'scene': False,
 'even': False,
 'story': False,
 'take': False,
 'would': False,
 'much': True,
 'come': False,
 'also': False,
 'give': False,
 'well': True,
 'way': True,
 'two': False,
 'bad': True,
 'life': False,
 'look': False,
 '--': False,
 'first': True,
 'know': True,
 'seem': False,
 'end': False,
 'year': False,
 'work': True,
 'thing': True,
 'play': False,
 'plot': False,
 'really': True,
 'say': False,
 'little': False,
 'show': False,
 'people': False,
 'star': False,
 'man': True,
 'could': True,
 'love': False,
 'great': False,
 'best': False,
 'never': False,
 'new': True,
 'big': False,
 'try': True,
 'director': False,
 'performance': False,
 'action': False,
 'many': False,
 'find': True,
 'want': False,
 'actor': True,
 'think': False,
 'watch': False,
 'u': True,
 'role': False,
 'another': 

In [19]:
training_data = [ [get_feature_dict(document),category] for document,category in train_documents]

In [20]:
test_data = [ [get_feature_dict(document),category] for document,category in test_documents]

In [21]:
# Applying the classifier
from nltk import NaiveBayesClassifier

In [22]:
clg = NaiveBayesClassifier.train(training_data)

In [23]:
nltk.classify.accuracy(clg, test_data)

0.794

In [24]:
clg.show_most_informative_features(15)

Most Informative Features
               ludicrous = True              neg : pos    =     13.7 : 1.0
              schumacher = True              neg : pos    =     12.3 : 1.0
               stupidity = True              neg : pos    =     10.8 : 1.0
                 idiotic = True              neg : pos    =      9.9 : 1.0
             outstanding = True              pos : neg    =      9.0 : 1.0
             beautifully = True              pos : neg    =      8.8 : 1.0
                   lucas = True              pos : neg    =      7.8 : 1.0
                   anger = True              pos : neg    =      6.8 : 1.0
                  welles = True              neg : pos    =      6.7 : 1.0
                religion = True              pos : neg    =      6.6 : 1.0
               malkovich = True              pos : neg    =      6.6 : 1.0
                   mulan = True              pos : neg    =      6.6 : 1.0
             wonderfully = True              pos : neg    =      6.6 : 1.0

## Using sklearn classifier to classify the the text in nltk.

In [25]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

In [26]:
svc = SVC()
# We can pass any sklearn classifier into SklearnClassifier(). And it will use that to classify the problem.
classifier = SklearnClassifier(svc)

In [28]:
classifier.train(training_data)

<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))>

In [29]:
nltk.classify.accuracy(classifier, test_data)

0.784

In [30]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
rfc = RandomForestClassifier
classifier1 = SklearnClassifier(svc)

In [32]:
classifier1.train(training_data)

<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))>

In [35]:
nltk.classify.accuracy(classifier1, test_data)


0.784