In [91]:
# movie dataset
from nltk.corpus import movie_reviews

# for shuffling
import random

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.corpus import stopwords
import string
import nltk

# nltk NaivesBayes Classifier
from nltk import NaiveBayesClassifier

# SVM classifier
from sklearn.svm import SVC

# Random forest classifier
from sklearn.ensemble import RandomForestClassifier

# dummy classifier -> converts nltk format data (dictionary, category) tuple to sklearn format data (X, Y) array.
from nltk.classify.scikitlearn import SklearnClassifier

In [3]:
# Accessing categories of movie reviews
movie_reviews.categories()

['neg', 'pos']

In [15]:
# Accessing actual movie review
# first access the file ids or file names
movie_reviews.fileids()

# length of the database
len(movie_reviews.fileids())

# accessing positive or negative files only
movie_reviews.fileids('neg')
movie_reviews.fileids('pos')

# getting words from the files
movie_reviews.words(movie_reviews.fileids()[5])
movie_reviews.words(movie_reviews.fileids()[1000])

['films', 'adapted', 'from', 'comic', 'books', 'have', ...]

Cleaning the dataset

In [23]:
# Storing the words and categories in array format
"""
In the documents[], each element will be a tuple.
Tuple -> words, categories.
Words will be bascially an array

Format -> documents[0] = [([word1, word2,... wordn], pos)]

Code working:
First access the categories or type of movie review files. // outer loop
Then, accessing all that one particular category files. // inner loop
Storing, words of each file along with it's category. // code inside inner loop
"""

documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))

documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [25]:
"""
Now, we are going to shuffle the fie to make it easier for train and test file splits.
Currently, documents[] contains 1000 negative file initially then 1000 positive file.
"""

random.shuffle(documents)

documents[0:10]

[(['the', 'art', 'of', 'woo', 'attempts', 'to', 'be', ...], 'neg'),
 (['capsule', ':', 'combine', 'one', 'quart', 'of', ...], 'neg'),
 (['carry', 'on', 'matron', 'is', 'the', 'last', 'great', ...], 'pos'),
 (['perhaps', 'the', 'most', 'dramatic', 'changes', 'in', ...], 'pos'),
 (['these', 'days', ',', 'we', 'are', 'witnessing', ...], 'neg'),
 (['plot', ':', 'a', 'peculiar', 'french', 'girl', ...], 'pos'),
 (['wild', 'things', 'is', 'a', 'way', 'to', 'steam', ...], 'neg'),
 (['for', 'any', 'groom', 'on', 'the', 'verge', 'of', ...], 'pos'),
 (['no', 'humans', 'were', 'harmed', ',', 'tested', 'or', ...], 'pos'),
 (['in', 'the', 'interest', 'of', 'being', 'generous', ...], 'neg')]

In [51]:
def get_simple_pos(pos):
    if pos.startswith('J'):
        return wordnet.ADJ
    elif pos.startswith('V'):
        return wordnet.VERB
    elif pos.startswith('N'):
        return wordnet.NOUN
    elif pos.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [52]:
"""
Example of how pos_tag works
"""
# w = "better"
# pos_tag([w])

'\nExample of how pos_tag works\n'

In [53]:
# creating stops set
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)

stops

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'need

In [62]:
"""
Cleaning function:
1. Remove stop words
2. Perform lemmatization
"""

def clean_review(words):
    output_words = []
    
    # accessing each words
    for each_word in words:
        # checking if it is a stop word or not
        if each_word.lower() not in stops:
            # adding the word to output_words[] after lemmatizing it
            # 1. getting pos tag for it
            pos = pos_tag([each_word]) # pos_tag requires an array to be passed
            """
            since, we only need the pos_tag and not the word
            we use pos[0][1]
            """
            clean_word = lemmatizer.lemmatize(each_word, get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    
    return output_words

In [64]:
# cleaning documents[]
"""
Here, document is list of words which gets passed into clean_review().
Then, inside there is a loop which accesses each word from the list of words passed.
"""
documents_cleaned = [(clean_review(document), category) for document, category in documents]

In [75]:
documents_cleaned[0]

(['art',
  'woo',
  'attempt',
  'one',
  'film',
  'like',
  'breakfast',
  'tiffany',
  'audience',
  'root',
  'sweet',
  'vulnerable',
  'irresistible',
  'woman',
  'work',
  'problem',
  'find',
  'happiness',
  'problem',
  'helen',
  'lee',
  'writes',
  'directs',
  'seem',
  'write',
  'alessa',
  'woo',
  'played',
  'sook',
  'yin',
  'lee',
  'neither',
  'sweet',
  'vulnerable',
  'quite',
  'resistible',
  'alessa',
  'young',
  'woman',
  'happens',
  'brilliant',
  'art',
  'dealer',
  'toronto',
  'art',
  'scene',
  'sort',
  'alternate',
  'world',
  'art',
  'scene',
  'people',
  'pay',
  'ten',
  'thousand',
  'dollar',
  'painting',
  'talented',
  'beginner',
  'dealer',
  'painting',
  'fly',
  'back',
  'forth',
  'place',
  'like',
  'switzerland',
  'one',
  'knowledgeable',
  'art',
  'dealer',
  'alessa',
  'also',
  'happens',
  'center',
  'adulation',
  'friend',
  'every',
  'party',
  'suitor',
  'camped',
  'outside',
  'window',
  'next',
  'door',

Building feature set

In [66]:
# Splitting into training and testing
training_documents_cleaned = documents_cleaned[0:1500]
testing_documents_cleaned = documents_cleaned[1500:]

In [67]:
"""
nltk requires to pass in this data form = {dictionary}, category
dictionary consist of features, where features are words from the documents_cleaned[]
"""

# Creating an array consisting of all the words from the documents_cleaned[]
all_words = []

for doc in training_documents_cleaned:
    all_words += doc[0]

all_words

['art',
 'woo',
 'attempt',
 'one',
 'film',
 'like',
 'breakfast',
 'tiffany',
 'audience',
 'root',
 'sweet',
 'vulnerable',
 'irresistible',
 'woman',
 'work',
 'problem',
 'find',
 'happiness',
 'problem',
 'helen',
 'lee',
 'writes',
 'directs',
 'seem',
 'write',
 'alessa',
 'woo',
 'played',
 'sook',
 'yin',
 'lee',
 'neither',
 'sweet',
 'vulnerable',
 'quite',
 'resistible',
 'alessa',
 'young',
 'woman',
 'happens',
 'brilliant',
 'art',
 'dealer',
 'toronto',
 'art',
 'scene',
 'sort',
 'alternate',
 'world',
 'art',
 'scene',
 'people',
 'pay',
 'ten',
 'thousand',
 'dollar',
 'painting',
 'talented',
 'beginner',
 'dealer',
 'painting',
 'fly',
 'back',
 'forth',
 'place',
 'like',
 'switzerland',
 'one',
 'knowledgeable',
 'art',
 'dealer',
 'alessa',
 'also',
 'happens',
 'center',
 'adulation',
 'friend',
 'every',
 'party',
 'suitor',
 'camped',
 'outside',
 'window',
 'next',
 'door',
 'alessa',
 'move',
 'struggle',
 'genius',
 'artist',
 'native',
 'american',
 'ben

In [72]:
# finding frequency of all the words
freq = nltk.FreqDist(all_words)

# most common words based on frequency
# pass a number which generates top x number of words
common = freq.most_common(3000)

# making a features[] with words only as it's entry
features = [entry[0] for entry in common]
features

['film',
 'movie',
 'one',
 'make',
 'like',
 'character',
 'get',
 'see',
 'go',
 'time',
 'well',
 'even',
 'scene',
 'good',
 'story',
 'take',
 'would',
 'much',
 'bad',
 'come',
 'also',
 'look',
 'life',
 'two',
 'give',
 'way',
 'know',
 'end',
 'seem',
 'first',
 'year',
 '--',
 'work',
 'thing',
 'plot',
 'play',
 'say',
 'really',
 'little',
 'show',
 'people',
 'love',
 'could',
 'man',
 'never',
 'star',
 'great',
 'director',
 'new',
 'try',
 'performance',
 'best',
 'action',
 'actor',
 'many',
 'big',
 'find',
 'u',
 'want',
 'watch',
 'think',
 'act',
 'role',
 'another',
 'back',
 'something',
 'still',
 'audience',
 'turn',
 'world',
 'old',
 'however',
 'use',
 'day',
 'set',
 'begin',
 'every',
 'cast',
 'real',
 'last',
 'though',
 'interest',
 'part',
 'feel',
 'comedy',
 'guy',
 'enough',
 'run',
 'around',
 'name',
 'point',
 'right',
 'long',
 'script',
 'john',
 'young',
 'write',
 'effect',
 'may',
 'fact',
 'funny',
 'woman',
 'minute',
 'place',
 'nothing',

In [76]:
# making the feature dictionary
def get_feature_dict(words):
    feature_dict = {}
    
    words_set = set(words)
    # taking each feature
    for each_feature in features:
        # sets each_feature as the key and True/False as value if present/not present respectively
        feature_dict[each_feature] = each_feature in words_set
    
    return feature_dict

In [74]:
training_data_cleaned = [(get_feature_dict(doc), category) for doc, category in training_documents_cleaned]

In [77]:
testing_data_cleaned = [(get_feature_dict(doc), category) for doc, category in testing_documents_cleaned]

In [81]:
# training the model using NB classifier
classifier = NaiveBayesClassifier.train(training_data_cleaned)

In [82]:
# finding the accuracy of the model
nltk.classify.accuracy(classifier, testing_data_cleaned)

0.786

In [83]:
classifier.show_most_informative_features(15)

Most Informative Features
             outstanding = True              pos : neg    =     31.7 : 1.0
               stupidity = True              neg : pos    =     16.0 : 1.0
               ludicrous = True              neg : pos    =      9.1 : 1.0
              schumacher = True              neg : pos    =      8.8 : 1.0
                   anger = True              pos : neg    =      7.8 : 1.0
                 idiotic = True              neg : pos    =      7.7 : 1.0
                religion = True              pos : neg    =      7.4 : 1.0
                   mulan = True              pos : neg    =      7.2 : 1.0
                 freddie = True              neg : pos    =      6.9 : 1.0
                    lame = True              neg : pos    =      6.7 : 1.0
             beautifully = True              pos : neg    =      6.7 : 1.0
              uninspired = True              neg : pos    =      6.6 : 1.0
                  anakin = True              pos : neg    =      6.5 : 1.0

Convert the data into X (input), Y (output) array format which can be automatically done by the nltk dummy classifier

1. Applying SVM classifier

In [86]:
svc = SVC()
classifier_sklearn = SklearnClassifier(svc)

In [87]:
classifier_sklearn.train(training_data_cleaned)

<SklearnClassifier(SVC())>

In [88]:
nltk.classify.accuracy(classifier_sklearn, testing_data_cleaned)

0.838

2. Applying random forest

In [96]:
rfc = RandomForestClassifier()
classifier_sklearn1 = SklearnClassifier(rfc)

In [97]:
classifier_sklearn1.train(training_data_cleaned)

<SklearnClassifier(RandomForestClassifier())>

In [98]:
nltk.classify.accuracy(classifier_sklearn1, testing_data_cleaned)

0.804