In [32]:
# movie dataset
from nltk.corpus import movie_reviews

# for shuffling
import random

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.corpus import stopwords
import string
import nltk
from sklearn.model_selection import train_test_split

# count vectorizer import
from sklearn.feature_extraction.text import CountVectorizer

# RandomForest import
from sklearn.ensemble import RandomForestClassifier

# SVM classifier
from sklearn.svm import SVC

In [2]:
# Accessing categories of movie reviews
movie_reviews.categories()

['neg', 'pos']

In [3]:
# Accessing actual movie review
# first access the file ids or file names
movie_reviews.fileids()

# length of the database
len(movie_reviews.fileids())

# accessing positive or negative files only
movie_reviews.fileids('neg')
movie_reviews.fileids('pos')

# getting words from the files
movie_reviews.words(movie_reviews.fileids()[5])
movie_reviews.words(movie_reviews.fileids()[1000])

['films', 'adapted', 'from', 'comic', 'books', 'have', ...]

In [4]:
# Storing the words and categories in array format
"""
In the documents[], each element will be a tuple.
Tuple -> words, categories.
Words will be bascially an array

Format -> documents[0] = [([word1, word2,... wordn], pos)]

Code working:
First access the categories or type of movie review files. // outer loop
Then, accessing all that one particular category files. // inner loop
Storing, words of each file along with it's category. // code inside inner loop
"""

documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))

documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [5]:
"""
Now, we are going to shuffle the fie to make it easier for train and test file splits.
Currently, documents[] contains 1000 negative file initially then 1000 positive file.
"""

random.shuffle(documents)

documents[0:10]

[(['michael', 'crichton', 'has', 'had', 'a', 'long', ...], 'neg'),
 (['several', 'days', 'after', 'having', 'seen', 'this', ...], 'neg'),
 (['the', 'previews', 'for', 'the', 'movie', 'are', ...], 'neg'),
 (['urban', 'legend', 'surprised', 'me', '.', 'based', ...], 'pos'),
 (['before', 'the', 'remake', 'of', 'psycho', 'appears', ...], 'neg'),
 (['according', 'to', 'popular', 'film', 'opinion', ',', ...], 'neg'),
 (['in', 'december', 'of', '1996', ',', 'a', 'little', ...], 'pos'),
 (['man', ',', 'this', 'was', 'one', 'wierd', 'movie', ...], 'neg'),
 (['8mm', ',', 'written', 'by', 'seven', 'scribe', ...], 'neg'),
 (['american', 'pie', 'acknowledges', 'a', 'cold', ',', ...], 'pos')]

In [6]:
def get_simple_pos(pos):
    if pos.startswith('J'):
        return wordnet.ADJ
    elif pos.startswith('V'):
        return wordnet.VERB
    elif pos.startswith('N'):
        return wordnet.NOUN
    elif pos.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [7]:
# creating stops set
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)

stops

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'need

In [8]:
"""
Cleaning function:
1. Remove stop words
2. Perform lemmatization
"""

def clean_review(words):
    output_words = []
    
    # accessing each words
    for each_word in words:
        # checking if it is a stop word or not
        if each_word.lower() not in stops:
            # adding the word to output_words[] after lemmatizing it
            # 1. getting pos tag for it
            pos = pos_tag([each_word]) # pos_tag requires an array to be passed
            """
            since, we only need the pos_tag and not the word
            we use pos[0][1]
            """
            clean_word = lemmatizer.lemmatize(each_word, get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    
    return output_words

In [9]:
# cleaning documents[]
"""
Here, document is list of words which gets passed into clean_review().
Then, inside there is a loop which accesses each word from the list of words passed.
"""
documents_cleaned = [(clean_review(document), category) for document, category in documents]

In [10]:
# getting all the categories of the documents_cleaned[] into a single array
categories = [category for document, category in documents_cleaned]

In [11]:
# making all the words into a single sentence with space in b/w
text_documents = [" ".join(document) for document, category in documents_cleaned]

In [17]:
# splitting training and testing
x_train, x_test, y_train, y_test = train_test_split(text_documents, categories)

In [37]:
# Applying count vectorizer
count_vect = CountVectorizer(max_features = 2000, ngram_range = (2, 3))
x_train_features = count_vect.fit_transform(x_train)

x_train_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 2],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [38]:
x_train_features

<1500x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 40949 stored elements in Compressed Sparse Row format>

In [40]:
count_vect.get_feature_names()

['10 10',
 '10 scale',
 '10 thing',
 '10 thing hate',
 '10 year',
 '100 million',
 '14 year',
 '15 minute',
 '15 year',
 '1999 eugene',
 '1999 eugene novikov',
 '19th century',
 '20 minute',
 '2001 space',
 '2001 space odyssey',
 '20th century',
 '30 minute',
 '30 year',
 '90 minute',
 'able make',
 'absolutely nothing',
 'academy award',
 'ace ventura',
 'act ability',
 'act film',
 'act like',
 'act talent',
 'action adventure',
 'action comedy',
 'action film',
 'action flick',
 'action hero',
 'action movie',
 'action packed',
 'action scene',
 'action sequence',
 'action thriller',
 'actor film',
 'actor play',
 'actually get',
 'actually quite',
 'actually work',
 'adam sandler',
 'adult film',
 'african american',
 'al pacino',
 'alan smithee',
 'albert brook',
 'alec baldwin',
 'ali larter',
 'alien alien',
 'alien film',
 'alien race',
 'alien resurrection',
 'almost always',
 'almost entirely',
 'almost every',
 'along line',
 'along way',
 'already know',
 'also direct',
 'a

In [41]:
# getting results on testing data by count vectorizer
x_test_features = count_vect.transform(x_test)

In [42]:
x_test_features

<500x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 12513 stored elements in Compressed Sparse Row format>

Now we can use any sklearn classifier to train the model.

In [43]:
# Applying RandomForest classifier
rfc = RandomForestClassifier(n_estimators = 2000)
rfc.fit(x_train_features, y_train)

In [44]:
rfc.score(x_test_features, y_test)

0.716

In [45]:
# Applying SVM classifier
svc = SVC()
svc.fit(x_train_features, y_train)

In [46]:
svc.score(x_test_features, y_test)

0.696