In [None]:
## Loading the data set from nltk itself

In [89]:
from nltk.corpus import movie_reviews

## exploring data

In [90]:
movie_reviews.categories() ## list the category labels present in the corpus

['neg', 'pos']

In [91]:
# count the file ids (one per review) in the corpus
len(movie_reviews.fileids())

2000

In [94]:
# peek at the tokens of a single review (the file id at index 5)
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [None]:
## storing all data into list and then shuffling it randomly

In [95]:
# Pair every review's token sequence with the category it was filed under,
# walking the categories in the order the corpus reports them.
documents = [
    (movie_reviews.words(fileid), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [96]:
import random

# Shuffle in place with a dedicated, seeded RNG so that "Restart & Run All"
# always produces the same ordering. The train/test split further down is a
# plain slice of this list, so an unseeded shuffle would change the split
# (and every reported accuracy) on each run.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(documents)

In [97]:
# first five (tokens, category) pairs after shuffling
documents[0:5]

[(['after', 'a', 'marketing', 'windup', 'of', 'striking', ...], 'neg'),
 (['written', 'by', 'alex', 'cox', ',', 'tod', 'davies', ...], 'neg'),
 (['john', 'carpenter', 'makes', 'b', '-', 'movies', '.', ...], 'neg'),
 (['1992', "'", 's', 'alien3', 'marked', 'not', 'only', ...], 'pos'),
 (['battlefield', 'earth', 'is', 'the', 'worst', 'film', ...], 'neg')]

## Removing stop words and punctuation, then lemmatizing the remaining data

In [98]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [99]:
from nltk.corpus import stopwords
import string

# Filter set used during cleaning: English stop words plus every character
# in string.punctuation.
punctuations = list(string.punctuation)
stops = set(stopwords.words('english')).union(punctuations)
stops


{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'need

In [100]:
# single lemmatizer instance, looked up by name inside clean_review
lemmatizer = WordNetLemmatizer()

In [101]:
# quick check of what the tagger returns for a single word
from nltk import pos_tag
w='better'
pos_tag([w])  # pos_tag expects a list of tokens, not a bare string

[('better', 'RBR')]

## Making a function to convert the tag returned by pos_tag into the simpler POS constants that the lemmatizer understands

In [102]:
def get_simple_pos(tag):
    """Translate a POS tag string into the matching ``wordnet`` POS constant.

    The decision is made on the first letter of ``tag``:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Any other tag falls back to noun.
    """
    # Checked in order; the first matching prefix wins.
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, simple_pos in prefix_table:
        if tag.startswith(prefix):
            return simple_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [None]:
## making a function that gets each word's POS tag, lemmatizes the word, and collects the results in a list

In [103]:
def clean_review(words):
    """Remove stop words/punctuation from a review and lemmatize the rest.

    Parameters
    ----------
    words : iterable of str
        Tokens of one review.

    Returns
    -------
    list of str
        Lower-cased lemmas of the tokens that survived filtering, in their
        original order.
    """
    output_words = []

    for w in words:
        # `stops` is the stop-word + punctuation set built earlier in the
        # notebook; membership is tested on the lower-cased token.
        if w.lower() not in stops:
            # The token is tagged and lemmatized in its original case (only
            # the result is lower-cased) to avoid discarding information.
            # NOTE(review): tagging each token in isolation ignores sentence
            # context; tagging the whole token list once would be faster but
            # would change the resulting lemmas -- confirm before switching.
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos=get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [104]:
# NOTE: this rebinds `documents`, so the raw token sequences are no longer
# reachable under that name after the cell has run.
documents = [(clean_review(document), category) for document, category  in documents ] 
## apply clean_review to every review, keeping its category label

In [105]:
# inspect the first cleaned review together with its category
documents[0]

(['marketing',
  'windup',
  'strike',
  'visuals',
  'promise',
  'star',
  'caliber',
  'actor',
  'mission',
  'mar',
  'end',
  'throw',
  'whiffleball',
  'fiercely',
  'unoriginal',
  'director',
  'depalma',
  'cobble',
  'together',
  'film',
  'borrowing',
  'heavily',
  'go',
  'alien',
  'similar',
  'close',
  'encounter',
  'third',
  'kind',
  'strand',
  'astronaut',
  'theme',
  'reminiscent',
  'robinson',
  'crusoe',
  'mar',
  'astronaut',
  'encounter',
  'space',
  'flight',
  'difficulty',
  'smack',
  'apollo',
  '13',
  'interior',
  'spacecraft',
  'visuals',
  'redolent',
  '2001',
  'space',
  'odyssey',
  'instead',
  'use',
  'component',
  'launch',
  'pad',
  'create',
  'movie',
  'de',
  'palma',
  'stop',
  'right',
  'refuse',
  'infuse',
  'film',
  'anything',
  'even',
  'remotely',
  'resemble',
  'cleverness',
  'heart',
  'mission',
  'mar',
  'take',
  'first',
  'wobbly',
  'step',
  'pre',
  'launch',
  'barbeque',
  'perfunctory',
  'charact

## building features 

In [106]:
## train/test split comes first: the data is already shuffled, so a plain
## slice at index 1500 is enough
## the features are built from the training portion only
training_documents, test_documents = documents[:1500], documents[1500:]

## making all the features(words) present in a single list

In [107]:
## flatten the token lists of all training documents into one list
## (each entry of training_documents is a (tokens, category) pair)
all_words = [
    word
    for tokens, _label in training_documents
    for word in tokens
]


In [108]:
import nltk

In [112]:
freq = nltk.FreqDist(all_words)  ## count how often each word occurs in the training documents
freq

FreqDist({'film': 8512, 'movie': 5142, 'one': 4510, 'make': 3292, 'like': 2996, 'character': 2912, 'get': 2713, 'see': 2380, 'go': 2306, 'time': 2284, ...})

In [113]:
## the 3000 most frequent training words become the feature vocabulary
common = freq.most_common(3000)
## most_common yields (word, count) pairs; keep only the words
features = [word for word, _count in common]

In [114]:
# display the selected feature words
features

['film',
 'movie',
 'one',
 'make',
 'like',
 'character',
 'get',
 'see',
 'go',
 'time',
 'well',
 'scene',
 'even',
 'good',
 'story',
 'take',
 'much',
 'would',
 'give',
 'come',
 'also',
 'life',
 'two',
 'bad',
 'look',
 'way',
 'first',
 'seem',
 'know',
 'end',
 '--',
 'year',
 'work',
 'say',
 'thing',
 'plot',
 'play',
 'really',
 'little',
 'show',
 'people',
 'man',
 'could',
 'try',
 'star',
 'performance',
 'never',
 'love',
 'great',
 'best',
 'new',
 'director',
 'action',
 'big',
 'actor',
 'many',
 'watch',
 'u',
 'want',
 'find',
 'think',
 'role',
 'act',
 'another',
 'back',
 'still',
 'something',
 'audience',
 'world',
 'turn',
 'day',
 'old',
 'set',
 'every',
 'however',
 'feel',
 'part',
 'though',
 'enough',
 'begin',
 'use',
 'cast',
 'comedy',
 'point',
 'guy',
 'around',
 'real',
 'run',
 'last',
 'funny',
 'fact',
 'write',
 'woman',
 'young',
 'interest',
 'name',
 'right',
 'long',
 'almost',
 'place',
 'lot',
 'may',
 'script',
 'minute',
 'actually',

In [115]:
## now, for each document, we want to create a dict that tells us
## whether each feature is present, e.g. 'film': True,
##                                       'movie': False, ...

In [None]:
## function to check whether the text contains each feature (top word) or not

In [120]:
def get_feature_dict(words, feature_words=None):
    """Build a presence/absence feature dict for one document.

    Parameters
    ----------
    words : iterable of str
        Tokens of the document.
    feature_words : iterable of str, optional
        Vocabulary to test for. When omitted, the notebook-level ``features``
        list is used, which is the original behaviour.

    Returns
    -------
    dict
        Maps every feature word to ``True`` if it occurs in ``words`` and
        ``False`` otherwise.
    """
    if feature_words is None:
        # Resolved at call time so the function follows the current `features`.
        feature_words = features
    # A set makes each membership test constant time.
    words_set = set(words)
    return {w: w in words_set for w in feature_words}

In [180]:
# sanity check: feature dict for the tokens of the first training document
output = get_feature_dict(training_documents[0][0])


In [153]:
## for nltk, training & testing data should be in the form of
## a list --> containing (feature dict, category) tuples
## feature dict --> says whether each feature (top word) is present in the text or not [i.e. feature -> feature value]

In [122]:
# (feature dict, category) pairs for the training documents
training_data = [ (get_feature_dict(doc),category) for doc,category in training_documents] 

In [123]:
# (feature dict, category) pairs for the held-out test documents
test_data = [ (get_feature_dict(doc),category) for doc,category in test_documents] 

In [None]:
## using inbuilt naive bayes classifier in nltk

In [127]:
from nltk import NaiveBayesClassifier

In [128]:
# train NLTK's Naive Bayes classifier on the (feature dict, category) pairs
classifier = NaiveBayesClassifier.train(training_data)

In [131]:
# accuracy of the Naive Bayes classifier on the held-out test data
nltk.classify.accuracy(classifier, test_data)

0.782

In [133]:
classifier.show_most_informative_features(15) ## request the 15 most informative features

Most Informative Features
               ludicrous = True              neg : pos    =     12.8 : 1.0
             outstanding = True              pos : neg    =     10.5 : 1.0
                   jolie = True              neg : pos    =      8.9 : 1.0
              schumacher = True              neg : pos    =      8.9 : 1.0
                  sinise = True              neg : pos    =      7.6 : 1.0
               stupidity = True              neg : pos    =      7.2 : 1.0
                  poorly = True              neg : pos    =      6.9 : 1.0
                   anger = True              pos : neg    =      6.5 : 1.0
                   ideal = True              pos : neg    =      6.5 : 1.0
                   damon = True              pos : neg    =      6.5 : 1.0
                lifeless = True              neg : pos    =      6.5 : 1.0
               criticism = True              pos : neg    =      6.2 : 1.0
             beautifully = True              pos : neg    =      6.2 : 1.0

## using Sklearn models with NLTK Training data

In [172]:
from sklearn.tree import DecisionTreeClassifier

In [173]:
# SklearnClassifier is not imported anywhere else in the notebook, so it is
# brought into scope here; without it this cell raises NameError on a fresh
# kernel.
from nltk.classify.scikitlearn import SklearnClassifier

# Wrap a scikit-learn decision tree so it can be trained on the NLTK-style
# (feature dict, category) data built above.
dt = DecisionTreeClassifier()
classifier_sklearn = SklearnClassifier(dt)

In [174]:
# fit the wrapped decision tree on the training data
classifier_sklearn.train(training_data)

<SklearnClassifier(DecisionTreeClassifier())>

In [175]:
# accuracy of the decision tree on the held-out test data
nltk.classify.accuracy(classifier_sklearn, test_data)

0.594

In [168]:
from sklearn.ensemble import RandomForestClassifier

In [169]:
# SklearnClassifier is not imported anywhere else in the notebook, so it is
# brought into scope here; without it this cell raises NameError on a fresh
# kernel.
from nltk.classify.scikitlearn import SklearnClassifier

# Wrap a scikit-learn random forest so it can be trained on the NLTK-style
# (feature dict, category) data built above.
rf = RandomForestClassifier()
classifier_sklearn2 = SklearnClassifier(rf)

In [170]:
# fit the wrapped random forest on the training data
classifier_sklearn2.train(training_data)

<SklearnClassifier(RandomForestClassifier())>

In [171]:
# accuracy of the random forest on the held-out test data
nltk.classify.accuracy(classifier_sklearn2, test_data)

0.814