## NLP on Movie_Reviews Dataset

In [1]:
from nltk.corpus import movie_reviews

In [2]:
movie_reviews.categories()

['neg', 'pos']

In [3]:
print(len(movie_reviews.fileids('pos')))
print(len(movie_reviews.fileids('neg')))

1000
1000


In [4]:
movie_reviews.words(movie_reviews.fileids('pos')[5])

['on', 'june', '30', ',', '1960', ',', 'a', 'self', ...]

In [5]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
documents[:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [6]:
import random
random.shuffle(documents)
documents[:5]

[(['steve', 'martin', 'is', 'one', 'of', 'the', ...], 'neg'),
 (['all', 'right', ',', 'all', 'right', ',', 'we', 'get', ...], 'neg'),
 (['oh', 'god', 'how', 'many', 'john', 'grisham', ...], 'pos'),
 (['"', 'i', 'know', 'what', 'you', 'did', 'last', ...], 'pos'),
 (['"', 'marie', 'couldn', "'", 't', 'talk', ',', '"', ...], 'neg')]

In [7]:
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
lemmatizer = WordNetLemmatizer()
from nltk.corpus import stopwords
import string
stop_words = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stop_words.update(punctuations)
print(stop_words, string.punctuation)

{'this', 'these', 'having', '{', 'themselves', 's', 'out', 'up', 'until', 'hasn', "shan't", 'being', 'i', 'was', 'were', 'shouldn', ',', "didn't", "mightn't", 'me', 'they', 'him', "needn't", 'just', 'with', "you'll", 'after', 'why', 'himself', '-', 'd', 'not', '>', 'further', 'between', "couldn't", 'through', 'needn', 'against', 'during', 'now', 'doesn', 'yourselves', ':', "wouldn't", 'there', "hadn't", 'the', 'own', 'm', 'be', "won't", '/', '+', 'more', 'no', 'at', "it's", 'in', 'about', 'off', 'she', 'do', 'couldn', 'didn', 'my', 'if', 'your', "you'd", 'its', 'or', 'most', 'nor', "you're", 'does', 'weren', 'each', 'isn', 'because', '\\', 'it', 'which', 'again', 'y', 'whom', 'under', 'been', 'won', '[', '}', '&', 'hers', 'how', 'below', 'when', "doesn't", "shouldn't", 'any', '_', "mustn't", 'wouldn', 'herself', 'those', 'theirs', 'all', 'wasn', 'of', 'them', "that'll", 'don', 'ma', 'hadn', 'so', "hasn't", 'only', 'what', 'are', '^', 'other', 'same', 'our', 'that', 'such', "'", 'her', 

In [8]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [9]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stop_words:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [10]:
documents = [(clean_review(document), category) for document, category in documents]

In [11]:
print(documents[0])

(['steve', 'martin', 'one', 'funniest', 'men', 'alive', 'take', 'true', 'statement', 'disappointment', 'film', 'equal', 'mine', 'martin', 'hilarious', 'create', 'best', 'laugh', 'loud', 'experience', 'ever', 'take', 'place', 'movie', 'theater', 'find', 'old', 'television', 'series', 'base', 'moment', 'humor', 'wit', 'bilko', 'name', 'accident', 'head', 'army', 'motor', 'pool', 'group', 'passion', 'scheme', 'every', 'episode', 'involves', 'sergeant', 'men', 'one', 'another', 'hair', 'brain', 'plan', 'get', 'rich', 'quick', 'outwit', 'officer', 'base', 'mchale', 'navy', 'granddaddy', 'idea', 'behind', 'movie', 'difference', 'far', 'fetch', 'usually', 'goofy', 'television', 'series', 'funny', 'one', 'laugh', 'film', 'make', 'retains', 'goofiness', 'entertainment', 'everything', 'clean', 'obviously', 'make', 'hollywood', 'back', 'lot', 'look', 'every', 'bit', 'like', 'look', 'brand', 'new', 'even', 'old', 'beat', 'stuff', 'martin', 'remarkably', 'small', 'big', 'life', 'role', 'original', 

In [12]:
training_documents = documents[0:1500]
testing_documents = documents[1500:]

In [13]:
all_words = []
for doc in training_documents:
    all_words += doc[0]

In [14]:
import nltk

In [15]:
freq = nltk.FreqDist(all_words) # Frequency Distribution i.e the freq of occurance of a particular word
common = freq.most_common(3000) # The most frequent words also referred as most_common(3000)
features = [i[0] for i in common]

In [16]:
def get_feature_dict(words):
    current_features = {}
    words_set = set(words)
    for w in features:
        current_features[w] = w in words_set
    return current_features

In [17]:
get_feature_dict(training_documents[0][0])

{'film': True,
 'movie': True,
 'one': True,
 'make': True,
 'like': True,
 'character': True,
 'get': True,
 'see': False,
 'time': False,
 'go': False,
 'well': False,
 'scene': False,
 'even': True,
 'good': False,
 'story': False,
 'take': True,
 'would': False,
 'much': True,
 'also': False,
 'two': False,
 'come': False,
 'give': False,
 'life': True,
 'seem': False,
 'way': False,
 'bad': False,
 'look': True,
 '--': False,
 'know': False,
 'end': False,
 'first': False,
 'year': False,
 'work': False,
 'thing': False,
 'plot': False,
 'say': True,
 'play': False,
 'really': False,
 'people': False,
 'little': False,
 'show': False,
 'love': False,
 'could': False,
 'man': False,
 'try': False,
 'star': False,
 'never': False,
 'great': False,
 'performance': False,
 'best': True,
 'director': False,
 'new': True,
 'big': True,
 'action': False,
 'many': False,
 'actor': False,
 'want': False,
 'u': False,
 'watch': False,
 'find': True,
 'think': False,
 'role': True,
 'act': F

In [18]:
training_data = [(get_feature_dict(doc), category) for doc, category in training_documents]
testing_data = [(get_feature_dict(doc), category) for doc, category in testing_documents]

In [19]:
training_data[0]

({'film': True,
  'movie': True,
  'one': True,
  'make': True,
  'like': True,
  'character': True,
  'get': True,
  'see': False,
  'time': False,
  'go': False,
  'well': False,
  'scene': False,
  'even': True,
  'good': False,
  'story': False,
  'take': True,
  'would': False,
  'much': True,
  'also': False,
  'two': False,
  'come': False,
  'give': False,
  'life': True,
  'seem': False,
  'way': False,
  'bad': False,
  'look': True,
  '--': False,
  'know': False,
  'end': False,
  'first': False,
  'year': False,
  'work': False,
  'thing': False,
  'plot': False,
  'say': True,
  'play': False,
  'really': False,
  'people': False,
  'little': False,
  'show': False,
  'love': False,
  'could': False,
  'man': False,
  'try': False,
  'star': False,
  'never': False,
  'great': False,
  'performance': False,
  'best': True,
  'director': False,
  'new': True,
  'big': True,
  'action': False,
  'many': False,
  'actor': False,
  'want': False,
  'u': False,
  'watch': Fals

In [20]:
from nltk import NaiveBayesClassifier

In [21]:
classifier = NaiveBayesClassifier.train(training_data)

In [22]:
nltk.classify.accuracy(classifier, testing_data)

0.824

In [23]:
classifier.show_most_informative_features(15)

Most Informative Features
             outstanding = True              pos : neg    =     10.6 : 1.0
              schumacher = True              neg : pos    =     10.3 : 1.0
                   anger = True              pos : neg    =      9.4 : 1.0
                  seagal = True              neg : pos    =      8.9 : 1.0
                  turkey = True              neg : pos    =      7.8 : 1.0
            respectively = True              pos : neg    =      7.6 : 1.0
                 idiotic = True              neg : pos    =      6.9 : 1.0
                   damon = True              pos : neg    =      6.8 : 1.0
              uninspired = True              neg : pos    =      6.7 : 1.0
                  castle = True              pos : neg    =      6.7 : 1.0
                     gem = True              pos : neg    =      6.6 : 1.0
             wonderfully = True              pos : neg    =      6.0 : 1.0
                   awful = True              neg : pos    =      5.9 : 1.0

In [24]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

In [25]:
svc = SVC()
classifier_sklearn = SklearnClassifier(svc)

In [26]:
classifier_sklearn.train(training_data)

<SklearnClassifier(SVC())>

In [27]:
nltk.classify.accuracy(classifier_sklearn, testing_data)

0.866

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
rfc = RandomForestClassifier()
classifier_sklearn1 = SklearnClassifier(rfc)

In [30]:
classifier_sklearn1.train(training_data)

<SklearnClassifier(RandomForestClassifier())>

In [31]:
nltk.classify.accuracy(classifier_sklearn1, testing_data)

0.832