# Text Classification for Sentiment Analysis – Naive Bayes Classifier

In [8]:
from nltk.corpus import movie_reviews as mr
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import string
stop = stopwords.words('english')
all_words = FreqDist(w.lower() for w in mr.words() if w.lower() not in stop and w.lower() not in string.punctuation)
print all_words

<FreqDist with 39610 samples and 719237 outcomes>


In [13]:
import string
from itertools import chain

In [14]:
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
import nltk

In [15]:
stop = stopwords.words('english')
documents = [([w for w in mr.words(i) if w.lower() not in stop and w.lower() not in string.punctuation], i.split('/')[0]) for i in mr.fileids()]

In [16]:
word_features = FreqDist(chain(*[i for i,j in documents]))
word_features = word_features.keys()[:100]

In [17]:
numtrain = int(len(documents) * 90 / 100)
train_set = [({i:(i in tokens) for i in word_features}, tag) for tokens,tag in documents[:numtrain]]
test_set = [({i:(i in tokens) for i in word_features}, tag) for tokens,tag in documents[numtrain:]]

In [21]:
classifier = nbc.train(train_set)
print nltk.classify.accuracy(classifier, test_set)
classifier.show_most_informative_features(10)

0.31
Most Informative Features
               uplifting = True              pos : neg    =      5.9 : 1.0
               wednesday = True              pos : neg    =      3.7 : 1.0
             controversy = True              pos : neg    =      3.4 : 1.0
                  shocks = True              pos : neg    =      3.0 : 1.0
                  catchy = True              pos : neg    =      2.6 : 1.0
           appropriately = True              pos : neg    =      2.5 : 1.0
                   askew = True              pos : neg    =      2.3 : 1.0
                bringing = True              pos : neg    =      2.1 : 1.0
              unsinkable = True              pos : neg    =      2.1 : 1.0
                 spiders = True              pos : neg    =      2.1 : 1.0
              projection = True              pos : neg    =      2.1 : 1.0
                  crotch = True              neg : pos    =      1.9 : 1.0
               francesca = True              neg : pos    =      1.9 

NameError: name 'plot' is not defined

# Most Frequent words

In [23]:
import nltk
import random
from nltk.corpus import movie_reviews

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

#print(documents[1])

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15))

[(u',', 77717), (u'the', 76529), (u'.', 65876), (u'a', 38106), (u'and', 35576), (u'of', 34123), (u'to', 31937), (u"'", 30585), (u'is', 25195), (u'in', 21822), (u's', 18513), (u'"', 17612), (u'it', 16107), (u'that', 15924), (u'-', 15595)]


# Sentiment Analysis – Naive Bayes Classifier

In [25]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

In [27]:
def word_feats(words):
    return dict([(word, True) for word in words])

In [28]:
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')
 

In [30]:
negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

In [31]:
negcutoff = len(negfeats)*3/4
poscutoff = len(posfeats)*3/4

In [33]:
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))

train on 1500 instances, test on 500 instances


In [35]:
classifier = NaiveBayesClassifier.train(trainfeats)
print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
classifier.show_most_informative_features()

accuracy: 0.728
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0


# Text Classification for Sentiment Analysis – Stopwords and Collocations

In [58]:
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures


In [59]:
def evaluate_classifier(featx):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

In [60]:
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])

In [None]:
evaluate_classifiers(bigram_word_feats)