'''
Lab 9 - Sentiment Classification

In this script, we will build a Naive Bayes classification model for
predicting sentiment (positive or negative).

Dataset: The NLTK package comes with sample datasets. We use the "movie reviews" dataset.
'''

# In [None]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
import collections
from collections import defaultdict
import numpy as np
from nltk.corpus import stopwords
import pandas as pd

# In [None]:
# Define 80%/20% split for Training/Testing.
# SPLIT is the fraction of the data used for training; the remainder is
# held out for testing.
SPLIT = 0.8

# Stop Words filtering: NLTK's English stop-word list, kept as a set so the
# per-word membership test in word_feats is a hash lookup.
stopset = set(stopwords.words('english'))

# In [None]:
# Let's have a look at the IMDB dataset.
# NOTE: this prints the corpus object itself (its string representation),
# not the text of any review.
print(movie_reviews)

# In [None]:
def word_feats(words):
    """Build a bag-of-words feature dict for one document.

    Every word in *words* that is not in the module-level ``stopset`` becomes
    a key mapped to ``True``. The result is a ``defaultdict`` whose default
    value is ``False``, so looking up a word that was not seen yields
    ``False`` (and, as with any defaultdict, inserts that key).
    """
    # Keep only the non-stop-words; duplicates collapse onto a single key.
    present = {token: True for token in words if token not in stopset}
    return defaultdict(lambda: False, present)


def evaluate_classifier():
    """Train and evaluate a Naive Bayes sentiment classifier.

    Steps:
      1. Build one bag-of-words feature set per review of the NLTK
         ``movie_reviews`` corpus using ``word_feats``, labelled 'neg'/'pos'.
      2. Split each class into a training and a testing portion according
         to ``SPLIT``.
      3. Train ``NaiveBayesClassifier`` on the training portion.
      4. Print the dataset sizes, the accuracy on the testing portion, the
         most informative features and a 2x2 confusion matrix.

    Returns:
        None. All results are printed to stdout.
    """
    # Remember "Bag of Words"?
    # Generate one labelled bag-of-words feature set per review file.
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg')
                for f in movie_reviews.fileids('neg')]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos')
                for f in movie_reviews.fileids('pos')]

    # Each class gets its own cutoff so the SPLIT ratio holds even when the
    # two classes do not contain the same number of reviews.
    neg_cutoff = int(len(negfeats) * SPLIT)
    pos_cutoff = int(len(posfeats) * SPLIT)
    neg_train, neg_test = negfeats[:neg_cutoff], negfeats[neg_cutoff:]
    pos_train, pos_test = posfeats[:pos_cutoff], posfeats[pos_cutoff:]
    trainfeats = neg_train + pos_train
    testfeats = neg_test + pos_test

    # How many data points are in the TRAINING and TESTING datasets?
    print('Train on %d instances\nTest on %d instances' % (len(trainfeats), len(testfeats)))

    ##### Feature preparation is done; build the Naive Bayes classifier #####
    # Train on the TRAINING dataset, then score on the TESTING dataset.
    classifier = NaiveBayesClassifier.train(trainfeats)
    print('Accuracy:', nltk.classify.util.accuracy(classifier, testfeats))

    # Which features were most informative?
    classifier.show_most_informative_features()

    # Confusion matrix: count predicted labels separately for the reviews
    # whose actual label is 'pos' and for those whose actual label is 'neg'.
    # A Counter yields 0 for a label that was never predicted.
    pos_pred = collections.Counter(classifier.classify(fs) for fs, _ in pos_test)
    neg_pred = collections.Counter(classifier.classify(fs) for fs, _ in neg_test)

    # Rows are the actual class (pos, then neg); columns the predicted class.
    print('Confusion Matrix')
    print('\t\t', 'Predicted Class')
    print('-'*40)

    print('|\t %d (TP) \t|\t %d (FN) \t| Actual class' % (pos_pred['pos'], pos_pred['neg']))
    print('|\t %d (FP) \t|\t %d (TN) \t|' % (neg_pred['pos'], neg_pred['neg']))

    print('-'*40)

# In [None]:
################# MAIN #################
# Run the whole pipeline: build features, train the classifier and print
# the evaluation results.
evaluate_classifier()