### Importing Files

In [35]:
import nltk
import random
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize

### The Vote Classifier Class 

In [38]:
nltk.usage(nltk.classify.ClassifierI)

ClassifierI supports the following operations:
  - self.classify(featureset)
  - self.classify_many(featuresets)
  - self.labels()
  - self.prob_classify(featureset)
  - self.prob_classify_many(featuresets)


Each time a classifier selects a category for a document, it inserts the **category into an array**. That is its vote. So you end up with an array of 'votes' (vote can be positive or negative).

Now for example, if there are 10 classifiers and 7 voted 'pos', while 3 voted 'neg'. Then it uses mode of the array which brings back the most frequently occurring value in the array ('pos' in this example). Then it finds the number of times that value 'pos' occurs in the array. It divides that number of occurences number of pos (which is 7) by the total (which is 10) to give 70% as the confidence. Because 70% of the classifiers voted (classified) the document that way.

In [36]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

### Getting Data and Feature Engineering 

In the following code we have done below mentioned operations
- Read from the files.
- appended reviews to documents list along with its **classified category**.
- **Tokenized** words sequentially line by line using `pos_tag` to append words that are only **adjective**.
- Saved our document using pickle.
- Created a **frequency distribution** of appended words to mark out the 5000 most occured words to be used for training.
- Defined a function that takes document as argument and returns a tuple with each word against a boolen value for being in most occured or word_features list.
- Created a featuresets list that has data to be trained in the format created by above points.
- Shuffled and Splitted the data into training and testing set.

In [37]:
#open the files
short_pos = open("positive.txt","r").read()
short_neg = open("negative.txt","r").read()


all_words = []
documents = []


#  j is adject, r is adverb, and v is verb

allowed_word_types = ["J"]

for p in short_pos.split('\n'):
    documents.append( (p, "pos") )
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

    
for p in short_neg.split('\n'):
    documents.append( (p, "neg") )
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())



save_documents = open("pickled_algos/pickled_documents.pickle","wb")
pickle.dump(documents, save_documents)
save_documents.close()


all_words = nltk.FreqDist(all_words)


word_features = list(all_words.keys())[:5000]


save_word_features = open("pickled_algos/word_features5k.pickle","wb")
pickle.dump(word_features, save_word_features)
save_word_features.close()


def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

featuresets = [(find_features(rev), category) for (rev, category) in documents]

random.shuffle(featuresets)
print(len(featuresets))

testing_set = featuresets[10000:]
training_set = featuresets[:10000]


10664


### Various Classifiers

To train our data using **Sklearn models** we have to convert our the model that sklearn model provides that is somewhat acceptable to operatins done using **nltk** library and therefore each and every model that we have imported using sklearn module has been fitted to a method `SklearnClassifier()` that is provided and can be imported using nltk module.  

It is appreciable to store your models using **pickle** to avoid training multiple times

In [39]:
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

###############
save_classifier = open("pickled_algos/originalnaivebayes5k.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

save_classifier = open("pickled_algos/MNB_classifier5k.pickle","wb")
pickle.dump(MNB_classifier, save_classifier)
save_classifier.close()

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

save_classifier = open("pickled_algos/BernoulliNB_classifier5k.pickle","wb")
pickle.dump(BernoulliNB_classifier, save_classifier)
save_classifier.close()

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

save_classifier = open("pickled_algos/LogisticRegression_classifier5k.pickle","wb")
pickle.dump(LogisticRegression_classifier, save_classifier)
save_classifier.close()


LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

save_classifier = open("pickled_algos/LinearSVC_classifier5k.pickle","wb")
pickle.dump(LinearSVC_classifier, save_classifier)
save_classifier.close()

SGDC_classifier = SklearnClassifier(SGDClassifier())
SGDC_classifier.train(training_set)
print("SGDClassifier accuracy percent:",nltk.classify.accuracy(SGDC_classifier, testing_set)*100)

save_classifier = open("pickled_algos/SGDC_classifier5k.pickle","wb")
pickle.dump(SGDC_classifier, save_classifier)
save_classifier.close()

Original Naive Bayes Algo accuracy percent: 73.19277108433735
Most Informative Features
              engrossing = True              pos : neg    =     20.4 : 1.0
               inventive = True              pos : neg    =     15.7 : 1.0
                 generic = True              neg : pos    =     15.6 : 1.0
                mediocre = True              neg : pos    =     15.6 : 1.0
                 routine = True              neg : pos    =     15.6 : 1.0
                    flat = True              neg : pos    =     14.6 : 1.0
              refreshing = True              pos : neg    =     14.4 : 1.0
                    warm = True              pos : neg    =     13.0 : 1.0
                  boring = True              neg : pos    =     12.7 : 1.0
                mindless = True              neg : pos    =     11.6 : 1.0
             mesmerizing = True              pos : neg    =     11.0 : 1.0
               wonderful = True              pos : neg    =     11.0 : 1.0
            

### Create the Voted Classifier

Based upon all the above mentioned classifiers we are going to create a vote classifier that takes in the **category** categorised by vote of majority of classifiers against minority.

In [40]:
voted_classifier = VoteClassifier(
                                  classifier,
                                  LinearSVC_classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier)

In [47]:
nltk.classify.accuracy(voted_classifier,testing_set)*100 # accuracy of voted_classifier

73.04216867469879

### Custom Sentiment of a movie review 

A function that takes in a single review to classify it as **positive** or **negative** using 5 different classifiers out of which the one's in fovour of majority are selected.

In [41]:
def sentiment(text):
    feats = find_features(text)
    print('Classifiers in favour of',voted_classifier.classify(feats),':',voted_classifier.confidence(feats)*5)
    return voted_classifier.classify(feats),voted_classifier.confidence(feats)*100

### Predict Sentiment of movie reviews

In [42]:
sentiment('This movie is awesome.')

Classifiers in favour of pos : 5.0


('pos', 100.0)

In [43]:
sentiment('This movie is crap.')

Classifiers in favour of neg : 5.0


('neg', 100.0)

In [44]:
sentiment('Acting was not upto the mark')

Classifiers in favour of neg : 5.0


('neg', 100.0)

In [45]:
sentiment("This movie was utter junk. There were absolutely 0 pythons. I don't see what the point was at all. Horrible movie, 0/10")

Classifiers in favour of neg : 5.0


('neg', 100.0)

In [46]:
sentiment('moderately good')

Classifiers in favour of pos : 3.0


('pos', 60.0)