## Combining Algos with a Vote

In [32]:
import nltk
import random
from nltk.corpus import movie_reviews

from nltk.classify.scikitlearn import SklearnClassifier

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI
from statistics import mode


Class for voting which algo has higher accuracy, and confidence in the given answer.

In [33]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers) -> None:
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)
    
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

In [34]:
documents = [(list(movie_reviews.words(fileid)), category)
                for category in movie_reviews.categories()
                for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

### Frequency Distribution of words

In [35]:
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:3000]

Extracting features from words

In [36]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

In [37]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

### Training Set and Testing Sets

In [38]:
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

OG Naive Bayes classifier

In [39]:
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("OG Naive Bayes algorithm accuracy percent: ", (nltk.classify.accuracy(classifier, testing_set)) * 100)
classifier.show_most_informative_features(15)

OG Naive Bayes algorithm accuracy percent:  87.0
Most Informative Features
               atrocious = True              neg : pos    =     11.1 : 1.0
                   sucks = True              neg : pos    =     10.3 : 1.0
                  annual = True              pos : neg    =      8.9 : 1.0
           unimaginative = True              neg : pos    =      8.4 : 1.0
                 frances = True              pos : neg    =      7.6 : 1.0
                   groan = True              neg : pos    =      7.1 : 1.0
                    mena = True              neg : pos    =      7.1 : 1.0
              schumacher = True              neg : pos    =      7.1 : 1.0
                  shoddy = True              neg : pos    =      7.1 : 1.0
             silverstone = True              neg : pos    =      7.1 : 1.0
                  suvari = True              neg : pos    =      7.1 : 1.0
                 idiotic = True              neg : pos    =      6.4 : 1.0
                  kombat 

1. MultinomialNB

In [40]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)

print("MNB_Classifier accuracy percent: ", (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100)

MNB_Classifier accuracy percent:  88.0


2. BernoulliNB

In [41]:
BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)

print("BNB_classifier accuracy percent: ", (nltk.classify.accuracy(BNB_classifier, testing_set)) * 100)

BNB_classifier accuracy percent:  87.0


### Classifiers other than Naive Bayes

3. Logistic Regression

In [42]:
LogisticRegression_classifier = SklearnClassifier(LogisticRegression(max_iter=10000))
LogisticRegression_classifier.train(training_set)

print("LogisticRegression_Classifier accuracy percent: ", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) * 100)

LogisticRegression_Classifier accuracy percent:  79.0


4. SGDClassifier - Stochastic Gradient Descent learning

In [43]:
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)

print("SGDClassifier_classifier accuracy percent: ", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set)) * 100)

SGDClassifier_classifier accuracy percent:  81.0


5. LinearSVC

In [44]:
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)

print("LinearSVC_classifier accuracy percent: ", (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100)

LinearSVC_classifier accuracy percent:  80.0


6. NuSVC 

In [45]:
NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)

print("NuSVC_classifier accuracy percent: ", (nltk.classify.accuracy(NuSVC_classifier, testing_set)) * 100)

NuSVC_classifier accuracy percent:  80.0


Checks algo and confidence level.

In [46]:
voted_classifier = VoteClassifier(classifier, MNB_classifier, LogisticRegression_classifier, SGDClassifier_classifier, NuSVC_classifier, LinearSVC_classifier)
print("voted_classifier accuracy percent: ", (nltk.classify.accuracy(voted_classifier, testing_set)) * 100)
print("Classification: ", voted_classifier.classify(testing_set[0][0]), "\nConfidence % :", voted_classifier.confidence(testing_set[0][0])*100)
print("Classification: ", voted_classifier.classify(testing_set[1][0]), "\nConfidence % :", voted_classifier.confidence(testing_set[1][0])*100)
print("Classification: ", voted_classifier.classify(testing_set[2][0]), "\nConfidence % :", voted_classifier.confidence(testing_set[2][0])*100)
print("Classification: ", voted_classifier.classify(testing_set[3][0]), "\nConfidence % :", voted_classifier.confidence(testing_set[3][0])*100)
print("Classification: ", voted_classifier.classify(testing_set[4][0]), "\nConfidence % :", voted_classifier.confidence(testing_set[4][0])*100)
print("Classification: ", voted_classifier.classify(testing_set[5][0]), "\nConfidence % :", voted_classifier.confidence(testing_set[5][0])*100)

voted_classifier accuracy percent:  83.0
Classification:  pos 
Confidence % : 50.0
Classification:  neg 
Confidence % : 100.0
Classification:  neg 
Confidence % : 100.0
Classification:  neg 
Confidence % : 66.66666666666666
Classification:  neg 
Confidence % : 100.0
Classification:  neg 
Confidence % : 100.0
