In [80]:
import pickle
import random
from collections import Counter
from statistics import mode

import nltk
from nltk.classify import ClassifierI
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import SVC, LinearSVC, NuSVC

In [81]:
# Load the pickled `documents` object. The context manager closes the file
# even when unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only load pickle files
# from a trusted source.
with open("pickled_algos/documents.pickle", "rb") as documents_f:
    documents = pickle.load(documents_f)

In [82]:
# Load the feature vocabulary that find_features() iterates over. The context
# manager closes the file even when unpickling raises.
with open("pickled_algos/word_features5k.pickle", "rb") as word_features5k_f:
    word_features = pickle.load(word_features5k_f)

In [83]:
def find_features(document):
    """Map a raw text to a ``{word: bool}`` presence dict over ``word_features``.

    Args:
        document: Text string; it is split into tokens with ``word_tokenize``.

    Returns:
        dict with one key per entry of the module-level ``word_features``; the
        value is True when that entry appears among the document's tokens.
    """
    # A set gives constant-time membership tests; with a list, each of the
    # word_features lookups would rescan every token of the document.
    tokens = set(word_tokenize(document))
    return {w: (w in tokens) for w in word_features}

In [84]:
# Load the precomputed feature sets that are split into train/test below. The
# context manager closes the file even when unpickling raises.
with open("pickled_algos/featuresets.pickle", "rb") as featuresets_f:
    featuresets = pickle.load(featuresets_f)

In [85]:
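# random.shuffle(featuresets)  # NOTE(review): left disabled, presumably so the fixed split below matches the data the pickled classifiers were trained on; confirm before re-enabling.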

In [86]:
# First 10000 feature sets go to training_set; everything after that index
# goes to testing_set.
training_set, testing_set = featuresets[:10000], featuresets[10000:]

## Naive Bayes

### loading Pickle 

In [87]:
# Load the pickled Naive Bayes classifier. The context manager closes the file
# even when unpickling raises.
with open("pickled_algos/naive_bayes.pickle", "rb") as open_file:
    classifier = pickle.load(open_file)

In [88]:
# Accuracy of the loaded classifier on testing_set, scaled to a percentage.
print(
    "Original Naive Bayes Algo Accuracy Percent: ",
    nltk.classify.accuracy(classifier, testing_set) * 100,
)

Original Naive Bayes Algo Accuracy Percent:  72.9607250755287


In [89]:
# Ask the loaded classifier to print its most informative features (15 requested).
classifier.show_most_informative_features(15)

Most Informative Features
              engrossing = True              pos : neg    =     20.9 : 1.0
               wonderful = True              pos : neg    =     20.2 : 1.0
                provides = True              pos : neg    =     17.6 : 1.0
               inventive = True              pos : neg    =     15.6 : 1.0
              refreshing = True              pos : neg    =     13.6 : 1.0
                    warm = True              pos : neg    =     12.9 : 1.0
             mesmerizing = True              pos : neg    =     11.6 : 1.0
            refreshingly = True              pos : neg    =     11.6 : 1.0
                  beauty = True              pos : neg    =     10.9 : 1.0
                captures = True              pos : neg    =     10.9 : 1.0
               realistic = True              pos : neg    =     10.9 : 1.0
                delicate = True              pos : neg    =     10.3 : 1.0
                    ages = True              pos : neg    =     10.3 : 1.0

## Multinomial Naive Bayes (MNB)

### loading Pickle

In [90]:
# Load the pickled MNB classifier. The context manager closes the file even
# when unpickling raises.
with open("pickled_algos/MNB.pickle", "rb") as open_file:
    MNB_classifier = pickle.load(open_file)

In [91]:
# Accuracy of the loaded classifier on testing_set, scaled to a percentage.
print(
    "MNB Algo Accuracy Percent: ",
    nltk.classify.accuracy(MNB_classifier, testing_set) * 100,
)

MNB Algo Accuracy Percent:  68.12688821752266


## Bernoulli Naive Bayes

### loading Pickle

In [92]:
# Load the pickled Bernoulli classifier. The context manager closes the file
# even when unpickling raises.
with open("pickled_algos/Bernoulli.pickle", "rb") as open_file:
    Bernoulli_classifier = pickle.load(open_file)

In [93]:
# Accuracy of the loaded classifier on testing_set, scaled to a percentage.
print(
    "Bernoulli Algo Accuracy Percent: ",
    nltk.classify.accuracy(Bernoulli_classifier, testing_set) * 100,
)

Bernoulli Algo Accuracy Percent:  68.12688821752266


## Logistic Regression

### loading Pickle

In [94]:
# Load the pickled Logistic classifier. The context manager closes the file
# even when unpickling raises.
with open("pickled_algos/Logistic.pickle", "rb") as open_file:
    Logistic_classifier = pickle.load(open_file)

In [95]:
# Accuracy of the loaded classifier on testing_set, scaled to a percentage.
print(
    "Logistic Algo Accuracy Percent: ",
    nltk.classify.accuracy(Logistic_classifier, testing_set) * 100,
)

Logistic Algo Accuracy Percent:  72.50755287009063


## SGD

### loading Pickle

In [96]:
# Load the pickled SGD classifier. The context manager closes the file even
# when unpickling raises.
with open("pickled_algos/SGD.pickle", "rb") as open_file:
    SGD_classifier = pickle.load(open_file)

In [97]:
# Accuracy of the loaded classifier on testing_set, scaled to a percentage.
print(
    "SGD classifier accuracy percent: ",
    nltk.classify.accuracy(SGD_classifier, testing_set) * 100,
)

SGD classifier accuracy percent:  70.99697885196375


## SVC

### loading Pickle

In [98]:
# Load the pickled SVC classifier. The context manager closes the file even
# when unpickling raises.
with open("pickled_algos/SVC.pickle", "rb") as open_file:
    SVC_classifier = pickle.load(open_file)

In [99]:
# Accuracy of the loaded classifier on testing_set, scaled to a percentage.
print(
    "SVC Classifier accuracy percent: ",
    nltk.classify.accuracy(SVC_classifier, testing_set) * 100,
)

SVC Classifier accuracy percent:  50.30211480362537


## Linear SVC

### loading Pickle

In [100]:
# Load the pickled Linear SVC classifier. The context manager closes the file
# even when unpickling raises.
with open("pickled_algos/Linear_SVC.pickle", "rb") as open_file:
    Linear_SVC_classifier = pickle.load(open_file)

In [101]:
# Accuracy of the loaded classifier on testing_set, scaled to a percentage.
print(
    "Linear SVC Classifier accuracy percent: ",
    nltk.classify.accuracy(Linear_SVC_classifier, testing_set) * 100,
)

Linear SVC Classifier accuracy percent:  72.50755287009063


## NuSVC

### loading Pickle

In [102]:
# Load the pickled Nu SVC classifier. The context manager closes the file even
# when unpickling raises.
with open("pickled_algos/Nu_SVC.pickle", "rb") as open_file:
    Nu_SVC_classifier = pickle.load(open_file)

In [103]:
# Accuracy of the loaded classifier on testing_set, scaled to a percentage.
print(
    "Nu SVC Classifier accuracy percent: ",
    nltk.classify.accuracy(Nu_SVC_classifier, testing_set) * 100,
)

Nu SVC Classifier accuracy percent:  71.6012084592145


## Combining all models

In [104]:
class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over a fixed group of classifiers.

    Every wrapped object must provide ``classify(features)``. The ensemble's
    label is the one predicted by the most members; when several labels tie,
    the label that was voted first (in constructor order) wins.
    """

    def __init__(self, *classifiers):
        """Store the classifiers that will vote, in the order given."""
        self._classifiers = classifiers

    def _tally(self, features):
        """Return a Counter mapping each predicted label to its vote count.

        Raises:
            ValueError: if the ensemble was built without any classifier.
        """
        if not self._classifiers:
            raise ValueError("VoteClassifier has no classifiers to vote with")
        return Counter(c.classify(features) for c in self._classifiers)

    def classify(self, features):
        """Return the label chosen by the largest number of classifiers."""
        # most_common() orders equal counts by first appearance, so ties are
        # resolved deterministically instead of depending on how
        # statistics.mode treats multimodal data in the running Python version.
        winner, _ = self._tally(features).most_common(1)[0]
        return winner

    def confidence(self, features):
        """Return the fraction of classifiers that voted for the winning label."""
        tally = self._tally(features)
        _, winning_votes = tally.most_common(1)[0]
        return winning_votes / sum(tally.values())

In [105]:
# Build the voting ensemble from classifiers loaded earlier in this notebook.
# The argument names must match the variables bound by the pickle-loading
# cells (Linear_SVC_classifier, Bernoulli_classifier, Logistic_classifier);
# the previous spellings were never defined here, so the cell only ran thanks
# to leftover kernel state and failed with NameError on a fresh kernel.
voted_classifier = VoteClassifier(
    classifier,
    Linear_SVC_classifier,
    MNB_classifier,
    Bernoulli_classifier,
    Logistic_classifier,
)

In [106]:
def sentiment(text):
    """Classify ``text`` with the module-level ``voted_classifier``.

    Returns:
        A ``(label, confidence)`` tuple: the ensemble's chosen label and the
        value reported by ``voted_classifier.confidence`` for the same features.
    """
    extracted = find_features(text)
    label = voted_classifier.classify(extracted)
    agreement = voted_classifier.confidence(extracted)
    return label, agreement

In [107]:
# Try the ensemble on a clearly positive review.
sentiment(
    "This movie was awesome! The acting was great, plot was wonderful, and there were pythons...so yea!"
)

('pos', 1.0)

In [108]:
# Try the ensemble on a clearly negative review.
sentiment(
    "This movie was utter junk. There were absolutely 0 pythons. I don't see what the point was at all. Horrible movie, 0/10"
)

('neg', 1.0)