In [1]:
import nltk
import random
import pickle
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC,LinearSVC,NuSVC

In [2]:
pos_reviews = open("dataset/positive.txt","r").read()
neg_reviews = open("dataset/negative.txt","r").read()

In [3]:
documents = []
all_words = []

In [4]:
#  j is adject, r is adverb, and v is verb
#allowed_word_types = ["J","R","V"]
allowed_word_types = ["J"]

for p in pos_reviews.split('\n'):
    documents.append( (p, "pos") )
    words = nltk.word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

    
for n in neg_reviews.split('\n'):
    documents.append( (n, "neg") )
    words = nltk.word_tokenize(n)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())


### Saving documents

In [5]:
save_documents = open("pickled_algos/documents.pickle","wb")
pickle.dump(documents,save_documents)
save_documents.close()

In [6]:
all_words = nltk.FreqDist(all_words)

In [7]:
#Total no of review sentences
len(documents)

10662

In [8]:
# Total No of Words
len(all_words.keys())

6185

In [9]:
#Taking first 5000 words
word_features = list(all_words.keys())[:5000] 

### Save word_features

In [10]:
save_word_features = open("pickled_algos/word_features5k.pickle","wb")
pickle.dump(word_features, save_word_features)
save_word_features.close()

In [11]:
def find_features(document,word_features):
    words = set(nltk.word_tokenize(document))
    
    features = {}
    for w in word_features:
        features[w] = (w in words)  # Create mapping with True or False if word present -> true
    return features

In [12]:
featuresets = [ (find_features(rev,word_features),category) for (rev,category) in documents ]

## Saving featuresets

In [13]:
save_featuresets = open("pickled_algos/featuresets.pickle","wb")
pickle.dump(featuresets,save_featuresets)
save_featuresets.close()

In [14]:
random.shuffle(featuresets)

In [15]:
len(featuresets)

10662

In [16]:
training_set = featuresets[:10000]
testing_set = featuresets[10000:]

## Naive Bayes

In [17]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [18]:
print("Original Naive Bayes Algo Accuracy Percent: ",(nltk.classify.accuracy(classifier,testing_set))*100)

Original Naive Bayes Algo Accuracy Percent:  74.16918429003022


In [19]:
classifier.show_most_informative_features(15)

Most Informative Features
                    warm = True              pos : neg    =     21.8 : 1.0
                    flat = True              neg : pos    =     20.9 : 1.0
              engrossing = True              pos : neg    =     20.5 : 1.0
                  boring = True              neg : pos    =     19.3 : 1.0
                mediocre = True              neg : pos    =     16.2 : 1.0
               inventive = True              pos : neg    =     15.8 : 1.0
                 routine = True              neg : pos    =     15.6 : 1.0
              unexpected = True              pos : neg    =     15.1 : 1.0
                 generic = True              neg : pos    =     14.2 : 1.0
              refreshing = True              pos : neg    =     13.8 : 1.0
               affecting = True              pos : neg    =     12.4 : 1.0
               wonderful = True              pos : neg    =     11.9 : 1.0
               realistic = True              pos : neg    =     11.7 : 1.0

### Saving Naive Bayes Classifier

In [20]:
save_classifier = open("pickled_algos/naive_bayes.pickle","wb")
pickle.dump(classifier,save_classifier)
save_classifier.close()

## MNB

In [21]:
MNB_classifier = SklearnClassifier(MultinomialNB())

In [22]:
MNB_classifier.train(training_set)

<SklearnClassifier(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))>

In [23]:
print("MNB Algo Accuracy Percent: ",(nltk.classify.accuracy(MNB_classifier,testing_set))*100)

MNB Algo Accuracy Percent:  73.71601208459214


### Saving MNB Classifier

In [24]:
save_mnb_classifier = open("pickled_algos/MNB.pickle","wb")
pickle.dump(MNB_classifier,save_mnb_classifier)
save_mnb_classifier.close()

## Bernoulli 

In [25]:
Bernoulli_classifier = SklearnClassifier(BernoulliNB())

In [26]:
Bernoulli_classifier.train(training_set)

<SklearnClassifier(BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))>

In [27]:
print("Bernoulli Algo Accuracy Percent: ", (nltk.classify.accuracy(Bernoulli_classifier,testing_set))*100)

Bernoulli Algo Accuracy Percent:  74.32024169184291


### Saving Bernoulli

In [28]:
save_bernoulli_classifier = open("pickled_algos/Bernoulli.pickle","wb")
pickle.dump(Bernoulli_classifier,save_bernoulli_classifier)
save_bernoulli_classifier.close()

## Logistic

In [29]:
Logistic_classifier = SklearnClassifier(LogisticRegression())

In [30]:
Logistic_classifier.train(training_set)

<SklearnClassifier(LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))>

In [31]:
print("Logistic Algo Accuracy Percent: ", (nltk.classify.accuracy(Logistic_classifier,testing_set))*100)

Logistic Algo Accuracy Percent:  71.75226586102718


### Saving Logistic 

In [32]:
save_logistic_classifier = open("pickled_algos/Logistic.pickle","wb")
pickle.dump(Logistic_classifier,save_logistic_classifier)
save_logistic_classifier.close()

## SGD 

In [33]:
SGD_classifier = SklearnClassifier(SGDClassifier())

In [34]:
SGD_classifier.train(training_set)



<SklearnClassifier(SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False))>

In [35]:
print("SGD classifier accuracy percent: ",(nltk.classify.accuracy(SGD_classifier,testing_set))*100)

SGD classifier accuracy percent:  70.54380664652568


### Saving SGD

In [36]:
save_sgd_classifier = open("pickled_algos/SGD.pickle","wb")
pickle.dump(SGD_classifier,save_sgd_classifier)
save_sgd_classifier.close()

## SVC

In [37]:
SVC_classifier = SklearnClassifier(SVC())

In [38]:
SVC_classifier.train(training_set)

<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))>

In [39]:
print("SVC Classifier accuracy percent: ",(nltk.classify.accuracy(SVC_classifier,testing_set))*100)

SVC Classifier accuracy percent:  47.583081570996974


### Saving SVC

In [40]:
save_svc_classifier = open("pickled_algos/SVC.pickle","wb")
pickle.dump(SVC_classifier,save_svc_classifier)
save_svc_classifier.close()

## Linear SVC

In [41]:
Linear_SVC_classifier = SklearnClassifier(LinearSVC())

In [42]:
Linear_SVC_classifier.train(training_set)

<SklearnClassifier(LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))>

In [43]:
print("Linear SVC Classifier accuracy percent: ",(nltk.classify.accuracy(Linear_SVC_classifier,testing_set))*100)

Linear SVC Classifier accuracy percent:  69.33534743202418


### Saving Linear SVC

In [44]:
save_linear_svc_classifier = open("pickled_algos/Linear_SVC.pickle","wb")
pickle.dump(Linear_SVC_classifier,save_linear_svc_classifier)
save_linear_svc_classifier.close()

## NuSVC

In [45]:
Nu_SVC_classifier = SklearnClassifier(NuSVC())

In [46]:
Nu_SVC_classifier.train(training_set)

<SklearnClassifier(NuSVC(cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
   max_iter=-1, nu=0.5, probability=False, random_state=None,
   shrinking=True, tol=0.001, verbose=False))>

In [47]:
print("Nu SVC Classifier accuracy percent: ",(nltk.classify.accuracy(Nu_SVC_classifier,testing_set))*100)

Nu SVC Classifier accuracy percent:  71.75226586102718


### Saving NuSVC

In [48]:
save_nu_svc_classifier = open("pickled_algos/Nu_SVC.pickle","wb")
pickle.dump(Nu_SVC_classifier,save_nu_svc_classifier)
save_nu_svc_classifier.close()

## Combining All Algos

In [49]:
from nltk.classify import ClassifierI
from statistics import mode

In [50]:
class VoteClassifier(ClassifierI):
    def __init__(self,*classifiers):
        self._classifiers = classifiers
    
    def classify(self,features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
       
        return mode(votes)
    
    def confidence(self,features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
            
        choice_votes = votes.count(mode(votes))
        
        conf = choice_votes / len(votes)
        return (conf * 100)

In [51]:
voted_classifier = VoteClassifier(classifier,
                                  Nu_SVC_classifier,
                                  Linear_SVC_classifier,
                                  SGD_classifier,
                                  MNB_classifier,
                                  Bernoulli_classifier,
                                  Logistic_classifier)

In [52]:
print("Voted Classifier accuracy percent: ",(nltk.classify.accuracy(voted_classifier,testing_set))*100)

Voted Classifier accuracy percent:  73.71601208459214


In [53]:
print("Classification: ",voted_classifier.classify(testing_set[234][0]),"Confidence % :",voted_classifier.confidence(testing_set[234][0]))

Classification:  neg Confidence % : 100.0


In [54]:
def sentiment(text,word_features,voted_classifier):
    feats = find_features(text,word_features)
    return voted_classifier.classify(feats),voted_classifier.confidence(feats)