In [54]:
import nltk
import random
import pickle
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC,LinearSVC,NuSVC

In [34]:
pos_reviews = open("positive.txt","r").read()
neg_reviews = open("negative.txt","r").read()

documents = []

for r in pos_reviews.split("\n"):
    documents.append( (r,"pos") )
    
for r in neg_reviews.split("\n"):
    documents.append( (r,"neg") )
    

In [35]:
all_words = []

pos_words = nltk.word_tokenize(pos_reviews)
neg_words = nltk.word_tokenize(neg_reviews)

for w in pos_words:
    all_words.append(w.lower())


for w in neg_words:
    all_words.append(w.lower())

In [36]:
all_words = nltk.FreqDist(all_words)

In [48]:
#Total no of review sentences
len(documents)

10662

In [37]:
# Total No of Words
len(all_words.keys())

20321

In [38]:
#Taking first 5000 words
word_features = list(all_words.keys())[:5000] 

In [41]:
def find_features(document,word_features):
    words = set(nltk.word_tokenize(document))
    
    features = {}
    for w in word_features:
        features[w] = (w in words)  # Create mapping with True or False if word present -> true
    return features

In [42]:
featuresets = [ (find_features(rev,word_features),category) for (rev,category) in documents ]

In [44]:
random.shuffle(featuresets)

In [45]:
len(featuresets)

10662

In [49]:
training_set = featuresets[:10000]
testing_set = featuresets[10000:]

## Naive Bayes

In [50]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [51]:
print("Original Naive Bayes Algo Accuracy Percent: ",(nltk.classify.accuracy(classifier,testing_set))*100)

Original Naive Bayes Algo Accuracy Percent:  71.29909365558912


In [52]:
classifier.show_most_informative_features(15)

Most Informative Features
              engrossing = True              pos : neg    =     20.9 : 1.0
               inventive = True              pos : neg    =     14.3 : 1.0
              refreshing = True              pos : neg    =     13.6 : 1.0
                    warm = True              pos : neg    =     13.0 : 1.0
              disturbing = True              pos : neg    =     12.3 : 1.0
               wonderful = True              pos : neg    =     12.2 : 1.0
             mesmerizing = True              pos : neg    =     11.6 : 1.0
                  beauty = True              pos : neg    =     11.4 : 1.0
                provides = True              pos : neg    =     11.4 : 1.0
                captures = True              pos : neg    =     11.4 : 1.0
            refreshingly = True              pos : neg    =     11.0 : 1.0
                powerful = True              pos : neg    =     10.3 : 1.0
                    ages = True              pos : neg    =     10.3 : 1.0

### Saving Naive Bayes Classifier

In [55]:
save_classifier = open("naive_bayes.pickle","wb")
pickle.dump(classifier,save_classifier)
save_classifier.close()

## MNB

In [56]:
MNB_classifier = SklearnClassifier(MultinomialNB())

In [57]:
MNB_classifier.train(training_set)

<SklearnClassifier(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))>

In [58]:
print("MNB Algo Accuracy Percent: ",(nltk.classify.accuracy(MNB_classifier,testing_set))*100)

MNB Algo Accuracy Percent:  70.09063444108762


### Saving MNB Classifier

In [59]:
save_mnb_classifier = open("MNB.pickle","wb")
pickle.dump(MNB_classifier,save_mnb_classifier)
save_mnb_classifier.close()

## Bernoulli 

In [60]:
Bernoulli_classifier = SklearnClassifier(BernoulliNB())

In [61]:
Bernoulli_classifier.train(training_set)

<SklearnClassifier(BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))>

In [62]:
print("Bernoulli Algo Accuracy Percent: ", (nltk.classify.accuracy(Bernoulli_classifier,testing_set))*100)

Bernoulli Algo Accuracy Percent:  72.65861027190333


### Saving Bernoulli

In [63]:
save_bernoulli_classifier = open("Bernoulli.pickle","wb")
pickle.dump(Bernoulli_classifier,save_bernoulli_classifier)
save_bernoulli_classifier.close()

## Logistic

In [64]:
Logistic_classifier = SklearnClassifier(LogisticRegression())

In [65]:
Logistic_classifier.train(training_set)

<SklearnClassifier(LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))>

In [66]:
print("Logistic Algo Accuracy Percent: ", (nltk.classify.accuracy(Logistic_classifier,testing_set))*100)

Logistic Algo Accuracy Percent:  73.41389728096676


### Saving Logistic 

In [67]:
save_logistic_classifier = open("Logistic.pickle","wb")
pickle.dump(Logistic_classifier,save_logistic_classifier)
save_logistic_classifier.close()

## SGD 