In [1]:
import nltk
import random
from nltk.corpus import movie_reviews

# Dataset
documents = []
# List of tuples (words, category)
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))

In [2]:
documents[0]

(['plot',
  ':',
  'two',
  'teen',
  'couples',
  'go',
  'to',
  'a',
  'church',
  'party',
  ',',
  'drink',
  'and',
  'then',
  'drive',
  '.',
  'they',
  'get',
  'into',
  'an',
  'accident',
  '.',
  'one',
  'of',
  'the',
  'guys',
  'dies',
  ',',
  'but',
  'his',
  'girlfriend',
  'continues',
  'to',
  'see',
  'him',
  'in',
  'her',
  'life',
  ',',
  'and',
  'has',
  'nightmares',
  '.',
  'what',
  "'",
  's',
  'the',
  'deal',
  '?',
  'watch',
  'the',
  'movie',
  'and',
  '"',
  'sorta',
  '"',
  'find',
  'out',
  '.',
  '.',
  '.',
  'critique',
  ':',
  'a',
  'mind',
  '-',
  'fuck',
  'movie',
  'for',
  'the',
  'teen',
  'generation',
  'that',
  'touches',
  'on',
  'a',
  'very',
  'cool',
  'idea',
  ',',
  'but',
  'presents',
  'it',
  'in',
  'a',
  'very',
  'bad',
  'package',
  '.',
  'which',
  'is',
  'what',
  'makes',
  'this',
  'review',
  'an',
  'even',
  'harder',
  'one',
  'to',
  'write',
  ',',
  'since',
  'i',
  'generally',
  'a

In [3]:
# Shuffle the dataset
# It is commented because it is needed to investigate the bias
# random.shuffle(documents)

In [4]:
all_words = []
for word in movie_reviews.words():
    all_words.append(word.lower())

In [5]:
all_words = nltk.FreqDist(all_words)

print(all_words.most_common(15))

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]


In [6]:
# Number of appearence
print(all_words["stupid"])

253


### Words as Features for Learning

In [7]:
wordFeatures = list(all_words.keys())[:3000]

In [8]:
def findFeatures(document):
    words = set(document)
    
    features = {}
    for word in wordFeatures:
        features[word] = (word in words)
        
    return features

In [9]:
print(findFeatures(movie_reviews.words('neg/cv000_29416.txt')))

featureSets = [(findFeatures(rev), category) for (rev, category) in documents]



### Naive Bayes

In [10]:
# Prepare datasets
trainingSet = featureSets[:1900]
testingSet = featureSets[1900:]

In [11]:
# Training
classifierNaivBay = nltk.NaiveBayesClassifier.train(trainingSet)

In [12]:
# Testing against the test dataset
print("Naive Bayes Algorithm Accuracy Percent:", (nltk.classify.accuracy(classifierNaivBay, testingSet)) * 100)

Naive Bayes Algorithm Accuracy Percent: 77.0


In [13]:
# Print the most informative features
classifierNaivBay.show_most_informative_features(15)

Most Informative Features
                 idiotic = True              neg : pos    =     12.1 : 1.0
                  annual = True              pos : neg    =     10.7 : 1.0
               atrocious = True              neg : pos    =     10.5 : 1.0
                   sucks = True              neg : pos    =      9.5 : 1.0
                 frances = True              pos : neg    =      9.3 : 1.0
           unimaginative = True              neg : pos    =      7.5 : 1.0
                 cunning = True              pos : neg    =      7.0 : 1.0
                  sexist = True              neg : pos    =      6.9 : 1.0
             silverstone = True              neg : pos    =      6.9 : 1.0
                  regard = True              pos : neg    =      6.9 : 1.0
              schumacher = True              neg : pos    =      6.7 : 1.0
                    mena = True              neg : pos    =      6.3 : 1.0
                  shoddy = True              neg : pos    =      6.3 : 1.0

### Save Classifier with Pickle

In [14]:
import pickle

In [15]:
# To save the classifier
saveClassifier = open("naiveBayes.pickle", "wb")
pickle.dump(classifierNaivBay, saveClassifier)
saveClassifier.close()

In [16]:
# To load a classifier
classifierFile = open("naiveBayes.pickle", "rb")
classifierNaivBay = pickle.load(classifierFile)
classifierFile.close()

In [17]:
# Testing against the test dataset with the loaded classifier
print("Naive Bayes Algorithm Accuracy Percent:", (nltk.classify.accuracy(classifierNaivBay, testingSet)) * 100)

Naive Bayes Algorithm Accuracy Percent: 77.0


### Scikit-Learn incorporation

In [18]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [19]:
# Testing against the test dataset
print("Original NLTK Naive Bayes Algorithm Accuracy Percent:", (nltk.classify.accuracy(classifierNaivBay, testingSet)) * 100)

Original NLTK Naive Bayes Algorithm Accuracy Percent: 77.0


In [20]:
# Train Model
MNBClassifier = SklearnClassifier(MultinomialNB())
MNBClassifier.train(trainingSet)

# Testing against the test dataset
print("MultinomialNB Algorithm Accuracy Percent:", (nltk.classify.accuracy(MNBClassifier, testingSet)) * 100)

MultinomialNB Algorithm Accuracy Percent: 77.0


In [21]:
# Train Model
# GNBClassifier = SklearnClassifier(GaussianNB())
# GNBClassifier.train(trainingSet)

# Testing against the test dataset
# print("GaussianNB Algorithm Accuracy Percent:", (nltk.classify.accuracy(GNBClassifier, testingSet)) * 100)

In [22]:
# Train Model
BNBClassifier = SklearnClassifier(BernoulliNB())
BNBClassifier.train(trainingSet)

# Testing against the test dataset
print("BernoulliNB Algorithm Accuracy Percent:", (nltk.classify.accuracy(BNBClassifier, testingSet)) * 100)

BernoulliNB Algorithm Accuracy Percent: 77.0


In [23]:
# Train Model
LogisticRegressionClassifier = SklearnClassifier(LogisticRegression())
LogisticRegressionClassifier.train(trainingSet)

# Testing against the test dataset
print("LogisticRegressionClassifier Algorithm Accuracy Percent:", (nltk.classify.accuracy(LogisticRegressionClassifier, testingSet)) * 100)

LogisticRegressionClassifier Algorithm Accuracy Percent: 80.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
# Train Model
SGDClassifierClassifier = SklearnClassifier(SGDClassifier())
SGDClassifierClassifier.train(trainingSet)

# Testing against the test dataset
print("SGDClassifierClassifier Algorithm Accuracy Percent:", (nltk.classify.accuracy(SGDClassifierClassifier, testingSet)) * 100)

SGDClassifierClassifier Algorithm Accuracy Percent: 83.0


In [25]:
# Train Model
# SVCClassifier = SklearnClassifier(SVC())
# SVCClassifier.train(trainingSet)

# Testing against the test dataset
# print("SVCClassifier Algorithm Accuracy Percent:", (nltk.classify.accuracy(SVC, testingSet)) * 100)

In [26]:
# Train Model
LinearSVCClassifier = SklearnClassifier(LinearSVC())
LinearSVCClassifier.train(trainingSet)

# Testing against the test dataset
print("LinearSVCClassifier Algorithm Accuracy Percent:", (nltk.classify.accuracy(LinearSVCClassifier, testingSet)) * 100)

LinearSVCClassifier Algorithm Accuracy Percent: 81.0


In [27]:
# Train Model
# NuSVCClassifier = SklearnClassifier(NuSVC())
# NuSVCClassifier.train(trainingSet)

# Testing against the test dataset
# print("NuSVC Algorithm Accuracy Percent:", (nltk.classify.accuracy(NuSVC, testingSet)) * 100)

### Combining Algos with a Vote 

In [28]:
from nltk.classify import ClassifierI
from statistics import mode

In [29]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        """ 
        Constructor method that saves the received list 
        of classifier algorithms
        """
        self._classifiers = classifiers

    def classify(self, features):
        """
        Method in charge of classifying with all the algorithms,
        saving their ouput and getting the mode as if it were a vote.
        """
        votesList = []
        for classifie in self._classifiers:
            v = classifie.classify(features)
            votesList.append(v)
        return mode(votesList)

    def confidence(self, features):
        """
        Returns the result of dividing the amount of algorithms that
        return the mode value divided by the total number of classifiers.
        """
        votesList = []
        for c in self._classifiers:
            v = c.classify(features)
            votesList.append(v)

        choiceVotes = votesList.count(mode(votesList))
        confValue = choiceVotes / len(votesList)
        return confValue

In [30]:
# Build the Vote Classifier
votedClassifier = VoteClassifier(classifierNaivBay, MNBClassifier, BNBClassifier,
                                LogisticRegressionClassifier, SGDClassifierClassifier, LinearSVCClassifier)

In [31]:
print("Voted Classifier Algorithm Accuracy Percent:", (nltk.classify.accuracy(votedClassifier, testingSet)) * 100)

Voted Classifier Algorithm Accuracy Percent: 75.0


In [32]:
print("Classification: ", votedClassifier.classify(testingSet[0][0]), "Confidence %: ", votedClassifier.confidence(testingSet[0][0])*100)
print("Classification: ", votedClassifier.classify(testingSet[1][0]), "Confidence %: ", votedClassifier.confidence(testingSet[1][0])*100)
print("Classification: ", votedClassifier.classify(testingSet[3][0]), "Confidence %: ", votedClassifier.confidence(testingSet[3][0])*100)
print("Classification: ", votedClassifier.classify(testingSet[4][0]), "Confidence %: ", votedClassifier.confidence(testingSet[4][0])*100)
print("Classification: ", votedClassifier.classify(testingSet[5][0]), "Confidence %: ", votedClassifier.confidence(testingSet[5][0])*100)
print("Classification: ", votedClassifier.classify(testingSet[6][0]), "Confidence %: ", votedClassifier.confidence(testingSet[6][0])*100)
print("Classification: ", votedClassifier.classify(testingSet[7][0]), "Confidence %: ", votedClassifier.confidence(testingSet[7][0])*100)
print("Classification: ", votedClassifier.classify(testingSet[8][0]), "Confidence %: ", votedClassifier.confidence(testingSet[8][0])*100)
print("Classification: ", votedClassifier.classify(testingSet[9][0]), "Confidence %: ", votedClassifier.confidence(testingSet[9][0])*100)
print("Classification: ", votedClassifier.classify(testingSet[10][0]), "Confidence %: ", votedClassifier.confidence(testingSet[10][0])*100)
print("Classification: ", votedClassifier.classify(testingSet[11][0]), "Confidence %: ", votedClassifier.confidence(testingSet[11][0])*100)
print("Classification: ", votedClassifier.classify(testingSet[12][0]), "Confidence %: ", votedClassifier.confidence(testingSet[12][0])*100)
print("Classification: ", votedClassifier.classify(testingSet[14][0]), "Confidence %: ", votedClassifier.confidence(testingSet[14][0])*100)

Classification:  pos Confidence %:  100.0
Classification:  pos Confidence %:  100.0
Classification:  pos Confidence %:  100.0
Classification:  pos Confidence %:  66.66666666666666
Classification:  pos Confidence %:  100.0
Classification:  pos Confidence %:  100.0
Classification:  pos Confidence %:  100.0
Classification:  pos Confidence %:  100.0
Classification:  neg Confidence %:  50.0
Classification:  pos Confidence %:  100.0
Classification:  neg Confidence %:  100.0
Classification:  pos Confidence %:  100.0
Classification:  neg Confidence %:  83.33333333333334


### Investigating Bias