In [30]:
import nltk
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import random
import pickle

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over a fixed set of already-trained classifiers.

    Each wrapped classifier must expose ``classify(features)`` and return a
    hashable label. The ensemble's answer is the label with the most votes.
    """

    def __init__(self, *classifiers):
        """Remember the classifiers whose votes will be combined."""
        self._classifiers = classifiers

    def _tally(self, features):
        """Collect one vote per classifier and determine the winning label.

        Returns:
            A ``(winning_label, votes_for_winner, total_votes)`` tuple.

        Raises:
            statistics.StatisticsError: if the ensemble holds no classifiers
                (the same exception type ``statistics.mode`` raised on empty
                data in the previous implementation).
        """
        votes = [c.classify(features) for c in self._classifiers]
        if not votes:
            # Local import: the file only imports `mode` from statistics.
            from statistics import StatisticsError
            raise StatisticsError("VoteClassifier has no classifiers to vote with")

        counts = {}
        for label in votes:
            counts[label] = counts.get(label, 0) + 1

        # max() returns the first maximal key and dicts keep insertion order,
        # so a tie resolves to the label that was voted for first instead of
        # raising StatisticsError as statistics.mode did before Python 3.8.
        winner = max(counts, key=counts.get)
        return winner, counts[winner], len(votes)

    def classify(self, features):
        """Return the label chosen by the most classifiers."""
        winner, _winning_votes, _total = self._tally(features)
        return winner

    def confidence(self, features):
        """Return the fraction of classifiers that voted for the winning label."""
        _winner, winning_votes, total = self._tally(features)
        return winning_votes / total

# Show which corpus reader class backs `movie_reviews`.
print(type(movie_reviews))

# Enumerate every (fileid, category) pair, category by category, then load
# each review's tokens so `documents` holds (token_list, category) tuples.
labelled_fileids = [(fileid, category)
                    for category in movie_reviews.categories()
                    for fileid in movie_reviews.fileids(category)]
documents = [(list(movie_reviews.words(fid)), label)
             for fid, label in labelled_fileids]

<class 'nltk.corpus.reader.plaintext.CategorizedPlaintextCorpusReader'>


In [32]:
# Sanity check of the loaded corpus: container type, number of reviews,
# and the first two (tokens, category) entries.
for summary in (type(documents), len(documents), documents[:2]):
    print(summary)

<class 'list'>
2000
[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch', 'the', 'movie', 'and', '"', 'sorta', '"', 'find', 'out', '.', '.', '.', 'critique', ':', 'a', 'mind', '-', 'fuck', 'movie', 'for', 'the', 'teen', 'generation', 'that', 'touches', 'on', 'a', 'very', 'cool', 'idea', ',', 'but', 'presents', 'it', 'in', 'a', 'very', 'bad', 'package', '.', 'which', 'is', 'what', 'makes', 'this', 'review', 'an', 'even', 'harder', 'one', 'to', 'write', ',', 'since', 'i', 'generally', 'applaud', 'films', 'which', 'attempt', 'to', 'break', 'the', 'mold', ',', 'mess', 'with', 'your', 'head', 'and', 'such', '(', 'lost', 'highway', '&', 'memento', ')', ',', 'but', 'there', 'ar

In [None]:
# random.shuffle(documents)

# Frequency distribution over every lower-cased token in the corpus, built in
# a single pass without materialising an intermediate list.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# print(len(all_words)) # Length: 39,768
# print(all_words.most_common(15))
# print(all_words["good"])

# Feature vocabulary: the 3000 most frequent words.  FreqDist is a Counter
# subclass, so keys() is NOT ordered by frequency; slicing list(keys()) would
# just take the first 3000 distinct tokens encountered in the corpus.
words_features = [word for word, _count in all_words.most_common(3000)]
def find_features(document, vocabulary=None):
    """Build a word-presence feature dict for one document.

    Args:
        document: iterable of word tokens making up the document.
        vocabulary: iterable of words to test for.  When omitted, the
            module-level ``words_features`` list is used, which matches the
            original single-argument behaviour.

    Returns:
        dict mapping every vocabulary word to True if it occurs in
        ``document`` and False otherwise.
    """
    if vocabulary is None:
        vocabulary = words_features
    # Set membership keeps each lookup O(1) regardless of document length.
    words = set(document)
    return {w: (w in words) for w in vocabulary}
# Spot check: feature dict for one review file.
sample_features = find_features(movie_reviews.words('neg/cv000_29416.txt'))
print(sample_features)

# One (feature_dict, category) pair per review, in the same order as `documents`.
featuresets = [(find_features(tokens), label) for tokens, label in documents]

In [5]:
# Confirm which container type holds the feature sets.
featuresets_type = type(featuresets)
print(featuresets_type)

<class 'list'>


In [18]:
### Investigating Bias
# Two alternative train/test splits are kept side by side under explicit names.
# Previously both were assigned to training_set/testing_set in sequence, so the
# first pair was silently overwritten and never used.
# NOTE(review): the "pos data"/"neg data" labels come from the original
# comments; they presumably rely on `featuresets` still being in unshuffled
# corpus order -- confirm before interpreting the accuracies.

# pos data: test on the last 100 feature sets.
pos_training_set = featuresets[:1900]
pos_testing_set = featuresets[1900:]

# neg data: test on the first 100 feature sets.
neg_training_set = featuresets[100:]
neg_testing_set = featuresets[:100]

# Active split (same one the original cell ended up with).
training_set = neg_training_set
testing_set = neg_testing_set

In [22]:
# Feature dict of the first training example: one True/False entry per word in
# `words_features` (i.e. at most 3000 keys, not the full `all_words` vocabulary).
first_training_features = training_set[0][0]
print(first_training_features)



In [None]:
def train_sklearn_classifier(name, estimator, train_set, test_set):
    """Wrap a scikit-learn estimator for NLTK, train it and report accuracy.

    Args:
        name: label used in the printed accuracy line.
        estimator: an unfitted scikit-learn estimator instance.
        train_set: list of (feature_dict, label) pairs used for training.
        test_set: list of (feature_dict, label) pairs used for scoring.

    Returns:
        The trained SklearnClassifier wrapper.
    """
    clf = SklearnClassifier(estimator)
    clf.train(train_set)
    accuracy = nltk.classify.accuracy(clf, test_set)
    print("{} Algo accuracy percent: ".format(name), accuracy * 100)
    return clf


# --- NLTK's own Naive Bayes -------------------------------------------------
classifier = nltk.NaiveBayesClassifier.train(training_set)
# To reuse a previously saved model instead of retraining (only unpickle
# files you created yourself -- pickle.load can execute arbitrary code):
# classifier_f = open("naivebayes.pickle", "rb")
# classifier = pickle.load(classifier_f)
# classifier_f.close()
print("Original Naive Bayes Algo accuracy percent: ", (nltk.classify.accuracy(classifier, testing_set))*100)
# classifier.show_most_informative_features(15)
# save_classifier = open("naivebayes.pickle", "wb")
# pickle.dump(classifier, save_classifier)
# save_classifier.close()

# --- scikit-learn classifiers -----------------------------------------------
# Trained classifiers get a `_clf` suffix (like MNB_clf) so that the imported
# estimator classes are not rebound to instances; rebinding made this cell fail
# with "object is not callable" when it was executed a second time.
MNB_clf = train_sklearn_classifier("Multinomial Naive Bayes", MultinomialNB(), training_set, testing_set)

# GaussianNB, BernoulliNB
# GaussianNB was left disabled in the original notebook:
# GaussianNB_clf = train_sklearn_classifier("GaussianNB", GaussianNB(), training_set, testing_set)

BernoulliNB_clf = train_sklearn_classifier("BernoulliNB", BernoulliNB(), training_set, testing_set)
# Multinomial Naive Bayes Algo accuracy percent:  79.0
# BernoulliNB Algo accuracy percent:  80.0

# LogisticRegression, SGDClassifier
LogisticRegression_clf = train_sklearn_classifier("LogisticRegression", LogisticRegression(), training_set, testing_set)
SGDClassifier_clf = train_sklearn_classifier("SGDClassifier", SGDClassifier(), training_set, testing_set)

# SVC, LinearSVC, NuSVC
SVC_clf = train_sklearn_classifier("SVC", SVC(), training_set, testing_set)
LinearSVC_clf = train_sklearn_classifier("LinearSVC", LinearSVC(), training_set, testing_set)
NuSVC_clf = train_sklearn_classifier("NuSVC", NuSVC(), training_set, testing_set)

# LogisticRegression Algo accuracy percent:  76.0
# SGDClassifier Algo accuracy percent:  76.0
# SVC Algo accuracy percent:  78.0
# LinearSVC Algo accuracy percent:  76.0
# NuSVC Algo accuracy percent:  79.0

# --- Voting ensemble --------------------------------------------------------
# Keep the number of classifiers odd: with two labels an even count can
# produce a tied vote.
voted_classifier = VoteClassifier(MNB_clf,
                                  BernoulliNB_clf,
                                  LogisticRegression_clf,
                                  SGDClassifier_clf,
                                  SVC_clf,
                                  LinearSVC_clf,
                                  NuSVC_clf)
print("voted_classifier accuracy percent: ", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

# Classify the first test example and show how unanimous the vote was.
first_test_features = testing_set[0][0]
print("Classification: ", voted_classifier.classify(first_test_features), "Confidence %: ", voted_classifier.confidence(first_test_features)*100)

In [None]:
import nltk
# nltk.download()
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# --- Stop-word filtering ----------------------------------------------------
stop_words = set(stopwords.words('english'))
print(stop_words)

example_sentence = "This is an example showing off stop word filtering but we need conjunctions."
words = word_tokenize(example_sentence)
# NLTK's English stop-word list is lower case, so compare case-insensitively;
# otherwise capitalised stop words such as "This" slip through the filter.
filtered_sentence = [w for w in words if w.lower() not in stop_words]
print(filtered_sentence)

# --- Stemming ---------------------------------------------------------------
ex1 = "I didn't know, You are, He is, She is, We are, They are"
ps = PorterStemmer()
words1 = word_tokenize(ex1)
for w in words1:
    print(ps.stem(w))

# --- WordNet synonyms -------------------------------------------------------
from nltk.corpus import wordnet
syns = wordnet.synsets("telephone")
print(syns)
# Gather the lemma names of every synset, reusing the lookup above rather
# than querying WordNet a second time.
synonyms = []
for syn in syns:
    for lemma in syn.lemmas():
        synonyms.append(lemma.name())
print(set(synonyms))
# --> can only use phone and telephone
# get average of phrases and words
# build lexicon with all words in them

In [36]:
from nltk import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.stem import PorterStemmer
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Disabled experiment kept for reference (`text` and `target` are not defined
# in this part of the notebook):
# print(text[0])
# words = word_tokenize(text[0])
# print(words)
# # stemmed = [ps.stem(w) for w in words if not w in stop_words]
# stemmed = [lemmatizer.lemmatize(w) for w in words if not w in stop_words]
# str1 = TreebankWordDetokenizer().detokenize(stemmed)
# print(stemmed)
# print(str1)
# print(target[0])

# Probe how the WordNet lemmatizer (default settings) treats a handful of
# words, printing one lemma per line in the order listed.
lemma_probes = [
    "important",
    "importance",
    "wolves",
    "better",
    "his",
    "Mr.",
    "Mr",
    "Mister",
    "Us",
    "us",
    "you're",
]
for probe in lemma_probes:
    print(lemmatizer.lemmatize(probe))

# For comparison, run the Porter stemmer on a misspelled word.
print(ps.stem("powerd"))

important
importance
wolf
better
his
Mr.
Mr
Mister
Us
u
you're
powerd


In [None]:
from nltk.tokenize.treebank import TreebankWordDetokenizer
# Stem the first cleaned training review and glue the stems back into a
# sentence, printing the raw text, the stems, the rebuilt string and the label.
# NOTE(review): `reviews_train_clean` and `y_train` are not defined in the
# visible part of the notebook -- confirm they are created in an earlier cell.
first_review = reviews_train_clean[0]
print(first_review)
words = word_tokenize(first_review)
stemmed = list(map(ps.stem, words))
str1 = TreebankWordDetokenizer().detokenize(stemmed)
print(stemmed)
print(str1)
print(y_train[0])
