## Lesson 10: text classification

In [1]:
import nltk.data
import random
import nltk
from nltk.corpus import movie_reviews

nltk.data.path.append(r'C:\Users\dehaeth\Documents\Tools\nltk')

We will create a text classifier for sentiment analysis

### Get the data

In [2]:
documents = [(list(movie_reviews.words(fileid)),category) 
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

### Shuffle the data
Because it is too neatly organized in categories

In [3]:
random.shuffle(documents)

### Normalize the data

In [4]:
all_words = [w.lower() for w in movie_reviews.words()]
all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15))

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]


There doesn't seem to be anything in this list that really matters, we will correct this later. By contrast: how many times does the word 'stupid' appear:

In [5]:
print(all_words["stupid"])

253


### Turn the words into features

In [6]:
# Get the top 3000 words
word_features = list(all_words.keys())[:3000]

In [7]:
def find_features(document):
    # every single words
    words = set(document)
    
    features={}
    
    for w in word_features:
        features[w] = (w in words)
        
    return features


In [8]:
import numpy as np
featuresets = [(find_features(review),category) for review,category in documents]
print(np.array(featuresets)[:,0][0])



### Split data, train and test with NLTK

In [9]:
training_set = featuresets[:1900]
test_set = featuresets[1900:]

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Naive Bayer score: ", nltk.classify.accuracy(classifier,test_set))
classifier.show_most_informative_features(15)

Naive Bayer score:  0.78
Most Informative Features
                   sucks = True              neg : pos    =     16.1 : 1.0
                  annual = True              pos : neg    =      9.8 : 1.0
                  justin = True              neg : pos    =      9.5 : 1.0
                 idiotic = True              neg : pos    =      9.5 : 1.0
                 frances = True              pos : neg    =      8.4 : 1.0
           unimaginative = True              neg : pos    =      7.6 : 1.0
             silverstone = True              neg : pos    =      7.6 : 1.0
                    lame = True              neg : pos    =      6.9 : 1.0
                  suvari = True              neg : pos    =      6.9 : 1.0
                    mena = True              neg : pos    =      6.9 : 1.0
               atrocious = True              neg : pos    =      6.9 : 1.0
                  regard = True              pos : neg    =      6.7 : 1.0
              schumacher = True              neg 

### Save the classifier

In [10]:
import pickle

save_classifier = open("naivebayes.pickle","wb")
pickle.dump(classifier,save_classifier)
save_classifier.close()

### Use Scikit-learn: Naive Bayes

In [11]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB classifier accuracy: ", (nltk.classify.accuracy(MNB_classifier, test_set)))

BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)
print("BNB classifier accuracy: ", (nltk.classify.accuracy(BNB_classifier, test_set)))

MNB classifier accuracy:  0.8
BNB classifier accuracy:  0.77


### Using Scikit-learn: other algorithms

In [12]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

LR_classifier = SklearnClassifier(LogisticRegression())
LR_classifier.train(training_set)
print("Log Reg classifier accuracy: ", (nltk.classify.accuracy(LR_classifier, test_set)))

SGDC_classifier = SklearnClassifier(SGDClassifier())
SGDC_classifier.train(training_set)
print("SGDC classifier accuracy: ", (nltk.classify.accuracy(SGDC_classifier, test_set)))

LSVC_classifier = SklearnClassifier(LinearSVC())
LSVC_classifier.train(training_set)
print("LSVC classifier accuracy: ", (nltk.classify.accuracy(LSVC_classifier, test_set)))

NSVC_classifier = SklearnClassifier(NuSVC())
NSVC_classifier.train(training_set)
print("NSVC classifier accuracy: ", (nltk.classify.accuracy(NSVC_classifier, test_set)))

Log Reg classifier accuracy:  0.8
SGDC classifier accuracy:  0.78
LSVC classifier accuracy:  0.78
NSVC classifier accuracy:  0.82


### Making a voting algo to combine all outcomes
Rather simple one, each algo gets one vote; the outcome with the most vote becomes the final one.
We can introduce a confidence parameter however: 
- 100% of votes = easy of course
- if 4/7 said something = low confidence

In [13]:
from nltk.classify import ClassifierI
from statistics import mode #most votes? Just take the mode

In [14]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
        
    def classify(self,features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)
    
    def confidence(self,features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

In [32]:
voted_classifier = VoteClassifier(classifier,
                                  NSVC_classifier,
                                  LSVC_classifier,
                                  SGDC_classifier,
                                  MNB_classifier,
                                  BNB_classifier,
                                  LR_classifier)

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, test_set))*100)

print("Classification:", voted_classifier.classify(test_set[0][0]), "Confidence %:",voted_classifier.confidence(test_set[0][0])*100)
print("Classification:", voted_classifier.classify(test_set[1][0]), "Confidence %:",voted_classifier.confidence(test_set[1][0])*100)
print("Classification:", voted_classifier.classify(test_set[2][0]), "Confidence %:",voted_classifier.confidence(test_set[2][0])*100)
print("Classification:", voted_classifier.classify(test_set[3][0]), "Confidence %:",voted_classifier.confidence(test_set[3][0])*100)
print("Classification:", voted_classifier.classify(test_set[4][0]), "Confidence %:",voted_classifier.confidence(test_set[4][0])*100)
print("Classification:", voted_classifier.classify(test_set[5][0]), "Confidence %:",voted_classifier.confidence(test_set[5][0])*100)

voted_classifier accuracy percent: 78.0
Classification: pos Confidence %: 57.14285714285714
Classification: pos Confidence %: 71.42857142857143
Classification: pos Confidence %: 100.0
Classification: neg Confidence %: 100.0
Classification: neg Confidence %: 100.0
Classification: neg Confidence %: 85.71428571428571


### Applying it to real twitter data

First: get every document:

In [16]:
pos_reviews = open("positive.txt",'r').read()
neg_reviews = open("negative.txt",'r').read()

documents = []

for r in pos_reviews.split('\n'):
    documents.append((r,"pos"))
    
for r in neg_reviews.split('\n'):
    documents.append((r,"neg"))

Next: get every word

In [17]:
all_words = []

short_pos_words = nltk.word_tokenize(pos_reviews)
short_neg_words = nltk.word_tokenize(neg_reviews)

for w in short_pos_words:
    all_words.append(w.lower())
    
for w in short_neg_words:
    all_words.append(w.lower())

Get the distribution:

In [39]:
all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:10000]

Convert to features:

In [40]:
def find_features(document):
    # every single words
    words = nltk.word_tokenize(document)
    
    features={}
    
    for w in word_features:
        features[w] = (w in words)
        
    return features


featuresets = [(find_features(rev), category) for (rev, category) in documents]
random.shuffle(featuresets)

Split data:

In [42]:
training_set = featuresets[:10000]
testing_set =  featuresets[10000:]

Now re-use the classifier algo's (sorry for the copy-paste)

In [43]:
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

##SVC_classifier = SklearnClassifier(SVC())
##SVC_classifier.train(training_set)
##print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)


voted_classifier_orig = VoteClassifier(classifier,
                                       SGDClassifier_classifier,
                                      NuSVC_classifier,
                                      LinearSVC_classifier,
                                      MNB_classifier,
                                      BernoulliNB_classifier,
                                      LogisticRegression_classifier)

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

Original Naive Bayes Algo accuracy percent: 70.93373493975903
Most Informative Features
               wonderful = True              pos : neg    =     19.7 : 1.0
              engrossing = True              pos : neg    =     19.0 : 1.0
               inventive = True              pos : neg    =     15.7 : 1.0
                mediocre = True              neg : pos    =     14.3 : 1.0
                  boring = True              neg : pos    =     13.5 : 1.0
              refreshing = True              pos : neg    =     13.0 : 1.0
            refreshingly = True              pos : neg    =     12.4 : 1.0
                      90 = True              neg : pos    =     12.3 : 1.0
                    warm = True              pos : neg    =     12.2 : 1.0
               affecting = True              pos : neg    =     11.7 : 1.0
                   vivid = True              pos : neg    =     11.7 : 1.0
                provides = True              pos : neg    =     11.4 : 1.0
            

Save it for later use

In [22]:
import pickle

save_classifier = open("naivebayes.pickle","wb")
pickle.dump(classifier,save_classifier)
save_classifier.close()

save_classifier = open("multinomial.pickle","wb")
pickle.dump(classifier,save_classifier)
save_classifier.close()

save_classifier = open("bernoulli.pickle","wb")
pickle.dump(classifier,save_classifier)
save_classifier.close()

save_classifier = open("logreg.pickle","wb")
pickle.dump(classifier,save_classifier)
save_classifier.close()

save_classifier = open("sgd.pickle","wb")
pickle.dump(classifier,save_classifier)
save_classifier.close()

save_classifier = open("linearsvc.pickle","wb")
pickle.dump(classifier,save_classifier)
save_classifier.close()

save_classifier = open("nusvc.pickle","wb")
pickle.dump(classifier,save_classifier)
save_classifier.close()

In [23]:
import pickle

open_file = open("naivebayes.pickle", "rb")
NaiveBayesClassifier = pickle.load(open_file)
open_file.close()

open_file = open("multinomial.pickle", "rb")
MNB_classifier = pickle.load(open_file)
open_file.close()

open_file = open("bernoulli.pickle", "rb")
BernoulliNB_classifier = pickle.load(open_file)
open_file.close()

open_file = open("logreg.pickle", "rb")
LogisticRegression_classifier = pickle.load(open_file)
open_file.close()

open_file = open("sgd.pickle", "rb")
SGDClassifier_classifier = pickle.load(open_file)
open_file.close()

open_file = open("linearsvc.pickle", "rb")
LinearSVC_classifier = pickle.load(open_file)
open_file.close()

open_file = open("nusvc.pickle", "rb")
NuSVC_classifier = pickle.load(open_file)
open_file.close()

voted_classifier = VoteClassifier(
                                  NuSVC_classifier,
                                  LinearSVC_classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier)

### Create a module for later use

In [34]:
def sentiment(text):
    feats = find_features(text)
    return voted_classifier.classify(feats),voted_classifier.confidence(feats)

In [37]:
print(sentiment("jesus, this crap movies sucks donkey balls"))
print(sentiment("is movie was engrossing! The acting was inventive, plot was wonderful!"))

('neg', 0.5714285714285714)
('neg', 0.7142857142857143)


### Get twitter data

In [30]:
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import json


#consumer key, consumer secret, access token, access secret.
ckey="Zyh4l2cTYDBl9PnZB1lh1PcXh"
csecret="b5Fa9mZGs7Cq5SwS77YNuL5EzFx4ytFGBVunZUqVAAO8T8Iu5u"
atoken="367641281-2CWCMNQVirWB22GTSFpeaPdqe6BjxwUYt4E2mCoq"
asecret="hfU5ufVzaD15oLQfO9API6Xi1MyZBko6RYGlWfarYjieR"

class listener(StreamListener):

    def on_data(self, data):
        all_data = json.loads(data)

        tweet = all_data["text"]
        sentiment_value, confidence = sentiment(tweet)
        print(tweet, sentiment_value, confidence)

        if confidence*100 >= 80:
            output = open("twitter-out.txt","a")
            output.write(sentiment_value)
            output.write('\n')
            output.close()

        return True

    def on_error(self, status):
        print( status)

auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)

twitterStream = Stream(auth, listener())
twitterStream.filter(track=["happy"])

RT @1_uswnt: Happy birthday to the flyest Georgia peach to ever do it, @kohara19. Have a good one, KO. Live it up. 🔥🤙🏼 
(SI Phot…  neg 1.0
RT @Zendaya: Happy birthday to the flyest to ever do it👏🏽✊🏽 #ObamaDay @barackobama https://t.co/iIkyuyhRzw neg 1.0
RT @AlannaCoops: GOTTA BRING THIS BACK FOR DAD'S BIRTHDAY!!! IMYSM HAPPY BIRTHDAY MR. PRESIDENT #ObamaDay https://t.co/jRNLl6BR1T neg 1.0
Happy birthday yohan iloveyou https://t.co/DOWCFKyo5v neg 1.0
Don't Do These Mistakes to Be Happy https://t.co/Sq4692c27z https://t.co/sRfTTTrRNW neg 1.0
RT @common: "We are the change that we seek" Barack Obama. Happy Birthday Mr. President and Happy #ObamaDay to everyone! https://t.co/RSvwI… neg 1.0
RT @itsdougthepug: Happy Birthday to the daddy of all daddies @colesprouse 🍔 https://t.co/YGDODM4S84 neg 1.0
RT @sonetwt: from "fighting" to singing their 10th anniversary title track, let's look forward to 10 more years, happy anniversa…  pos 1.0
@Zorak_Trump @rmayemsinger @Disney You are welcome to clo

KeyError: 'text'

### Graph everything

In [None]:
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib import style
import time

style.use("ggplot")

fig = plt.figure()
ax1 = fig.add_subplot(1,1,1)

def animate(i):
    pullData = open("twitter-out.txt","r").read()
    lines = pullData.split('\n')

    xar = []
    yar = []

    x = 0
    y = 0

    for l in lines[-200:]:
        x += 1
        if "pos" in l:
            y += 1
        elif "neg" in l:
            y -= 1

        xar.append(x)
        yar.append(y)
        
    ax1.clear()
    ax1.plot(xar,yar)
ani = animation.FuncAnimation(fig, animate, interval=1000)
plt.show()