Following: http://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/
Could do something like https://marcobonzanini.com/2015/05/17/mining-twitter-data-with-python-part-6-sentiment-analysis-basics/ but unsupervised sentiment analysis typically doesn't work well

In [1]:
#Import packages
#need to run nltk.download() from command line and get stopwords corupus (freezes in the notebook)

import pandas
import pickle
import os.path
import string
import itertools
import collections

import nltk
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import precision as prec
from nltk.metrics import recall as rec
from nltk.metrics import f_measure as fmeas


In [2]:
#Import the csv dataframe
PoGo_labeled = pandas.read_csv('PoGo_Sentiment_Labeled_extended.csv')
PoGo_labeled.drop('Unnamed: 0', axis=1, inplace=True)

In [3]:
PoGo_labeled.head(n=5)

Unnamed: 0,latt,location,long,multi-team,screenName,sentiment,text,userId
0,,"Caldwell, ID",,False,desmond_ayala,pos,Which pokemon go team did y'all chose? #valor,2953472000.0
1,-73.918741,"Brooklyn, NY",40.694338,False,aphrospice,pos,#Magikarp practicing his struggle skills in th...,16290860.0
2,,"Bixby, OK",,False,ABellgowan,pos,Pokemon Go is taking over my life #TeamInstinct,1681036000.0
3,,"Los Angeles, CA",,False,JangoSnow,pos,Go Team Instinct! I like underdogs. :) https:...,10574340.0
4,,"Niagara Falls, NY",,False,EmberLighta2,pos,#TeamMystic has total control of Niagara Falls!!,7.51392e+17


In [4]:
#Making lists of positive and negative tweets
pos_tweets = [(PoGo_labeled.ix[row,'text'],'positive') for row in range(len(PoGo_labeled)) if \
              PoGo_labeled.ix[row,'sentiment'] == 'pos']

neg_tweets = [(PoGo_labeled.ix[row,'text'],'negative') for row in range(len(PoGo_labeled)) if \
              PoGo_labeled.ix[row,'sentiment'] == 'neg']


Could do the following to cut out common words: (found it doesn't help in this case)<br \>
from nltk.corpus import stopwords <br \>
stopset = set(stopwords.words('english')) <br \>


In [5]:
#If we've made one, import top features list
use_top_feature = False

if os.path.isfile('top_features.txt'):
    with open('top_features.txt', 'rb') as f:
        use_top_feature_q=input('Use top feature list in classifier? (yes/no) : ')
        if use_top_feature_q == 'yes':
            use_top_feature=True
        top_feature_list = pickle.load(f)
        


Use top feature list in classifier? (yes/no) : yes


In [6]:
#Exclude words that can identify the team from list of features
exclude = set(string.punctuation)

excluded_words = ['teammystic','mystic','teamblue','blue',\
                  'teaminstinct','instinct','teamyellow','yellow',\
                  'teamvalor','valor','teamred','red']

#filter the tweets to produce a list of features
def filter_tweets(tweets):
    filtered_tweets = []
    
    #Get a list of words, and the sentiment for each tweet
    for (words, sentiment) in tweets: 
        words_filtered=[]
        
        #For each word in the list of words, filter on some requirements.  If it passes, add it to features of that tweet
        for word in words.split(): 
            
            #Remove punctuation
            word = ''.join(ch for ch in word if ch not in exclude)
            
            #Remove one letter words
            if len(word) >= 1: 
                
                    #treat URLs the same
                    if word[:4] == 'http':
                        word='http'
                        
                    #remove hashtags
                    if word[0] == '#': 
                        word=word[1:]
                        
                    #remove team identifiers
                    if (word.lower() not in excluded_words):
        
                        #require lower case
                        words_filtered.append(word.lower()) 

        #Identify top 200 bigams in the filtered word list using chi_sq measure of importance
        bigram_finder = BigramCollocationFinder.from_words(words_filtered)
        bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 200)  
        
        #If we are using the top feature list, require that the unigram and bigrams be in the top_featue list
        if use_top_feature:
            filtered_tweets.append(([ngram for ngram in itertools.chain(words_filtered, bigrams) if str(ngram) in top_feature_list],sentiment))
        else:
            filtered_tweets.append(([ngram for ngram in itertools.chain(words_filtered, bigrams)],sentiment))

    return filtered_tweets

In [7]:
filtered_pos_tweets = filter_tweets(pos_tweets)
filtered_neg_tweets = filter_tweets(neg_tweets)    

In [8]:
#Divide data into training, cross validation, and test sets

#set mode to 'down' for downsampling
#set mode to 'up', for upsampling
#set mode to 'ratio', for an accurate ratio of pos/neg

set_mode = 'down'

total_neg = len(neg_tweets)
total_pos = len(pos_tweets)

if set_mode == 'down':
    #Downsampling the positive tweets
    
    #half the negative tweets go in training
    len_train = int(round(len(filtered_neg_tweets)/2)*2)
    train_tweets = filtered_neg_tweets[:int(len_train/2)] + filtered_pos_tweets[:int(len_train/2)]

    #half of the remaining half go in cv
    cv_neg_cutoff = int( (len_train/2) + round((len(filtered_neg_tweets) - len_train/2)/2) )
    cv_pos_cutoff = int( (len_train/2) + round((len(filtered_pos_tweets) - len_train/2)/2) )
    cv_tweets =  filtered_neg_tweets[int(len_train/2):cv_neg_cutoff] +  filtered_pos_tweets[int(len_train/2):cv_pos_cutoff]  

    #rest go into testing
    test_tweets = filtered_neg_tweets[cv_neg_cutoff:] +  filtered_pos_tweets[cv_pos_cutoff:]  

elif set_mode == 'up':
    #Upsample negative tweets
    
    #half the negative tweets go in training
    neg_scale_factor = 3
    len_train = int(round(len(filtered_neg_tweets)/2)*2)
    train_tweets = filtered_neg_tweets[:int(len_train/2)]*neg_scale_factor + filtered_pos_tweets[:int(neg_scale_factor*len_train/2)]

    #half of the remaining half go in cv
    cv_neg_cutoff = int( (len_train/2) + round((len(filtered_neg_tweets) - len_train/2)/2) )
    cv_pos_cutoff = int( (neg_scale_factor*len_train/2) + round((len(filtered_pos_tweets) - neg_scale_factor*len_train/2)/2) )
    cv_tweets =  filtered_neg_tweets[int(len_train/2):cv_neg_cutoff] +  filtered_pos_tweets[int(neg_scale_factor*len_train/2):cv_pos_cutoff]  

    #rest go into testing
    test_tweets =  filtered_neg_tweets[cv_neg_cutoff:] +  filtered_pos_tweets[cv_pos_cutoff:]  

elif set_mode == 'ratio':
    #True ratio of tweets
    
    #half the tweets go into training
    len_neg_train = int(round(len(filtered_neg_tweets)*0.5))
    len_pos_train = int(round(len(filtered_pos_tweets)*0.5))
    train_tweets = filtered_neg_tweets[:int(len_neg_train/2)] + filtered_pos_tweets[:int(len_pos_train/2)]

    #half of the remaining half go in cv
    cv_neg_cutoff = int( (len_neg_train/2) + round((len(filtered_neg_tweets) - len_neg_train/2)/2) )
    cv_pos_cutoff = int( (len_pos_train/2) + round((len(filtered_pos_tweets) - len_pos_train/2)/2) )
    cv_tweets =  filtered_neg_tweets[int(len_neg_train/2):cv_neg_cutoff] +  filtered_pos_tweets[int(len_pos_train/2):cv_pos_cutoff]  

    #rest go into testing
    test_tweets =  filtered_neg_tweets[cv_neg_cutoff:] +  filtered_pos_tweets[cv_pos_cutoff:]  


In [9]:
#Making a list of all unigrams and bigrams in the tweets

#Function to find all of the words in a tweet
def get_words_in_tweets(tweets):
    all_words = []
    
    for (words, sentiment) in tweets:
      all_words.extend(words)
    
    return all_words

#Function to make a list of features from a list of words
def get_word_features(wordlist,min_freq):

    wordlist = nltk.FreqDist(wordlist)
    sorted_word_list = sorted(wordlist.items(), key=lambda x: x[1], reverse=True)
    word_features = [sorted_word_list[word][0] for word in range(len(sorted_word_list)) if sorted_word_list[word][1] >= min_freq]
    return word_features

word_features = get_word_features(get_words_in_tweets(train_tweets),3)

In [10]:
#Feature extractor - determines which word features are in each tweet
def extract_features(document):

    document_words = set(document)
    features = {}

    for word in word_features:
        features['contains(%s)' % str(word)] = (word in document_words)

    return features


In [11]:
#Extract training/test set from training/test tweets
training_set = nltk.classify.apply_features(extract_features, train_tweets)
cv_set = nltk.classify.apply_features(extract_features, cv_tweets)
test_set = nltk.classify.apply_features(extract_features, test_tweets)

In [12]:
#Train the classifier
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [13]:
#Link to Andrew Ng's video here
#Precision is fraction of samples identified as true that really were true
    # Precision = number of true positive/(number of true pos + number of false positives) 
#Recall is fraction of samples that were correctly identified as true, divided by the total number of true samples
    # Recall = number of true positive/(number of true positives + number of false negatives)
#F1 score is 2* (P*R)/(P+R).  It is used to balance precision and recall


#Type I error = false positive
#Type II error = false negative
#For our purposes, we want to minimize type I errors so no negative tweets sneak in.
#We do this by maximizing recall [negative] so that we correctly identify as many negative events as we can

cross_valid_accuracy = nltk.classify.accuracy(classifier, cv_set)


refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

for i, (feats, label) in enumerate(cv_set):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)

print ('Accuracy:', cross_valid_accuracy)
print ('F-measure [negative]:', fmeas(refsets['negative'], testsets['negative']))
print ('F-measure [positive]:', fmeas(refsets['positive'], testsets['positive']))
print ('Precision [negative]:', prec(refsets['negative'], testsets['negative']))
print ('Precision [positive]:', prec(refsets['positive'], testsets['positive']))
rec_neg=rec(refsets['negative'], testsets['negative'])
rec_pos=rec(refsets['positive'], testsets['positive'])
print ('Recall [negative]:', rec_neg)
print ('Recall [positive]:', rec_pos)
print ('Negative contamination improved by ',100*(1-(1-rec_neg)/(rec_pos)), 'percent')

Accuracy: 0.7083333333333334
F-measure [negative]: 0.2048
F-measure [positive]: 0.8214157384117858
Precision [negative]: 0.11657559198542805
Precision [positive]: 0.9896103896103896
Recall [negative]: 0.8421052631578947
Recall [positive]: 0.702088452088452
Negative contamination improved by  77.51070589860477 percent


We want high negative recall so that we correctly identify and reject negative tweets often.  Ideally, we would have high negative precision as well.  In this case we have low negaive precision, which means we are identifying a lot of tweets as negative when they are not actually negative.  This is okay as long as it is randomly happening. 
However, if we accidentally identify a positive tweet as negative more often for a specific team, then we are skewing the results for that team. - Throw out any team identifiers in the tweets to solve this problem.

Note: If we had high negative precision we could look at which teams are the most hated by looking at the fraction of negative tweets associated with each tam.  However, since we are randomly throwing tweets into the negative category every now and then, it wouldn't work well for that purpose.

In [14]:
#Show the 20 most important features
print (classifier.show_most_informative_features(10))

Most Informative Features
contains(('team', 'is')) = True           negati : positi =     10.3 : 1.0
            contains(at) = True           positi : negati =      9.0 : 1.0
          contains(when) = True           negati : positi =      8.3 : 1.0
          contains(fuck) = True           negati : positi =      7.7 : 1.0
contains(('on', 'team')) = True           negati : positi =      7.0 : 1.0
     contains(pokemongo) = True           positi : negati =      6.5 : 1.0
          contains(they) = True           negati : positi =      5.0 : 1.0
          contains(team) = False          positi : negati =      4.8 : 1.0
         contains(trash) = True           negati : positi =      4.3 : 1.0
contains(('is', 'team')) = True           negati : positi =      4.3 : 1.0
None


In [15]:
#collecting top 150 features
top_features=classifier.most_informative_features(150)

#removing text around each feature
top_features=[top_features[row][0].split('contains(')[1][:-1] for row in range(len(top_features))]

In [16]:
#Save the top features to text for itterative use
do_save = input('Do you want to save the top features?  This will overwrite the old save file (yes/no): ')

if do_save == 'yes':
    if os.path.isfile('top_features.txt'):
        os.remove('top_features.txt')
    with open('top_features.txt', 'wb') as f:
        pickle.dump(top_features, f)

Do you want to save the top features?  This will overwrite the old save file (yes/no): no


<h1> After optimizing for CV set, get statistics for test set </h1>

In [17]:
#Save the classifier for later use
f = open('PoGo_tweet_classifier.pickle', 'wb')
pickle.dump(classifier, f)
f.close()

#Save document_words as well
with open('PoGo_classifier_feats.pickle', 'wb') as f:
    pickle.dump(word_features, f)

In [18]:
test_accuracy = nltk.classify.accuracy(classifier, test_set)

refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

for i, (feats, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)

print ('Accuracy:', cross_valid_accuracy)
print ('F-measure [negative]:', fmeas(refsets['negative'], testsets['negative']))
print ('F-measure [positive]:', fmeas(refsets['positive'], testsets['positive']))
print ('Precision [negative]:', prec(refsets['negative'], testsets['negative']))
print ('Precision [positive]:', prec(refsets['positive'], testsets['positive']))
rec_neg=rec(refsets['negative'], testsets['negative'])
rec_pos=rec(refsets['positive'], testsets['positive'])
print ('Recall [negative]:', rec_neg)
print ('Recall [positive]:', rec_pos)
print ('Negative contamination improved by ',100*(1-(1-rec_neg)/(rec_pos)), 'percent')

Accuracy: 0.7083333333333334
F-measure [negative]: 0.19457735247208932
F-measure [positive]: 0.8184106436533621
Precision [negative]: 0.11070780399274047
Precision [positive]: 0.986990459670425
Recall [negative]: 0.8026315789473685
Recall [positive]: 0.699017199017199
Negative contamination improved by  71.7648691147905 percent


In [19]:
uncut_positives = total_pos*rec_pos
uncut_negatives = total_neg*(1-rec_neg)
contamination_est = uncut_negatives / uncut_positives
print('Estimate of systematic error from uncut negative tweets: ', round(10000*contamination_est)/100, 'percent')

Estimate of systematic error from uncut negative tweets:  2.52 percent


In [20]:
print('Original systematic error from uncut negative tweets was: ', round(10000*(total_neg/total_pos))/100, 'percent')

Original systematic error from uncut negative tweets was:  8.92 percent


In [23]:
print('Systematic error improved by a factor of: ', (total_neg/total_pos)/contamination_est)

Systematic error improved by a factor of:  3.5416871416871425
