## Part 3 
### NLTK Sentiment Analyzer - Twitter Samples

In [167]:
!pip install nltk
import nltk
nltk.download([
"names",
"stopwords",
"twitter_samples",
"averaged_perceptron_tagger",
"vader_lexicon",
"punkt",
])

import nltk
from nltk.corpus import twitter_samples
twitter_samples.fileids()



[nltk_data] Downloading package names to /Users/BGBlanco/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/BGBlanco/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/BGBlanco/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/BGBlanco/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/BGBlanco/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/BGBlanco/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']

In [168]:
# load with inactive URLs
positive_tweets = [t.replace("://", "//") for t in nltk.corpus.twitter_samples.strings('positive_tweets.json')]
negative_tweets = [t.replace("://", "//") for t in nltk.corpus.twitter_samples.strings('negative_tweets.json')]
all_tweets = [t.replace("://", "//") for t in nltk.corpus.twitter_samples.strings('tweets.20150430-223406.json')]
combined_tweets = positive_tweets + negative_tweets + all_tweets

In [169]:
## load with active URLs
pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')
posneg_tweets = twitter_samples.strings('tweets.20150430-223406.json')
all_posneg_tweets = pos_tweets + neg_tweets + neu_tweets

<div style="background-color: red;">
 <p style="color: white">
  ** CLEANING TWEETS **
 </p>
</div>

#### Assigning special characters like <span style='background :yellow' > emojis </span> 
- set happy and sad emojis <br>
- combine with **Union** function

In [170]:
import string
import re

from nltk.corpus import stopwords 
stopwords_english = stopwords.words('english')

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

from nltk.tokenize import TweetTokenizer

# happy emojis
happy_emoji = set([
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3'
    ])

# Sad emojis
sad_emoji = set([
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';('
    ])

# all emojis (happy + sad)
emojis = happy_emoji.union(sad_emoji)

#### Cleaning Tweets
- <font color=blue> **def**</font> - define a function <br>
- <font color=blue> **re.sub()**</font> - replace patterns in strings <br>
- <font color=blue> **str()**</font> - to avoid error if function expects integer or float format <br>
    - tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet) --> if function doesn't require string

#### Tokenizing Tweets
- **TweetTokenizer** - module used to tokenize tweets <br>
- <font color=blue> **preserve_case**</font> if False then converts tweet to lowercase <br>
- <font color=blue> **strip_handles**</font> if True then removes twitter handles from the tweet <br>
- <font color=blue> **reduce_len**</font> if True then reduce length of words in the a tweet like 'yaaaahooooo' <br>


#### Items to clean
- stock tickers ($APPL), retweet (RT), hyperlinks, hashtags <br>
- stopwords, emojis, punctuations, stem words

In [222]:
def clean_it(tweet):
    # remove stock tickers $APPL
    tweet = re.sub(r'\$\w*', '', str(tweet))
    
    # remove retweet "RT"
    tweet = re.sub(r'^RT[\s]+', '', str(tweet))

    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*♛', '', str(tweet))

     # remove more hyperlinks
    tweet = re.sub(r'http.', '', str(tweet))
    
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', str(tweet))

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and # remove stopwords
              word not in emojis and # remove emoticons
                word not in string.punctuation): # remove punctuation
            #tweets_clean.append(word)
            stem_word = stemmer.stem(word) # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean

In [223]:
Test_Tweet = """RT @geraldblanco13 Religion ends where philosophy begins, just as alchemy ends where 
chemistry begins, and astrology ends where astronomy begins. ~ from the great Christopher Hitchens! 
#enlightened :) https://christopherhitchens.net/"""

# print cleaned tweet

print('--------- Test Tweet ---------')
print (clean_it(Test_Tweet))

print('--------- Uncleaned Tweet ---------')
print (positive_tweets[5])

print('--------- Cleaned Tweet ---------')
print (clean_it(positive_tweets[5]))

--------- Test Tweet ---------
['religion', 'end', 'philosophi', 'begin', 'alchemi', 'end', 'chemistri', 'begin', 'astrolog', 'end', 'astronomi', 'begin', 'great', 'christoph', 'hitchen', 'enlighten', ':/', 'christopherhitchens.net/']
--------- Uncleaned Tweet ---------
@BhaktisBanter @PallaviRuhail This one is irresistible :)
#FlipkartFashionFriday http//t.co/EbZ0L2VENM
--------- Cleaned Tweet ---------
['one', 'irresist', 'flipkartfashionfriday', 't.co/ebz0l2venm']


In [224]:
clean_pos_tweets = clean_it(positive_tweets)
clean_neg_tweets = clean_it(negative_tweets)

#### Feature Extraction
- <font color=blue> **dict**</font> - creates an unordered, changeable, and indexed dictionary <br>
- <font color=blue> **append()**</font> - add items to end of the list <br>
- <font color=blue> **return**</font> - return value from created function <br>

- define function on bag_of_words to extract unigram features from tweets

In [225]:
# feature extractor function
def bag_of_words(tweet):
    words = clean_it(tweet)
    words_dictionary = dict([word, True] for word in words)	 
    return words_dictionary

print (bag_of_words(Test_Tweet))

{'religion': True, 'end': True, 'philosophi': True, 'begin': True, 'alchemi': True, 'chemistri': True, 'astrolog': True, 'astronomi': True, 'great': True, 'christoph': True, 'hitchen': True, 'enlighten': True, ':/': True, 'christopherhitchens.net/': True}


In [244]:
# positive tweets feature set
positive_tweets_set = []
for tweet in positive_tweets:
    positive_tweets_set.append((bag_of_words(tweet), 'pos'))

# negative tweets feature set
negative_tweets_set = []
for tweet in negative_tweets:
    negative_tweets_set.append((bag_of_words(tweet), 'neg'))

print (len(positive_tweets_set), len(negative_tweets_set))

5000 5000


#### Train and Test Set
- <font color=blue> **Test Set**</font> - [:1000] for 1000 samples <br>
- <font color=blue> **Train Set**</font> - [1000:] means remaining samples after deducting 1000 <br>

In [245]:
# randomize positive_tweet_set and negative_tweets_set

from random import shuffle 
shuffle(positive_tweets_set)
shuffle(negative_tweets_set)

test_set = positive_tweets_set[:1000] + negative_tweets_set[:1000]
train_set = positive_tweets_set[1000:] + negative_tweets_set[1000:]

print(len(test_set),  len(train_set))

2000 8000


#### Classifier Training and Accuracy  
<font color=blue> **NaiveBayesClassifier**</font> - paramaterized by 2 probability distribution <br>
- P(label) probablity that an input will receive each label, given no info about its features <br>
- P(fname=fval|label) probability that a given feature (fname) will receive a given value (fval), given its label

References:
- https://www.nltk.org/book/ch06.html
- https://monkeylearn.com/blog/practical-explanation-naive-bayes-classifier/



In [246]:
from nltk import classify
from nltk import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(train_set)
accuracy = classify.accuracy(classifier, test_set)

print(accuracy)
print(classifier.show_most_informative_features(10))

0.7435
Most Informative Features
                     bam = True              pos : neg    =     25.7 : 1.0
                     sad = True              neg : pos    =     20.8 : 1.0
                     x15 = True              neg : pos    =     20.3 : 1.0
                 appreci = True              pos : neg    =     17.7 : 1.0
                 retweet = True              pos : neg    =     15.0 : 1.0
                     ugh = True              neg : pos    =     15.0 : 1.0
                    glad = True              pos : neg    =     13.4 : 1.0
                 congrat = True              pos : neg    =     13.0 : 1.0
                 definit = True              pos : neg    =     13.0 : 1.0
                     via = True              pos : neg    =     13.0 : 1.0
None


<font color=blue> **How to Interpret** </font> NaiveBayesClassifier <br>
- **'show_most_informative_features()'** specifies the top features that are most effective in classifying pos or neg
-  **0.7435** - the % of tweets correctly classified
- Example
> - **'sad'** in the training set is TRUE (it's in the bag_of_words)
> - **'sad'** appeared in 'neg' or negative_tweets 20.8 times that in 'pos' or positive_tweets

<font color=red>**The sets are randomly shuffled, so results will change**</font>

#### Testing Classifier

In [240]:
# Can the classifier correctly classify this negative tweet?
test_tweet = "@JustinTrudeau when are you going to crucify the b@astards who killed all those children?!?"
test_tweet_set = bag_of_words(test_tweet)
print (classifier.classify(test_tweet_set))

neg


In [247]:
# Can the classifier correctly classify this positive tweet?
test_tweet = "Happy Canada day y'all!!!"
test_tweet_set = bag_of_words(test_tweet)
print (classifier.classify(test_tweet_set))

pos


### scikit-learn Classifier

In [254]:
!pip install scikit-learn
from sklearn.naive_bayes import (
    BernoulliNB,
    ComplementNB,
    MultinomialNB,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis



In [255]:
classifiers = {
    "BernoulliNB": BernoulliNB(),
    "ComplementNB": ComplementNB(),
    "MultinomialNB": MultinomialNB(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(),
    "MLPClassifier": MLPClassifier(max_iter=1000),
    "AdaBoostClassifier": AdaBoostClassifier(),
}

In [258]:
from random import shuffle 
shuffle(positive_tweets_set)
shuffle(negative_tweets_set)

for name, sklearn_classifier in classifiers.items():
         classifier = nltk.classify.SklearnClassifier(sklearn_classifier)
         classifier.train(train_set)
         accuracy = nltk.classify.accuracy(classifier, test_set)
         print(F"{accuracy:.2%} - {name}")

73.90% - BernoulliNB
74.25% - ComplementNB
74.25% - MultinomialNB
65.45% - KNeighborsClassifier
68.90% - DecisionTreeClassifier
72.20% - RandomForestClassifier
73.50% - LogisticRegression
68.85% - MLPClassifier
67.15% - AdaBoostClassifier


## Others

### Precision, Recall, F1-Score, Confusion Matrix
<font color=red>**EXPLORE FURTHER**</font>
https://blog.chapagain.com.np/python-nltk-twitter-sentiment-analysis-natural-language-processing-nlp/

In [253]:
from collections import defaultdict

actual_set = defaultdict(set)
predicted_set = defaultdict(set)

actual_set_cm = []
predicted_set_cm = []

for index, (feature, actual_label) in enumerate(test_set):
    actual_set[actual_label].add(index)
    actual_set_cm.append(actual_label)

    predicted_label = classifier.classify(feature)

    predicted_set[predicted_label].add(index)
    predicted_set_cm.append(predicted_label)
    
from nltk.metrics import precision, recall, f_measure, ConfusionMatrix

print ('Precision - Positive:', precision(actual_set['pos'], predicted_set['pos']))
print ('Recall - Positive:', recall(actual_set['pos'], predicted_set['pos']))  
print ('F-measure - Positive:', f_measure(actual_set['pos'], predicted_set['pos']))  

print ('Precision - Negative:', precision(actual_set['neg'], predicted_set['neg']))  
print ('Recall - Negative:', recall(actual_set['neg'], predicted_set['neg']))  
print ('F-measure - Negative:', f_measure(actual_set['neg'], predicted_set['neg']))

cm = ConfusionMatrix(actual_set_cm, predicted_set_cm)
print (cm)

Precision - Positive: 0.7321258341277407
Recall - Positive: 0.768
F-measure - Positive: 0.7496339677891655
Precision - Negative: 0.7560462670872765
Recall - Negative: 0.719
F-measure - Negative: 0.7370579190158891
    |   n   p |
    |   e   o |
    |   g   s |
----+---------+
neg |<719>281 |
pos | 232<768>|
----+---------+
(row = reference; col = test)



### Frequency Distribution

In [227]:
positive_fd = nltk.FreqDist(clean_pos_tweets)
negative_fd = nltk.FreqDist(clean_neg_tweets)
common_set = set(positive_fd).intersection(negative_fd)

for word in common_set:
    del positive_fd[word]
    del negative_fd[word]
    
top_100_pos = {word for word, count in positive_fd.most_common(100)}
top_100_neg = {word for word, count in negative_fd.most_common(100)}

### Setting up bigram

In [228]:
# Set up positive and negative bigram finders

positive_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(clean_pos_tweets)
negative_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(clean_neg_tweets)
print('--------- Positive Collocation ---------')
positive_bigram_finder.ngram_fd.most_common(5)

--------- Positive Collocation ---------


[(('new', 'follow'), 65),
 (('follow', 't.co/rcvcyyo0iq'), 62),
 (('t.co/rcvcyyo0iq', 'follow'), 62),
 (('follow', 'u'), 62),
 (('u', 'back'), 62)]

In [229]:
print('--------- Negative Collocation ---------')
negative_bigram_finder.ngram_fd.most_common(5)

--------- Negative Collocation ---------


[(('♛', '♛'), 141),
 (('》', '》'), 140),
 (('n', '》'), 70),
 (('pleas', 'follow'), 61),
 (('thank', 'n'), 52)]

### Part of Speech

In [None]:
from statistics import mean

def is_positive(tweets: str) -> bool:
    """True if tweet has positive compound sentiment, False otherwise."""
    text = nltk.corpus.twitter_samples.raw(tweets)
    scores = [
        sia.polarity_scores(sentence)["compound"]
        for sentence in nltk.sent_tokenize(text)
    ]
    return mean(scores) > 0

In [124]:
# test
test_pos = twitter_samples.tokenized('positive_tweets.json')[0] #1st string
print(test_pos[0]) # first word
print(test_pos[1]) # 2nd word
print(test_pos[2]) # 3rd word

#FollowFriday
@France_Inte
@PKuchly57


In [125]:
# test
test_pos = twitter_samples.tokenized('positive_tweets.json')[1] #2nd string
print(test_pos[0]) # first word
print(test_pos[1]) # 2nd word
print(test_pos[2]) # 3rd word

@Lamb2ja
Hey
James


In [126]:
# test
test_pos = twitter_samples.tokenized('positive_tweets.json') # no specified string sets
print(test_pos[0]) # 1st tweet
print(test_pos[1]) # 2nd tweet
print(test_pos[2]) # 3rd tweet

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']
['@Lamb2ja', 'Hey', 'James', '!', 'How', 'odd', ':/', 'Please', 'call', 'our', 'Contact', 'Centre', 'on', '02392441234', 'and', 'we', 'will', 'be', 'able', 'to', 'assist', 'you', ':)', 'Many', 'thanks', '!']
['@DespiteOfficial', 'we', 'had', 'a', 'listen', 'last', 'night', ':)', 'As', 'You', 'Bleed', 'is', 'an', 'amazing', 'track', '.', 'When', 'are', 'you', 'in', 'Scotland', '?', '!']


In [127]:
nltk.download('wordnet') # determines the base word

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/BGBlanco/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [128]:
# Part of Speech Tag
from nltk.tag import pos_tag
print(pos_tag(test_pos[0]))

# JJ - adjective; NNP - proper noun; IN - preposition or conjunction; VBG - verb, part participle; 
# NNS - plural noun; NN - singular noun; PRP$ - possive noun

[('#FollowFriday', 'JJ'), ('@France_Inte', 'NNP'), ('@PKuchly57', 'NNP'), ('@Milipol_Paris', 'NNP'), ('for', 'IN'), ('being', 'VBG'), ('top', 'JJ'), ('engaged', 'VBN'), ('members', 'NNS'), ('in', 'IN'), ('my', 'PRP$'), ('community', 'NN'), ('this', 'DT'), ('week', 'NN'), (':)', 'NN')]


In [129]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

from random import shuffle

def is_positive(tweets: str) -> bool:
    """True if tweet has positive compound sentiment, False otherwise."""
    return sia.polarity_scores(tweets)["compound"] > 0

shuffle(combined_tweets)
for tweets in combined_tweets[:2]:
    print(">", is_positive(tweets), tweets)

> False Ed Miliband is a tool
> True @dylanobrien @MazeRunnerMovie IT IS JUST BEYOND WORDS, sooooooo freaking good! I'm going to die before it comes out!! :D


In [116]:
from statistics import mean

def is_positive(tweets: str) -> bool:
    """True if tweet has positive compound sentiment, False otherwise."""
    text = nltk.corpus.twitter_samples.raw(tweets)
    scores = [
        sia.polarity_scores(sentence)["compound"]
        for sentence in nltk.sent_tokenize(text)
    ]
    return mean(scores) > 0

In [130]:
shuffle(combined_tweets)
correct = 0

for tweets in combined_tweets:
    if is_positive(tweets):
        if tweets in positive_tweets:
            correct +=1
    else:
        if tweets in negative_tweets:
            correct +=1
            
print(F"{correct / len(combined_tweets):.2%} correct")

28.56% correct
