In [1]:
import nltk

In [2]:
nltk.download(["names","stopwords","state_union","twitter_samples","movie_reviews","averaged_perceptron_tagger","vader_lexicon","punkt",])

[nltk_data] Downloading package names to /Users/quratul-
[nltk_data]     ainmahesar/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/quratul-
[nltk_data]     ainmahesar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package state_union to /Users/quratul-
[nltk_data]     ainmahesar/nltk_data...
[nltk_data]   Package state_union is already up-to-date!
[nltk_data] Downloading package twitter_samples to /Users/quratul-
[nltk_data]     ainmahesar/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package movie_reviews to /Users/quratul-
[nltk_data]     ainmahesar/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/quratul-ainmahesar/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       da

True

In [3]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [4]:
sia = SentimentIntensityAnalyzer()

In [5]:
sia.polarity_scores("Wow, NLTK is really powerful!")

{'neg': 0.0, 'neu': 0.295, 'pos': 0.705, 'compound': 0.8012}

In [23]:
tweets = [t.replace("://", "//") for t in nltk.corpus.twitter_samples.strings()]

In [24]:
from random import shuffle

def is_positive(tweet: str) -> bool:
    """True if tweet has positive compound sentiment, False otherwise."""
    return sia.polarity_scores(tweet)["compound"] > 0

shuffle(tweets)
for tweet in tweets[:10]:
    print(">", is_positive(tweet), tweet)

> False RT @SkyNews: Why won't David Cameron debate with Ed Miliband directly? #GE2015  http//t.co/n5H9t7LmuO
> True @davidc79 thx! And Android support in by Xmas :)
> True @adnanCad Planted people are allowed to do any thing and every thing :p
> True RT @MartinSLewis: Miliband has grown in presentational confidence enormously - he had more humour and warmth than Cameron. #bbcqt
> False @bellacaledonia trouble is, Ed didnt call it a "Vow" to never work with Snp because (as we all know) that would be meaningless &amp; breakable
> False RT @ShropshireStar: Now Ukip member’s campaign posters defaced in #Ludlow campaign http//t.co/uoigIaikK2 http//t.co/fP51GOPF0v
> True why does everyone like my voice i sound like ed miliband choking on sandwich
> False @MrsPiszczek nein :(
> True RT @NicolaSturgeon: If Miliband is going to let Tories in rather than work with SNP, we will definitely need lots of SNP MPs to protect Sco…
> False VIDEO: 'I will never apologise' for coalition: Lib Dem leader N

In [25]:
positive_review_ids = nltk.corpus.movie_reviews.fileids(categories=["pos"])
negative_review_ids = nltk.corpus.movie_reviews.fileids(categories=["neg"])
all_review_ids = positive_review_ids + negative_review_ids

In [26]:
from statistics import mean

def is_positive(review_id: str) -> bool:
    """True if the average of all sentence compound scores is positive."""
    text = nltk.corpus.movie_reviews.raw(review_id)
    scores = [
        sia.polarity_scores(sentence)["compound"]
        for sentence in nltk.sent_tokenize(text)
    ]
    return mean(scores) > 0

In [27]:
shuffle(all_review_ids)
correct = 0
for review_id in all_review_ids:
     if is_positive(review_id):
        if review_id in positive_review_ids:
            correct += 1
        else:
             if review_id in negative_review_ids:
                correct += 1

print(F"{correct / len(all_review_ids):.2%} correct")



69.10% correct


In [28]:
unwanted = nltk.corpus.stopwords.words("english")

unwanted.extend([w.lower() for w in nltk.corpus.names.words()])


def skip_unwanted(pos_tuple):

    word, tag = pos_tuple

    if not word.isalpha() or word in unwanted:

        return False

    if tag.startswith("NN"):

        return False

    return True


positive_words = [word for word, tag in filter(

    skip_unwanted,

    nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["pos"]))

)]

negative_words = [word for word, tag in filter(

    skip_unwanted,

    nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["neg"]))

)]

In [29]:
positive_fd = nltk.FreqDist(positive_words)
negative_fd = nltk.FreqDist(negative_words)

common_set = set(positive_fd).intersection(negative_fd)

for word in common_set:
    del positive_fd[word]
    del negative_fd[word]

top_100_positive = {word for word, count in positive_fd.most_common(100)}
top_100_negative = {word for word, count in negative_fd.most_common(100)}


In [30]:
def extract_features(text):
    features = dict()
    wordcount = 0
    compound_scores = list()
    positive_scores = list()

    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            if word.lower() in top_100_positive:
                wordcount += 1
        compound_scores.append(sia.polarity_scores(sentence)["compound"])
        positive_scores.append(sia.polarity_scores(sentence)["pos"])

    # Adding 1 to the final compound score to always have positive numbers
    # since some classifiers you'll use later don't work with negative numbers.
    features["mean_compound"] = mean(compound_scores) + 1
    features["mean_positive"] = mean(positive_scores)
    features["wordcount"] = wordcount

    return features

In [31]:
features = [
    (extract_features(nltk.corpus.movie_reviews.raw(review)), "pos")
    for review in nltk.corpus.movie_reviews.fileids(categories=["pos"])
]
features.extend([
    (extract_features(nltk.corpus.movie_reviews.raw(review)), "neg")
    for review in nltk.corpus.movie_reviews.fileids(categories=["neg"])
])

In [32]:
train_count = len(features) // 4
shuffle(features)
classifier = nltk.NaiveBayesClassifier.train(features[:train_count])
classifier.show_most_informative_features(10)
nltk.classify.accuracy(classifier, features[train_count:])

Most Informative Features
               wordcount = 3                 pos : neg    =      6.7 : 1.0
               wordcount = 2                 pos : neg    =      3.5 : 1.0
               wordcount = 0                 neg : pos    =      1.5 : 1.0
               wordcount = 1                 pos : neg    =      1.1 : 1.0
               wordcount = 4                 pos : neg    =      1.1 : 1.0
           mean_positive = 0.10585714285714286    pos : neg    =      1.0 : 1.0
           mean_positive = 0.12558333333333332    pos : neg    =      1.0 : 1.0


0.6753333333333333