In [1]:
import nltk
from pprint import pprint

In [2]:
words = [w for w in nltk.corpus.state_union.words() if w.isalpha()]

In [3]:
stopwords = nltk.corpus.stopwords.words("english")

In [4]:
words =[w for w in words if w.lower() not in stopwords]

In [7]:
fd = nltk.FreqDist(words)

In [8]:
fd.most_common(3)

[('must', 1568), ('people', 1291), ('world', 1128)]

In [9]:
fd.tabulate(3)

  must people  world 
  1568   1291   1128 


In [13]:
text = nltk.Text(nltk.corpus.state_union.words())
text.concordance("america", lines=5)

Displaying 5 of 1079 matches:
 would want us to do . That is what America will do . So much blood has already
ay , the entire world is looking to America for enlightened leadership to peace
beyond any shadow of a doubt , that America will continue the fight for freedom
 to make complete victory certain , America will never become a party to any pl
nly in law and in justice . Here in America , we have labored long and hard to 


In [16]:
words = [w for w in nltk.corpus.state_union.words() if w.isalpha()]
finder = nltk.collocations.TrigramCollocationFinder.from_words(words)

In [20]:
finder.ngram_fd.most_common(2)
finder.ngram_fd.tabulate(2)

  ('the', 'United', 'States') ('the', 'American', 'people') 
                          294                           185 


In [25]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
sia.polarity_scores("Wow, NLTK is reall powerful!")
sia.polarity_scores("today was great!")

{'neg': 0.0, 'neu': 0.313, 'pos': 0.687, 'compound': 0.6588}

In [26]:
tweets = [t.replace("://", "//") for t in nltk.corpus.twitter_samples.strings()]

In [28]:
from random import shuffle

def is_positive(tweet: str) -> bool:
    """True if tweet has positive compound sentiment, False otherwise."""
    return sia.polarity_scores(tweet)["compound"] > 0

shuffle(tweets)
for tweet in tweets[:10]:
    print(">", is_positive(tweet),'::::SPACER::::', tweet)

> False ::::SPACER:::: finished downloading more than 10 movies and one complete tv series from last night. Now… WHERE.DO.I.START. :)))))) #moviemarathon
> True ::::SPACER:::: @fouraroundworld thanks for sharing! Wishing you a wicked weekend :)
> True ::::SPACER:::: RT @abstex: The FT is backing the Tories. On an unrelated note, here's a photo of FT leader writer Jonathan Ford (next to Boris) http//t.c…
> False ::::SPACER:::: The show is called "Ask Nigel Farage", not "Nigel Farage answers what is asked". #AskNigelFarage
> False ::::SPACER:::: Packing is such a nightmare :(
> False ::::SPACER:::: @TheLastLeg #milibrandcuts 'discussing' tory manifesto and Johnny rock-hard?
> True ::::SPACER:::: RT @UKIP: Cheers! Official @UKIP account passes 100k followers #UKIP http//t.co/mdJ2MC95ek
> True ::::SPACER:::: Tomorrow at the #GoldCoast :) http//t.co/yRat44jWAy
> True ::::SPACER:::: RT @davidtorrance: Minority government will allow Ed Miliband to call Nicola Sturgeon’s bluff | Martin Kettle 

In [30]:
positive_review_ids = nltk.corpus.movie_reviews.fileids(categories=["pos"])
negative_review_ids = nltk.corpus.movie_reviews.fileids(categories=["neg"])
all_review_ids = positive_review_ids + negative_review_ids

In [39]:
from statistics import mean
def is_positive(review_id: str) -> bool:
    """True if the average of all sentance compound scores is positive."""
    text = nltk.corpus.movie_reviews.raw(review_id)
    scores = [
        sia.polarity_scores(sentance)["compound"]
        for sentance in nltk.sent_tokenize(text)
    ]
    return mean(scores) > 0

In [42]:
shuffle(all_review_ids)
correct = 0
for review_id in all_review_ids:
    if is_positive(review_id):
        if review_id in positive_review_ids:
            correct += 1
    else: 
        if review_id in negative_review_ids:
            correct += 1

print(F"{correct / len(all_review_ids):.2%} correct")

64.00% correct


In [49]:
unwanted = nltk.corpus.stopwords.words("english")
unwanted.extend([w.lower() for w in nltk.corpus.names.words()])

def skip_unwanted(pos_tuple):
    word, tag = pos_tuple
    if not word.isalpha() or word in unwanted:
        return False
    if tag.startswith("NN"):
        return False
    return True

positive_words = [word for word, tag in filter(
    skip_unwanted,
    nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=['pos']))
)]
negative_words = [word for word, tag in filter(
    skip_unwanted,
    nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["neg"]))
                )]

In [52]:
positive_fd = nltk.FreqDist(positive_words)
negative_fd = nltk.FreqDist(negative_words)

common_set = set(positive_fd).intersection(negative_fd)

for word in common_set:
    del  positive_fd[word]
    del negative_fd[word]
    
top_100_positive = {word for word, count in positive_fd.most_common(100)}
top_100_negative = {word for word, count in negative_fd.most_common(100)}

In [56]:
unwanted = nltk.corpus.stopwords.words("english")
unwanted.extend([w.lower() for w in nltk.corpus.names.words()])

postiive_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(
w for w in nltk.corpus.movie_reviews.words(categories=["pos"])
if w.isalpha() and w not in unwanted
    )

negative_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(
    w for w in nltk.corpus.movie_reviews.words(categories=["neg"])
    if w.isalpha() and w not in unwanted
    )

In [None]:
### USE THE ABOVE FEATURES TO MAKE A BETTER SCORING FILTER

# Training and using a Classifier

In [57]:
def extract_features(text):
    features = dict()
    wordcount = 0
    compound_scores = list()
    positive_scores = list()
    
    for sentance in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentance):
            if word.lower() in top_100_positive:
                wordcount += 1
        compound_scores.append(sia.polarity_scores(sentance)["compound"])
        positive_scores.append(sia.polarity_scores(sentance)["pos"])
        
    # Adding 1 to the final compound score to always have positive numbers
    # since some classifiers used later don't work with negative numbers.
    
    features["mean_compound"] = mean(compound_scores) + 1
    features["mean_positive"] = mean(positive_scores)
    features["wordcount"] = wordcount
    
    return features

In [64]:
features = [
    (extract_features(nltk.corpus.movie_reviews.raw(review)), "pos")
    for review in nltk.corpus.movie_reviews.fileids(categories=["pos"])
]
features.extend([
    (extract_features(nltk.corpus.movie_reviews.raw(review)), "neg")
    for review in nltk.corpus.movie_reviews.fileids(categories=["neg"])
])

In [65]:
train_count = len(features) // 4
shuffle(features)
classifier = nltk.NaiveBayesClassifier.train(features[:train_count])
classifier.show_most_informative_features(10)

Most Informative Features
               wordcount = 2                 pos : neg    =      4.0 : 1.0
               wordcount = 0                 neg : pos    =      1.7 : 1.0
               wordcount = 1                 pos : neg    =      1.4 : 1.0


In [66]:
nltk.classify.accuracy(classifier, features[train_count:])

0.65

# Other Classifiers

# Sklearn

In [69]:
from sklearn.naive_bayes import (BernoulliNB, ComplementNB, MultinomialNB,)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [70]:
classifiers = {
    "BernoulliNB": BernoulliNB(),
    "ComplementNB": ComplementNB(),
    "MultinomialNB": MultinomialNB(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(),
    "MLPClassifier": MLPClassifier(max_iter=1000),
    "AdaBoostClassifier": AdaBoostClassifier(),
}

In [71]:
features = [
    (extract_features(nltk.corpus.movie_reviews.raw(review)), "pos")
    for review in nltk.corpus.movie_reviews.fileids(categories=["pos"])
]
features.extend([
    (extract_features(nltk.corpus.movie_reviews.raw(review)), "neg")
    for review in nltk.corpus.movie_reviews.fileids(categories=["neg"])
])

In [72]:
train_count = len(features) // 4
shuffle(features)
for name, sklearn_classifier in classifiers.items():
    classifier = nltk.classify.SklearnClassifier(sklearn_classifier)
    classifier.train(features[:train_count])
    accuracy = nltk.classify.accuracy(classifier, features[train_count:])
    print(F"{accuracy:.2%} - {name}")

65.60% - BernoulliNB
65.60% - ComplementNB
65.60% - MultinomialNB
69.00% - KNeighborsClassifier
65.27% - DecisionTreeClassifier
68.73% - RandomForestClassifier
69.60% - LogisticRegression
70.60% - MLPClassifier
69.33% - AdaBoostClassifier
