In [1]:
import pandas as pd
import nltk
from pprint import pprint
from typing import List

In [2]:
nltk.download(["names","stopwords","state_union","twitter_samples","movie_reviews","averaged_perceptron_tagger","vader_lexicon","punkt",])

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\ineso\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ineso\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package state_union to
[nltk_data]     C:\Users\ineso\AppData\Roaming\nltk_data...
[nltk_data]   Package state_union is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\ineso\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\ineso\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ineso\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up

True

In [3]:
df = pd.read_csv('../csv/reviews_clean.csv')
df.head()

Unnamed: 0,link,review
0,https://goodreads.com/book/show/1001220.Anarch...,This collection of essays is from 1994 and the...
1,https://goodreads.com/book/show/1001077.Hungar...,"When I read this book, I did not know what to ..."
2,https://goodreads.com/book/show/1001126.Hawaii,This was good. Very extensive. Helped in plann...
3,https://goodreads.com/book/show/1001092.Compet...,"No atendió mi expectativa, puede que se deba p..."
4,https://goodreads.com/book/show/10011431-r101,"Great book, full with construction details and..."


In [4]:
words = [w for w in nltk.corpus.state_union.words() if w.isalpha()]

In [5]:
stopwords = nltk.corpus.stopwords.words("english")

In [6]:
words = [w for w in words if w.lower() not in stopwords]

In [7]:
text = """
For some quick analysis, creating a corpus could be overkill.
If all you need is a word list,
there are simpler ways to achieve that goal."""
pprint(nltk.word_tokenize(text), width=79, compact=True)

['For', 'some', 'quick', 'analysis', ',', 'creating', 'a', 'corpus', 'could',
 'be', 'overkill', '.', 'If', 'all', 'you', 'need', 'is', 'a', 'word', 'list',
 ',', 'there', 'are', 'simpler', 'ways', 'to', 'achieve', 'that', 'goal', '.']


In [8]:
words: List[str] = nltk.word_tokenize(text)
fd = nltk.FreqDist(words)

In [9]:
fd.most_common(3)

[(',', 2), ('a', 2), ('.', 2)]

In [10]:
fd.tabulate(3)

, a . 
2 2 2 


In [11]:
lower_fd = nltk.FreqDist([w.lower() for w in fd])

In [12]:
lower_fd.most_common(3)

[(',', 1), ('a', 1), ('.', 1)]

In [13]:
text = nltk.Text(nltk.corpus.state_union.words())
text.concordance("america", lines=5)

Displaying 5 of 1079 matches:
 would want us to do . That is what America will do . So much blood has already
ay , the entire world is looking to America for enlightened leadership to peace
beyond any shadow of a doubt , that America will continue the fight for freedom
 to make complete victory certain , America will never become a party to any pl
nly in law and in justice . Here in America , we have labored long and hard to 


In [14]:
concordance_list = text.concordance_list("america", lines=2)
for entry in concordance_list:
    print(entry.line)

 would want us to do . That is what America will do . So much blood has already
ay , the entire world is looking to America for enlightened leadership to peace


In [15]:
from nltk.sentiment import SentimentIntensityAnalyzer
from random import shuffle
from statistics import mean

In [16]:
sia = SentimentIntensityAnalyzer()
sia.polarity_scores("Wow, NLTK is really powerful!")

{'neg': 0.0, 'neu': 0.295, 'pos': 0.705, 'compound': 0.8012}

In [17]:
reviews = list(df['review'])

In [18]:
def is_positive(tweet: str) -> bool:
    """True if tweet has positive compound sentiment, False otherwise."""
    return sia.polarity_scores(tweet)["compound"] > 0

In [25]:
positive_review_ids = nltk.corpus.movie_reviews.fileids(categories=["pos"])
negative_review_ids = nltk.corpus.movie_reviews.fileids(categories=["neg"])
all_review_ids = positive_review_ids + negative_review_ids

In [26]:
def is_positive(text: str) -> bool:
    """True if the average of all sentence compound scores is positive."""
    scores = [
        sia.polarity_scores(sentence)["compound"]
        for sentence in nltk.sent_tokenize(text)
    ]
    return mean(scores) > 0

In [28]:
for review in reviews[:10]:
    print(is_positive(review))
    print(review)

True
This collection of essays is from 1994 and therefore, a lot of people in the anarchist movement might be already familiar with its ideas, in general terms.Nevertheless it's an interesting look at anarchism and its relation to the natural world, though the title itself its somewhat misleading in that the scope of the book is broader than the title implies, there are essays on anarchism's relationship with feminism and a critique of Murray Bookchin, which rehearses a few of the factionalist controversies of the time, between left and post-left, lifestyle anarchism, traditional anarcho-syndicalism, which, together with the arcane differences between sociobiology and social ecology, have always failed to hold my interest.Briefly, I can say that I come down more on the side of the seemingly more pragmatic Bookchin. The essays here presented are more engaging when they deal with more practical subjects, I found the essays on feminism, on animal rights, and on self-defense, say, more apt

In [19]:
unwanted = nltk.corpus.stopwords.words("english")
unwanted.extend([w.lower() for w in nltk.corpus.names.words()])

def skip_unwanted(pos_tuple):
    word, tag = pos_tuple
    if not word.isalpha() or word in unwanted:
        return False
    if tag.startswith("NN"):
        return False
    return True

positive_words = [word for word, tag in filter(
    skip_unwanted,
    nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["pos"]))
)]
negative_words = [word for word, tag in filter(
    skip_unwanted,
    nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["neg"]))
)]

In [20]:
positive_fd = nltk.FreqDist(positive_words)
negative_fd = nltk.FreqDist(negative_words)

common_set = set(positive_fd).intersection(negative_fd)

for word in common_set:
    del positive_fd[word]
    del negative_fd[word]

top_100_positive = {word for word, count in positive_fd.most_common(100)}
top_100_negative = {word for word, count in negative_fd.most_common(100)}

In [21]:
unwanted = nltk.corpus.stopwords.words("english")
unwanted.extend([w.lower() for w in nltk.corpus.names.words()])

positive_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words([
    w for w in nltk.corpus.movie_reviews.words(categories=["pos"])
    if w.isalpha() and w not in unwanted
])
negative_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words([
    w for w in nltk.corpus.movie_reviews.words(categories=["neg"])
    if w.isalpha() and w not in unwanted
])

In [22]:
def extract_features(text):
    features = dict()
    wordcount = 0
    compound_scores = list()
    positive_scores = list()

    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            if word.lower() in top_100_positive:
                wordcount += 1
        compound_scores.append(sia.polarity_scores(sentence)["compound"])
        positive_scores.append(sia.polarity_scores(sentence)["pos"])

    # Adding 1 to the final compound score to always have positive numbers
    # since some classifiers you'll use later don't work with negative numbers.
    features["mean_compound"] = mean(compound_scores) + 1
    features["mean_positive"] = mean(positive_scores)
    features["wordcount"] = wordcount

    return features

In [23]:
features = [
    (extract_features(nltk.corpus.movie_reviews.raw(review)), "pos")
    for review in nltk.corpus.movie_reviews.fileids(categories=["pos"])
]
features.extend([
    (extract_features(nltk.corpus.movie_reviews.raw(review)), "neg")
    for review in nltk.corpus.movie_reviews.fileids(categories=["neg"])
])

In [43]:
train_count = int(len(features) * 0.8)
shuffle(features)
classifier = nltk.NaiveBayesClassifier.train(features[:train_count])
classifier.show_most_informative_features(10)
nltk.classify.accuracy(classifier, features[train_count:])

Most Informative Features
               wordcount = 3                 pos : neg    =     25.2 : 1.0
               wordcount = 5                 pos : neg    =     13.1 : 1.0
               wordcount = 4                 pos : neg    =      5.0 : 1.0
               wordcount = 2                 pos : neg    =      4.0 : 1.0
               wordcount = 0                 neg : pos    =      1.7 : 1.0
               wordcount = 1                 pos : neg    =      1.5 : 1.0
           mean_positive = 0.05566666666666667    pos : neg    =      1.0 : 1.0
           mean_positive = 0.08418181818181818    pos : neg    =      1.0 : 1.0
           mean_positive = 0.08611764705882353    pos : neg    =      1.0 : 1.0
           mean_positive = 0.09154545454545454    pos : neg    =      1.0 : 1.0


0.6525

In [45]:
from sklearn.naive_bayes import (
    BernoulliNB,
    ComplementNB,
    MultinomialNB,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [47]:
classifiers = {
    "BernoulliNB": BernoulliNB(),
    "ComplementNB": ComplementNB(),
    "MultinomialNB": MultinomialNB(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(),
    "MLPClassifier": MLPClassifier(max_iter=1000),
    "AdaBoostClassifier": AdaBoostClassifier(),
}

In [48]:
train_count = int(len(features) * 0.8)
shuffle(features)
for name, sklearn_classifier in classifiers.items():
    classifier = nltk.classify.SklearnClassifier(sklearn_classifier)
    classifier.train(features[:train_count])
    accuracy = nltk.classify.accuracy(classifier, features[train_count:])
    print(F"{accuracy:.2%} - {name}")

70.00% - BernoulliNB
70.00% - ComplementNB
70.00% - MultinomialNB
71.25% - KNeighborsClassifier
66.75% - DecisionTreeClassifier
73.25% - RandomForestClassifier
76.75% - LogisticRegression
75.75% - MLPClassifier
75.00% - AdaBoostClassifier


In [58]:
train_count = int(len(features) * 0.8)
shuffle(features)
classifier = nltk.classify.SklearnClassifier(MLPClassifier(max_iter=1000))
classifier.train(features[:train_count])
accuracy = nltk.classify.accuracy(classifier, features[train_count:])
print(F"{accuracy:.2%} - {'MLP'}")

72.00% - MLP


In [59]:
review = reviews[20]
print(review)
print(classifier.classify(extract_features(review)))

An excellent overview of Indian experiences in the Civil War, focusing on several tribes and individuals. Covers military experiences as well as home life, and broader effects on tribes. Some background on their histories before and after the war. Short, but detailed. 
neg
