## Sentiment Intensity Analyzer


In [1]:
%matplotlib inline
import urllib.request, os, gzip

datadir = './data/'

def download_data(dataset_name, datadir):
    filename = 'reviews_%s_5.json' % dataset_name
    filepath = os.path.join(datadir, filename)
    if os.path.exists(filepath):
        print("Dataset %s has already been downloaded to %s" % (dataset_name, datadir))
    else:
        url = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/%s.gz' % filename
        urllib.request.urlretrieve(url, filepath + ".gz")
        with gzip.open(filepath + ".gz", 'rb') as fin:
            with open(filepath, 'wb') as fout:
                fout.write(fin.read())
        print("Downloaded dataset %s and saved it to %s" % (dataset_name, datadir))

dataset = "Baby"
download_data(dataset, datadir)

Dataset Baby has already been downloaded to ./data/


In [2]:
import json

def load_data(dataset_name, datadir):
    filepath = os.path.join(datadir, 'reviews_%s_5.json' % dataset_name)
    if not os.path.exists(filepath):
        download_data(dataset_name, datadir)
    data = []
    with open(filepath, 'r') as f:
        for line in f:                            # read file line by line
            item_hash = hash(line)                # we will use this later for partitioning our data 
            item = json.loads(line)               # convert JSON string to Python dict
            item['hash'] = item_hash              # add hash for identification purposes
            data.append(item)
    print("Loaded %d data for dataset %s" % (len(data), dataset_name))
    return data

# load the data...
baby = load_data(dataset, datadir)

Loaded 160792 data for dataset Baby


In [3]:
def partition_train_validation_test(data):
    # 60% : modulus is 0, 1, 2, 3, 4, or 5
    data_train = [item for item in data if item['hash'] % 10 <= 5]
    # 20% : modulus is 6 or 7
    data_valid = [item for item in data if item['hash'] % 10 in [6,7]] 
    # 20% : modulus is 8 or 9
    data_test  = [item for item in data if item['hash'] % 10 in [8,9]] 
    return data_train, data_valid, data_test
    
baby_train, baby_valid, baby_test = partition_train_validation_test(baby)

print("We have", len(baby_train), "training examples,", len(baby_valid),
      "validation examples, and", len(baby_test), "test examples.")

We have 96102 training examples, 32338 validation examples, and 32352 test examples.


In [5]:
from nltk.corpus import opinion_lexicon
import random

positive_words = set(opinion_lexicon.positive())
negative_words = set(opinion_lexicon.negative())

random.seed(1234)
print("Some positive words:", ", ".join(random.sample(positive_words, 10)))
print("Some negative words:", ", ".join(random.sample(negative_words, 10)))

intersection = positive_words & negative_words
print("Words that appear in both sets: " + ", ".join(intersection))

Some positive words: like, empathize, attractively, logical, invigorate, eagerly, positives, succes, outstandingly, invaluable
Some negative words: lame, demoralizing, smack, brash, frustrated, anti-white, whore, stall, hung, pillage
Words that appear in both sets: enviousness, envious, enviously


In [6]:
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# English language stop words
eng_stopwords = set(stopwords.words('english'))

def my_tokenize(text):
    """
    Split text into lower-case tokens, removing all-punctuation tokens and stopwords
    """
    tokens = []
    for sentence in sent_tokenize(text):
        tokens.extend(x for x in word_tokenize(sentence.lower()) 
                      if x not in eng_stopwords and any(i.isalpha() for i in x))
    return tokens

def pos_neg_fraction(text):
    """
    Return the fraction of positive and negative words in a text
    """
    tokens = my_tokenize(text)
    count_pos, count_neg = 0, 0
    for t in tokens:
        if t in positive_words:
            count_pos += 1
        if t in negative_words:
            count_neg += 1
    count_all = len(tokens)
    if count_all != 0:
        return count_pos/count_all, count_neg/count_all
    else:
        return 0., 0.

In [7]:
import numpy

def dataset_to_matrix(data):
    """Extract our feature matrix from the dataset"""
    return numpy.array([list(pos_neg_fraction(item['reviewText'])) for item in data])

X_train = dataset_to_matrix(baby_train)

def dataset_to_targets(data):
    """Extract our target array from the dataset"""
    return numpy.array([item['overall'] for item in data])

Y_train = dataset_to_targets(baby_train)

In [9]:
from nltk.sentiment.util import mark_negation

examples_negation = ["This product wasn't bad.",
                     "This is not a bad product.",
                     "This product was bad.",
                     "This is a bad product."]

for sentence in examples_negation:
    tokens_with_negation = mark_negation(word_tokenize(sentence.lower()))
    print("Sentence =", sentence)
    print(tokens_with_negation)

negated_stopwords = set(x + "_NEG" for x in eng_stopwords)
all_stopwords = eng_stopwords.union(negated_stopwords)        # set union
    
def tokenize_with_negation(text):
    """
    Split text into lower-case tokens, removing all-punctuation tokens and stopwords
    """ 
    tokens = []
    for sentence in sent_tokenize(text):
        pretokens = word_tokenize(sentence.lower())
        pretokens = [x for x in pretokens if any(i.isalpha() for i in x)]
        pretokens = mark_negation(pretokens)
        tokens.extend(x for x in pretokens if x not in all_stopwords)
    return tokens

print(baby_train[31]['reviewText'])
print()
print(tokenize_with_negation(baby_train[31]['reviewText']))

Sentence = This product wasn't bad.
['this', 'product', 'was', "n't", 'bad_NEG', '.']
Sentence = This is not a bad product.
['this', 'is', 'not', 'a_NEG', 'bad_NEG', 'product_NEG', '.']
Sentence = This product was bad.
['this', 'product', 'was', 'bad', '.']
Sentence = This is a bad product.
['this', 'is', 'a', 'bad', 'product', '.']
I used this for my little on and it was really nice for me as a new mom to have everything so well organized.  There was sufficient space for notes I didn't think, but all in all it was nice to have everything laid out.  If you're organized you could easily make one of these yourself and not spend the money, but when we had our baby I just needed one thing done for me and not have to do it myself and for that it was very helpful.

['used', 'little', 'really', 'nice', 'new', 'mom', 'everything', 'well', 'organized', 'sufficient', 'space', 'notes', "n't", 'think_NEG', 'nice_NEG', 'everything_NEG', 'laid_NEG', "'re", 'organized', 'could', 'easily', 'make', 'on

In [10]:
all_positive_words = positive_words.union({x + "_NEG" for x in negative_words})
all_negative_words = negative_words.union({x + "_NEG" for x in positive_words})

def pos_neg_fraction_with_negation(text):
    tokens = tokenize_with_negation(text)
    # count how many positive and negative words occur in the text
    count_pos, count_neg = 0, 0
    for t in tokens:
        if t in all_positive_words:
            count_pos += 1
        if t in all_negative_words:
            count_neg += 1
    count_all = len(tokens)
    if count_all != 0:
        return count_pos/count_all, count_neg/count_all
    else:  # avoid division by zero
        return 0., 0.
    
pos_example = 'This is a good, great, fantastic, amazing, wonderful, super product!!!'
neg_example = 'This is a bad, atrocious, terrible, dreadful, awful, abysmal product!!!'
print(pos_neg_fraction_with_negation(pos_example))
print(pos_neg_fraction_with_negation(neg_example))    

(0.8571428571428571, 0.0)
(0.0, 0.8571428571428571)


In [11]:
def dataset_to_matrix_with_neg(data):
    return numpy.array([list(pos_neg_fraction_with_negation(item['reviewText'])) for item in data])

X_train_neg = dataset_to_matrix_with_neg(baby_train)

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [20]:
print()
print(baby_train[400]['reviewText'])
print()


it will NOT make a regular octagon, even if you have 8 panelsthe problem is, it  only opens to certain angles, and 135 degrees is not one of themit only does 90, 120, and 150(but 120 degrees is good for a hexagon, just don;t think if you have to have any old number of panels, you can make any polygon you want, you can;t.. additionally the SQUARE, which you can make with the 90 degree angles, are not very strong and not recommended by the instructions, especially if you are thinking, i can have use 8 panels to make a square, it will be very flimsy in the middle of the straight junction.. they specifically say DO NOT do that, have 2 sides joined at a 0 degree angle.. basically a hexagon is about all it really works for.. though a 4 sided square works, it isn;t weak, it is just tiny..)so you can HAVE an 8-sided fencebut the angles will have to be half 120 and half 150 degreeswhich is more of a square, just with bent sidesnot a real regular stop-sign type octagon!@#$plus it;s a little fli

In [24]:
import nltk
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
text = baby_train[400]['reviewText']

print()
for s in sent_tokenize(text):
    print(s)
    print(sia.polarity_scores(s))
    print()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Solange\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!

it will NOT make a regular octagon, even if you have 8 panelsthe problem is, it  only opens to certain angles, and 135 degrees is not one of themit only does 90, 120, and 150(but 120 degrees is good for a hexagon, just don;t think if you have to have any old number of panels, you can make any polygon you want, you can;t.. additionally the SQUARE, which you can make with the 90 degree angles, are not very strong and not recommended by the instructions, especially if you are thinking, i can have use 8 panels to make a square, it will be very flimsy in the middle of the straight junction.. they specifically say DO NOT do that, have 2 sides joined at a 0 degree angle.. basically a hexagon is about all it really works for.. though a 4 sided square works, it isn;t weak, it is just tiny..)so you can HAVE an 8-sided fencebut 

In [37]:
def sia_features(dataset):
    """For each review text in the dataset, extract:
       (1) the mean positive sentiment over all sentences
       (2) the mean neutral sentiment over all sentences
       (3) the mean negative sentiment over all sentences
       (4) the maximum positive sentiment over all sentences
       (5) the maximum neutral sentiment over all sentences
       (6) the maximum negative sentiment over all sentences"""
    feat_matrix = numpy.empty((len(dataset), 6))
    for i in range(len(dataset)):
        sentences = sent_tokenize(dataset[i]['reviewText'])
        nsent = len(sentences)
        if nsent:
            sentence_polarities = numpy.empty((nsent, 3))
            for j in range(nsent):
                polarity = sia.polarity_scores(sentences[j])
                sentence_polarities[j, 0] = polarity['pos']
                sentence_polarities[j, 1] = polarity['neu']
                sentence_polarities[j, 2] = polarity['neg']
            feat_matrix[i, 0:3] = numpy.mean(sentence_polarities, axis=0) # mean over the columns
            feat_matrix[i, 3:6] = numpy.max(sentence_polarities, axis=0) # maximum over the columns
        else:
            feat_matrix[i, 0:6] = 0.0
    return feat_matrix

sia_tr = sia_features(baby_train)
print(sia_tr[:10])

[[0.23533333 0.76466667 0.         0.552      1.         0.        ]
 [0.03833333 0.84266667 0.119      0.115      1.         0.357     ]
 [0.12725    0.8365     0.03625    0.194      0.865      0.145     ]
 [0.1595     0.7775     0.063      0.319      0.874      0.126     ]
 [0.073      0.927      0.         0.219      1.         0.        ]
 [0.3115     0.6885     0.         0.571      1.         0.        ]
 [0.172      0.8174     0.0106     0.398      1.         0.053     ]
 [0.174125   0.825875   0.         0.339      1.         0.        ]
 [0.0307     0.9106     0.0587     0.122      1.         0.257     ]
 [0.1035     0.87575    0.02075    0.279      1.         0.096     ]]


In [26]:
testmat = numpy.arange(12.).reshape((3, 4))
print("testmat is:")
print(testmat)

print("\ntestmat max is:")
print(numpy.max(testmat, axis=0))

print("\ntestmat mean is:")
print(numpy.mean(testmat, axis=1))

testmat is:
[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]]

testmat max is:
[ 8.  9. 10. 11.]

testmat mean is:
[1.5 5.5 9.5]


In [27]:
def len_features(dataset):
    """Add two features:
       (1) length of review (in thousands of characters) - truncate at 2,500
       (2) percentage of exclamation marks (in %)"""
    feat_matrix = numpy.empty((len(dataset), 2))
    for i in range(len(dataset)):
        text = dataset[i]['reviewText']
        feat_matrix[i, 0] = len(text) / 1000.
        if text:
            feat_matrix[i, 1] = 100. * text.count('!') / len(text)
        else:
            feat_matrix[i, 1] = 0.0
    feat_matrix[feat_matrix>2.5] = 2.5
    return feat_matrix

len_tr = len_features(baby_train)

In [29]:
print(X_train_neg.shape, sia_tr.shape, len_tr.shape)

(96102, 2) (96102, 6) (96102, 2)


In [30]:
# stack horizontally
X_train_augmented = numpy.concatenate((X_train_neg, sia_tr, len_tr), axis=1) 
lreg_augmented = LinearRegression().fit(X_train_augmented, Y_train)
pred_train_augmented = lreg_augmented.predict(X_train_augmented)
mae_train_augmented = mean_absolute_error(pred_train_augmented, Y_train)
print("Now the mean absolute error on the training data is %f stars" % mae_train_augmented)

Now the mean absolute error on the training data is 0.753687 stars


In [34]:
from sklearn.ensemble import RandomForestRegressor

rf_augmented = RandomForestRegressor().fit(X_train_augmented, Y_train)
rfpred_train_augmented = rf_augmented.predict(X_train_augmented)
mae_train_rf_augmented = mean_absolute_error(rfpred_train_augmented, Y_train)
print("For the RF, it is %f stars" % mae_train_rf_augmented)

For the RF, it is 0.281091 stars


In [38]:
X_valid_neg = dataset_to_matrix_with_neg(baby_valid)
Y_valid = dataset_to_targets(baby_valid)

sia_valid = sia_features(baby_valid)
len_valid = len_features(baby_valid)

X_valid_augmented = numpy.concatenate((X_valid_neg, sia_valid, len_valid), axis=1)
pred_valid_augmented = lreg_augmented.predict(X_valid_augmented)
pred_valid_rf_augmented = rf_augmented.predict(X_valid_augmented)

mae_valid_augmented = mean_absolute_error(pred_valid_augmented, Y_valid)
print("On the validation set, we get %f error for the linear regression" % mae_valid_augmented)

mae_valid_rf_augmented = mean_absolute_error(pred_valid_rf_augmented, Y_valid)
print("And %f for the random forest regression" % mae_valid_rf_augmented)

On the validation set, we get 0.758383 error for the linear regression
And 0.733395 for the random forest regression



### Homework

**Refactor the code above:**
- "Be lazy. Not just lazy but proactively, agressively lazy": remove duplication.
- Create a single function that takes in data and spits out all success metrics across all of your algorithms.


## Where to go from here?


- Unigrams (NLTK)
- Word vector (gensim, [glove][1], word2vec)
- Recurrent neural net
- Convolutional neural net

**References:**

- [Perform sentiment analysis with LSTMS using TensorFlow][2]
- [Understanding convolutional neural networks for NLP][3]
- [Develop N-Gram multichannel convolutional neural network sentiment analysis][4]

[1]: https://nlp.stanford.edu/projects/glove/
[2]: https://www.oreilly.com/learning/perform-sentiment-analysis-with-lstms-using-tensorflow
[3]: http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/
[4]: https://machinelearningmastery.com/develop-n-gram-multichannel-convolutional-neural-network-sentiment-analysis/