## Listing of Noun Phrases From Reviews

In [1]:
import json
import dateutil
from pymongo import MongoClient
import re
from bson import json_util

import nltk
import nltk.tokenize

Same DB variables setup. This time I'm using the joined sample of 10, with the collection name `cleanedJoinedSample10`

*Most `print()` calls are commented out because their output is far too large; they are kept only for debugging.*

In [2]:
# Connect to the MongoDB server on localhost:27017 and select the collection
# holding the joined sample of 10 products.
client = MongoClient('localhost', 27017)
mydb = client["amazonPhones"]
joinedSample10Collection = mydb["cleanedJoinedSample10"]

# Pull a single product document to experiment on in the cells below.
# NOTE(review): find_one() returns None when the collection is empty, which
# would make the later testDocument['review'] lookups fail — confirm the
# sample has been loaded.
testDocument = joinedSample10Collection.find_one()
#print(testDocument.items())


### addNounPhrasesCountField function
This function is applied to one individual review. It adds a dictionary, `nounPhrasesCount`, that maps each noun phrase found in the review's `reviewText` to its count (the number of times it is used).

Three different grammar regex strings were tried ("grammar" here is a natural-language-processing term).
Supposedly, all of them split the sentence into chunks of noun phrases (each presumably using a different method).

After searching online for a number of grammar strings, the third one seems to give the most sensible results.

In [3]:
# Sample review used for the quick tests below.
# NOTE: index 1 selects the *second* entry of the 'review' list (0-based indexing).
testReview = testDocument['review'][1]
# print(testReview)
def addNounPhrasesCountField(review, grammar=r"NP: {<DT|PRP\$>?<JJ>*<NN.*>+}"):
    """Attach a noun-phrase frequency table to a single review, in place.

    The review's ``reviewText`` is tokenised, POS-tagged and chunked with an
    NLTK ``RegexpParser``. Every chunk labelled ``NP`` is joined into one
    lower-cased string and counted. The table is stored under
    ``review['nounPhrasesCount']``.

    Parameters
    ----------
    review : dict
        One review document. It is mutated: ``'nounPhrasesCount'`` is always
        (re)set, and is ``{}`` when the review has no usable text (missing,
        null, non-string or blank ``reviewText``).
    grammar : str, optional
        Chunk grammar passed to ``nltk.RegexpParser``. Only chunks labelled
        ``NP`` are counted. The default matches an optional determiner or
        possessive pronoun, any number of adjectives, then one or more nouns.
        Alternatives that were tried and found less useful:
        ``"NP: {<DT>?<JJ>*<NN>}"`` and
        ``r"NP: {<RB.?>*<VB.?>*<NNP>+<NN>?}"``.

    Returns
    -------
    None

    Raises
    ------
    LookupError
        Raised by NLTK when the tokenizer / tagger data it needs has not been
        downloaded.
    """
    # Always start from an empty table so callers can rely on the key existing.
    review['nounPhrasesCount'] = {}

    text = review.get('reviewText')
    # Missing, null, non-string or blank text: there is nothing to chunk, and
    # handing such a value to the tokenizer would either crash or do no work.
    if not isinstance(text, str) or not text.strip():
        return

    tokens = nltk.tokenize.word_tokenize(text)
    # Tag the tokens in their original case; lower-casing is applied only
    # after chunking because lower-casing first was observed to degrade the
    # parse (presumably the tagger uses capitalisation as a cue).
    tagged_tokens = nltk.pos_tag(tokens)
    parsed_tree = nltk.RegexpParser(grammar).parse(tagged_tokens)

    counts = {}
    for subtree in parsed_tree.subtrees():
        if subtree.label() != 'NP':
            continue
        # Each leaf is a (word, tag) pair; keep only the word.
        phrase = ' '.join(leaf[0] for leaf in subtree.leaves()).lower()
        counts[phrase] = counts.get(phrase, 0) + 1

    review['nounPhrasesCount'] = counts

# addNounPhrasesCountField(testReview)

# print(testReview)


In [4]:
# Test on the review at index 3, i.e. the 4th review (indexing is 0-based).
reviewThree = testDocument['review'][3]

# Mutates reviewThree in place by setting its 'nounPhrasesCount' field.
addNounPhrasesCountField(reviewThree)

print(reviewThree)

{'_id': ObjectId('6415a4959cb8dce7838b34e5'), 'overall': 5, 'vote': '32', 'verified': True, 'reviewTime': '09 8, 2015', 'reviewerID': 'A2JFID6PCLJPO6', 'asin': 'B00YD547Q6', 'style': {'Color:': ' Space Gray'}, 'reviewerName': 'Jahdale Logan', 'reviewText': 'PERFECT CONDITION', 'summary': 'Ok', 'unixReviewTime': 1441670400, 'nounPhrasesCount': {'perfect condition': 1}}


### gatherAllReviewsNPCount
This function is applied to one product document (basically one Amazon product item).
It loops through all of the product's reviews, applies the previous `addNounPhrasesCountField` function to each one, and after each review adds that review's counts to the product's own `totalNounPhrasesCount` dictionary.

(The `howMany` parameter is only for debugging: something went wrong at review \#600+ because `reviewText` did not exist for that particular review.)

In [5]:
def gatherAllReviewsNPCount(productDocument, howMany=0):
    """Aggregate noun-phrase counts over a product's reviews, in place.

    Runs ``addNounPhrasesCountField`` on each selected review and sums the
    resulting per-review ``'nounPhrasesCount'`` tables into
    ``productDocument['totalNounPhrasesCount']``.

    Parameters
    ----------
    productDocument : dict
        Product document whose reviews are stored as a list under the
        ``'review'`` key. It is mutated: every processed review gets a
        ``'nounPhrasesCount'`` field and the document gets a fresh
        ``'totalNounPhrasesCount'`` dictionary.
    howMany : int, optional
        Debugging aid. ``0`` (the default) processes every review. A positive
        value processes only the first ``howMany`` reviews, capped at the
        number of reviews available. A negative value processes none.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``productDocument`` has no ``'review'`` key.
    """
    productReviews = productDocument['review']
    totals = {}
    productDocument['totalNounPhrasesCount'] = totals

    if howMany != 0:
        # Slice instead of indexing through range(howMany): asking for more
        # reviews than exist no longer raises IndexError, and a negative
        # value still selects nothing.
        selectedReviews = productReviews[:max(howMany, 0)]
    else:
        selectedReviews = productReviews

    for rev in selectedReviews:
        addNounPhrasesCountField(rev)
        for phrase, count in rev['nounPhrasesCount'].items():
            totals[phrase] = totals.get(phrase, 0) + count

#print(testDocument)
# Process every review of the test product (howMany defaults to 0). This
# mutates testDocument in place, adding 'totalNounPhrasesCount' to it.
gatherAllReviewsNPCount(testDocument)
# print(testDocument)

In [6]:
#print(testDocument)

### Trying out on the sample of 10
Finds all documents, runs `gatherAllReviewsNPCount` on each of them, and then writes the result to a JSON file.

In [151]:
# NOTE(review): json and json_util are already imported in the first cell, so
# these repeated imports are redundant.
import json
from bson import json_util

# Query every product document in the sample collection.
cursor = joinedSample10Collection.find({})

# Annotate each document in place and keep it in memory; the `documents` list
# is reused by the sentiment cell further down.
documents = []
for document in cursor:
    gatherAllReviewsNPCount(document)
    documents.append(document)
    


## outputting
# json_util.default is supplied as the fallback serialiser for values the
# json module cannot encode itself (presumably the ObjectId '_id' fields).
documentsJsonString = json.dumps(documents, default=json_util.default)
# # write list of dicts to file
with open('outputWithNPCount.json', 'w') as outfile:
    outfile.write(documentsJsonString)

## Sentiment generation and average sentiment

In [7]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer



def addSentimentToReview(review, lyzer):
    """Attach sentiment scores to a single review, in place.

    Parameters
    ----------
    review : dict
        One review document. When it has a string ``reviewText``, the scores
        returned by the analyzer are stored under ``review['sentiment']``.
        Otherwise the review is left untouched and no ``'sentiment'`` key is
        added.
    lyzer : object
        Analyzer exposing ``polarity_scores(text)``; the notebook passes an
        NLTK ``SentimentIntensityAnalyzer``.

    Returns
    -------
    None
    """
    text = review.get('reviewText')
    # Missing, null or non-string text cannot be scored. The review is skipped
    # rather than crashing the analyzer; callers can detect the skip by the
    # absence of the 'sentiment' key.
    if not isinstance(text, str):
        return

    review['sentiment'] = lyzer.polarity_scores(text)
    

#before
#print(testReview)

# Create one analyzer and pass it in, so the same instance can be reused for
# every review instead of being rebuilt per call.
lyzer = SentimentIntensityAnalyzer()
# Mutates testReview in place: adds 'sentiment' when the review has 'reviewText'.
addSentimentToReview(testReview, lyzer)

#after
#print(testReview)



In [9]:
def generateSentiment(productDocument, lyzer):
    """Score every review of a product and store the mean scores, in place.

    ``addSentimentToReview`` is applied to each review under
    ``productDocument['review']``. The ``neg`` / ``neu`` / ``pos`` /
    ``compound`` scores of the reviews that received a ``'sentiment'`` field
    are averaged and stored under ``productDocument['averageSentiment']``.

    Parameters
    ----------
    productDocument : dict
        Product document whose reviews are stored under the ``'review'`` key.
        It is mutated: reviews may get a ``'sentiment'`` field and the
        document gets ``'averageSentiment'``.
    lyzer : object
        Analyzer forwarded unchanged to ``addSentimentToReview``.

    Returns
    -------
    None

    Notes
    -----
    Reviews without a ``'sentiment'`` field (e.g. no ``reviewText``) are left
    out of the average. When no review could be scored at all, every average
    is ``0.0``; previously this case raised ``ZeroDivisionError``.
    """
    totals = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
    # Counted separately from len(reviews): some reviews cannot be scored.
    scoredCount = 0

    for rev in productDocument['review']:
        addSentimentToReview(rev, lyzer)
        if 'sentiment' not in rev:
            continue
        scoredCount += 1
        for key in totals:
            totals[key] += rev['sentiment'][key]

    if scoredCount:
        averageSentiment = {key: total / scoredCount for key, total in totals.items()}
    else:
        # Nothing to average: keep the all-zero scores instead of dividing by 0.
        averageSentiment = totals

    productDocument['averageSentiment'] = averageSentiment


In [10]:
# Re-creates the analyzer (an instance was already created in the earlier
# sentiment cell) and runs the per-product pass on the test document, which
# adds 'averageSentiment' to testDocument in place.
lyzer = SentimentIntensityAnalyzer()
generateSentiment(testDocument, lyzer)
#print(testDocument)

### Trying out with sample of 10
Reusing the `documents` list generated earlier

In [154]:
# test first doc
# documents[0]

# Relies on `documents` (built in the noun-phrase export cell) and `lyzer`
# (created in the sentiment cells) still being present in the kernel.
for document in documents:
    generateSentiment(document, lyzer)
    
## outputting
# Same serialisation as the earlier export, written to a separate file so the
# noun-phrase-only output is not overwritten.
documentsJsonString = json.dumps(documents, default=json_util.default)
# # write list of dicts to file
with open('outputWithNPCountAndSentiment.json', 'w') as outfile:
    outfile.write(documentsJsonString)