# Processing Flow

The processing flow is run on all of the review data once the models are all trained up and validated. This is the flow that will actually turn the raw data into processed output.

# 1. Token Replacement

This step allows Alchemy to replace words in the review sentences with their semantic types (product, customer_service, company, etc.). These semantic types were defined when the WKS model was trained and are usually specific to the domain of the data.

It replaces tokens from the 'reviewText' field of the documents stored in the database and saves the resulting sentences to the 'taggedReview' field of the same document.

In [None]:
import ast
import configparser
import csv
import json
import logging
import os
import re

import cloudant
import nltk
from watson_developer_cloud import alchemy_language_v1 as alchemy

# Root logger at DEBUG so every step of the token replacement is traced.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# The notebook's working directory; the credentials file sits one level up.
curdir = os.getcwd()
logger.debug(curdir)

# Parse the .env file and log which sections were found.
credFilePath = os.path.join(curdir, '..', '.env')
config = configparser.ConfigParser()
config.read(credFilePath)
logger.debug(config.sections())

# WKS model id and the Alchemy Language client used by get_entities().
model_id = config['WKS']['WKS_MODEL_ID']
alchemy_api = alchemy.AlchemyLanguageV1(
    api_key=config['ALCHEMY']['ALCHEMY_API_KEY'],
)


def get_entities(review):
    """Run Alchemy entity extraction on a review, splitting over-long text.

    Reviews longer than 5024 characters are cut at a boundary located by
    ``find_middle``. The leading chunk is sent to Alchemy, the remainder is
    processed recursively, and both results are merged.

    Args:
        review: Review text to analyse.

    Returns:
        dict: The parsed Alchemy response for the first chunk. When the
        review was split, ``'entities'`` (and ``'text'``, if either chunk
        returned it) cover all chunks in their original order.
    """
    max_chars = 5024
    head, tail = review, ''
    split = {}
    if len(review) > max_chars:
        mid = find_middle(review)
        while mid >= max_chars:
            shorter = find_middle(review[:mid])
            if shorter >= mid:
                # No earlier boundary exists (e.g. text without periods):
                # fall back to a hard cut so the loop always terminates.
                mid = max_chars - 1
                break
            mid = shorter
        # Slice both parts from the ORIGINAL text; truncating `review` first
        # would leave the tail empty and silently drop half of the review.
        head, tail = review[:mid], review[mid:]
        split = get_entities(tail)

    # Use the trained WKS model loaded from the .env file.
    f = alchemy_api.entities(text=head, model=model_id, sentiment=True)
    raw = f.content
    if isinstance(raw, bytes):
        raw = raw.decode('utf-8')
    try:
        # The service answers in JSON; literal_eval cannot parse
        # true/false/null, so it is kept only as a fallback.
        response = json.loads(raw)
    except ValueError:
        response = ast.literal_eval(raw)

    if split:
        if 'entities' in split:
            response['entities'] = response.get('entities', []) + split['entities']
        if 'text' in response or 'text' in split:
            # Fall back to the raw chunk when one side carries no 'text'.
            response['text'] = response.get('text', head) + split.get('text', tail)
    return response


def token_replacement_entities(review):
    """Replace the entities detected in a review by their semantic type.

    The first whole-word occurrence of each detected entity is replaced by
    ``<type>`` (e.g. ``<product>``).

    Args:
        review: Review text to tag.

    Returns:
        str: The tagged text, or ``review`` unchanged when the service
        reported an error (``'statusInfo'``) or returned no entities.
    """
    processed = get_entities(review)
    if 'statusInfo' in processed or 'entities' not in processed:
        return review

    # The response only echoes the text when the service is asked to;
    # otherwise work on the review that was submitted.
    text = processed.get('text', review)
    for entity in processed['entities']:
        tag = "<" + entity['type'] + ">"
        pattern = r"\b%s\b" % re.escape(entity['text'])
        # A callable replacement keeps any backslash in the tag literal.
        text = re.sub(pattern, lambda _match, tag=tag: tag, text, count=1)
    return text


def find_middle(text):
    """Return a character offset that splits ``text`` near its middle.

    The text is tokenized on periods, the middle span is selected, and the
    offset just past that span's end is returned.

    Args:
        text: Text to split.

    Returns:
        int: End offset of the middle span plus one.
    """
    spans = list(nltk.tokenize.util.regexp_span_tokenize(text, r'\.'))
    # Floor division keeps the index an int on Python 3 as well.
    middle_span = spans[len(spans) // 2]
    return int(middle_span[1]) + 1
    
#Initializing Cloudant client
client = cloudant.client.Cloudant(config['CLOUDANT']['CLOUDANT_USERNAME'],
                                  config['CLOUDANT']['CLOUDANT_PASSWORD'],
                                  account=config['CLOUDANT']['CLOUDANT_USERNAME'])

#Going through all the documents and replacing the tokens by
#their semantic types. The result is saved back to the Cloudant document
#in the 'taggedReview' field.
client.connect()
db = client[config['CLOUDANT']['CLOUDANT_DB']]
for doc in db:
    logger.debug(doc)
    if 'reviewText' not in doc:
        #Not a review document: nothing to tag
        continue
    try:
        doc['taggedReview'] = token_replacement_entities(doc['reviewText'])
        doc.save()
    except Exception:
        #Keep going with the next document, but record the traceback
        logger.exception('Error saving tagged review to Cloudant document.')

# 2. Classification of reviews

This step uses the Natural Language Classifier (NLC) created in the Training notebook. It classifies each review and adds the result of the classification to the 'class' field of the document in the database.

In [None]:
import json
from watson_developer_cloud import NaturalLanguageClassifierV1
import sys
import os
import logging
import configparser
import cloudant

# Root logger for the classification step, at INFO level.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# The notebook's working directory; the credentials file sits one level up.
curdir = os.getcwd()
logger.debug(curdir)

# Parse the .env file holding the service credentials.
credFilePath = os.path.join(curdir, '..', '.env')
config = configparser.ConfigParser()
config.read(credFilePath)

# Pull the three NLC settings out of the [NLC] section in one go.
NLC_USERNAME, NLC_PASSWORD, NLC_CLASSIFIER = (
    config['NLC'][option]
    for option in ('NLC_USERNAME', 'NLC_PASSWORD', 'NLC_CLASSIFIER')
)

# Classifier client used by classify().
nlc = NaturalLanguageClassifierV1(
    username=NLC_USERNAME,
    password=NLC_PASSWORD,
)

def classify(review):
    """Classify a review with the trained Natural Language Classifier.

    Args:
        review: Text to classify.

    Returns:
        The ``'classes'`` list from the classifier response, or the string
        ``'no class'`` when the call fails or the list is empty.
    """
    logger.debug(review)
    try:
        response = nlc.classify(NLC_CLASSIFIER, review)
        logger.debug(response)
        classes = response['classes']
    except Exception:
        logger.exception('Failed at sentence classification')
        return 'no class'
    # Never fall through with an implicit None: callers store this value.
    return classes if classes else 'no class'

#Initializing Cloudant client
client = cloudant.client.Cloudant(config['CLOUDANT']['CLOUDANT_USERNAME'],
                                  config['CLOUDANT']['CLOUDANT_PASSWORD'],
                                  account=config['CLOUDANT']['CLOUDANT_USERNAME'])

#Classifying the tagged text of every review and saving the result
#in the 'class' field of the same document.
client.connect()
db = client[config['CLOUDANT']['CLOUDANT_DB']]
for doc in db:
    logger.debug(doc)
    if 'taggedReview' not in doc:
        #Nothing was tagged for this document, so there is nothing to classify
        continue
    try:
        doc['class'] = classify(doc['taggedReview'])
        doc.save()
    except Exception:
        #Keep going with the next document, but record the traceback
        logger.exception('Error saving classification to Cloudant document.')

# 3. Grouping products that have reviews

This script will create a product document in the database for each product that has been reviewed. It will also attach a list of the review IDs to the new product document.


In [7]:
from cloudant.client import Cloudant
import os
import logging
import configparser

#Root logger (the same object the earlier cells configure), so this cell
#also runs on its own
logger = logging.getLogger()

#getting current directory
curdir = os.getcwd()
logger.debug(curdir)

#loading credentials from .env file
credFilePath = os.path.join(curdir,'..','.env')
config = configparser.ConfigParser()
config.read(credFilePath)

#Cloudant is imported by name at the top of this cell
client = Cloudant(config['CLOUDANT']['CLOUDANT_USERNAME'],
                  config['CLOUDANT']['CLOUDANT_PASSWORD'],
                  account=config['CLOUDANT']['CLOUDANT_USERNAME'])
client.connect()
#The database name comes from the .env file, as in the other cells
db = client[config['CLOUDANT']['CLOUDANT_DB']]

#Building one summary entry per product title, counting its reviews
products = {}
for doc in db:
    title = doc.get('title')
    if not title:
        #Documents without a title cannot be attributed to a product
        continue
    if title not in products:
        products[title] = {
            'product_name': title,
            'customer_service': {
                'sentiment': {'posCount': 0, 'neuCount': 0, 'negCount': 0},
            },
            'features': [],
            'issues': {'count': 0},
            'reviewCount': 0,
            'product_id': doc.get('asin'),
        }
    entry = products[title]

    #Count every review of the product, not only the first one seen
    entry['reviewCount'] += 1

    #NOTE(review): the title is a string, so the sentiment cannot live under
    #doc['title']; this assumes it is stored on the review document itself
    #in a 'sentiment' field -- confirm against the stored schema.
    sentiment = doc.get('sentiment')
    counts = entry['customer_service']['sentiment']
    if sentiment == 'positive':
        counts['posCount'] += 1
    elif sentiment == 'neutral':
        counts['neuCount'] += 1
    elif sentiment == 'negative':
        counts['negCount'] += 1

    #Add detected features list. I'm not sure how it's being stored during the alchemy call atm
    #products[doc['title']]['features'].append(doc['title']['features'])

    #There is probably some other stuff that needs to be formatted here
    #Until we can step through the whole process this is just kinda a guess

for product, value in products.items():
    #The function-call form of print works on both Python 2 and Python 3
    print(product)
    #Need to push product to db whenever we're done here

{u'Samsung UN19F4000 19-Inch 720p 60Hz Slim LED HDTV': {'reviewCount': 164, 'product_id': u'B00BCGRZ04', 'customer_service': {'sentiment': {}}, 'product_name': u'Samsung UN19F4000 19-Inch 720p 60Hz Slim LED HDTV', 'issues': {}, 'features': []}, u'Bose QuietComfort 15 Acoustic Noise Cancelling Headphones': {'reviewCount': 804, 'product_id': u'B0054JJ0QW', 'customer_service': {'sentiment': {}}, 'product_name': u'Bose QuietComfort 15 Acoustic Noise Cancelling Headphones', 'issues': {}, 'features': []}, u'Microsoft Comfort Mouse 4500': {'reviewCount': 118, 'product_id': u'B003BEDPHM', 'customer_service': {'sentiment': {}}, 'product_name': u'Microsoft Comfort Mouse 4500', 'issues': {}, 'features': []}}
Samsung UN19F4000 19-Inch 720p 60Hz Slim LED HDTV
164
Bose QuietComfort 15 Acoustic Noise Cancelling Headphones
804
Microsoft Comfort Mouse 4500
118


# 4. Clustering product feature sentences

The script that runs the clustering on the review data is src/Processing/clustering.py
A word2vec model is provided

In [None]:
from cloudant.client import Cloudant
from cloudant.query import Query
from gensim.models import word2vec
import logging
import numpy as np
import re
import os
import configparser

def generate_vectors(features, model):
    """Turn feature phrases into vectors by summing their word vectors.

    A phrase is kept only when every one of its words is found in
    ``model``; phrases with an unknown word (or no words) are skipped.

    Args:
        features: Sequence of dicts with a ``'word'`` entry holding the
            phrase.
        model: Mapping-like object supporting ``word in model`` and
            ``model[word]``.

    Returns:
        list: ``[vecs, mapping]`` where ``vecs`` holds one summed vector per
        kept phrase and ``mapping[k]`` is the position in ``features`` that
        ``vecs[k]`` came from.
    """
    vecs = []
    mapping = []
    for position, line in enumerate(features):
        vec = None
        for word in line['word'].split():
            # Same effect as the escape/unescape round trip: drop backslashes.
            word = str(word).replace('\\', '')
            if word not in model:
                vec = None
                break
            # `vec is None` marks "nothing accumulated yet"; testing the
            # length instead would overwrite 1-dimensional vectors.
            # `vec + ...` builds a new array, so the model is never mutated.
            vec = model[word] if vec is None else vec + model[word]
        if vec is not None and len(vec) > 0:
            vecs.append(vec)
            mapping.append(position)
    return [vecs, mapping]


def cluster_try(vecs):
    """Group vectors into clusters by cosine similarity, in a single pass.

    Each vector joins the existing cluster whose running sum it is most
    similar to, provided the cosine similarity exceeds 0.5; otherwise it
    starts a new cluster.

    Args:
        vecs: Sequence of equal-length numeric vectors.

    Returns:
        dict: Cluster number -> list of indices into ``vecs``. Empty when
        ``vecs`` is empty.
    """
    if len(vecs) == 0:
        return {}

    clusterIdx = {0: [0]}
    # Copy the seed vector: the running sums must never alias the caller's
    # arrays, which may be rows of the word2vec model itself.
    clusterVec = {0: np.array(vecs[0])}
    for i in range(1, len(vecs)):
        vec = vecs[i]
        vec_norm = np.linalg.norm(vec)
        best_cluster = None
        max_sim = 0.5
        for j in range(len(clusterVec)):
            sim = np.dot(vec, clusterVec[j]) / (np.linalg.norm(clusterVec[j]) * vec_norm)
            if sim > max_sim:
                max_sim = sim
                best_cluster = j
        if best_cluster is None:
            new_cluster = len(clusterVec)
            clusterIdx[new_cluster] = [i]
            clusterVec[new_cluster] = np.array(vec)
        else:
            clusterIdx[best_cluster].append(i)
            # Rebind instead of `+=` so no input array is modified in place.
            clusterVec[best_cluster] = clusterVec[best_cluster] + vec
    return clusterIdx

##I think this function can be deleted or at least drastically reduced given the change to groupings done earlier
def _excerpt_around(sentences, position):
    """Join the sentence at ``position`` with its immediate neighbours."""
    if position > 0:
        excerpt = sentences[position - 1]['sentence'] + sentences[position]['sentence']
    else:
        excerpt = sentences[position]['sentence']
    if position < len(sentences) - 1:
        excerpt = excerpt + sentences[position + 1]['sentence']
    return excerpt


def create_json(clusters, cluster_data, mapping, keys, helpful, local_dump):
    """Summarise each cluster: top keywords, excerpts and sentiment counts.

    Args:
        clusters: dict of cluster number -> list of vector indices.
        cluster_data: list the per-cluster summaries are appended to.
        mapping: ``mapping[vector_index]`` is the index into ``keys``.
        keys: list of dicts with ``'word'``, ``'rev_id'``, ``'sentence_id'``
            and ``'sentiment'`` (read as ``sentiment[0][0]``).
        helpful: dict of review id -> helpfulness value.
        local_dump: dict of review id -> review content; ``content[0]`` is
            read as a list of dicts with ``'seqno'`` and ``'sentence'``.

    Returns:
        list: ``cluster_data`` itself, with one summary dict per cluster.
    """
    for i in clusters:
        keyword_count = 0
        pos = 0
        neg = 0
        neutral = 0
        unique_words = {}
        for key in clusters[i]:
            entry = keys[mapping[key]]
            stats = unique_words.setdefault(
                entry['word'], {'count': 0, 'review_id': [], 'sentence_id': []})
            stats['count'] += 1
            stats['review_id'].append(entry['rev_id'])
            stats['sentence_id'].append(entry['sentence_id'])
            keyword_count += 1

            label = entry['sentiment'][0][0]
            if label == 'positive':
                pos += 1
            elif label == 'neutral':
                neutral += 1
            elif label == 'negative':
                neg += 1

        # Rank once per cluster, by occurrence count. Using the stats dicts
        # themselves as the sort key raises TypeError on Python 3.
        ranked = sorted(unique_words,
                        key=lambda word: unique_words[word]['count'],
                        reverse=True)

        clusterinfo = {}
        if ranked:
            # The most frequent keyword names the cluster.
            clusterinfo['feature'] = ranked[0]

        list_keywords = []
        for feature in ranked[:3]:
            stats = unique_words[feature]
            data = {
                'keyword': feature,
                'sentence_id': stats['sentence_id'],
                'review_id': stats['review_id'],
            }

            # Pick the occurrence from the most helpful review (last one
            # wins on ties); default to the first occurrence.
            helpful_vote = 0
            helpful_rev = 0
            for index_rev, review_id in enumerate(data['review_id']):
                if helpful[review_id] >= helpful_vote:
                    helpful_vote = helpful[review_id]
                    helpful_rev = index_rev

            helpful_review = local_dump[data['review_id'][helpful_rev]]
            ##cause of split reviews-to remove
            sent_id = data['sentence_id'][helpful_rev] - helpful_review[0][0]['seqno']
            data['excerpt'] = _excerpt_around(helpful_review[0], sent_id)
            data['count'] = stats['count']
            list_keywords.append(data)

        clusterinfo['keywords'] = list_keywords
        clusterinfo['sentiments'] = {
            'positive': pos,
            'negative': neg,
            'neutral': neutral,
        }
        clusterinfo['keyword_count'] = keyword_count
        cluster_data.append(clusterinfo)
    return cluster_data

   
    #Since we did grouping in the previous step this shouldn't be necessary
    ####start###############
#def cluster(doc, db, asin):
    #SERVER = 'https://<CLOUDANT_USERNAME>.cloudant.com'
    #DATABASE = 'amazon_data'
    #USERNAME = '<CLOUDANT_USERNAME>'  # credential redacted; keep it in the .env file
    #PASSWORD = '<CLOUDANT_PASSWORD>'  # credential redacted; keep it in the .env file

    #server = cloudant.client.Cloudant(USERNAME, PASSWORD, url=SERVER)
    #server.connect()
    #db = server[DATABASE]
    #query = Query(db, selector={'asin': asin, 'type':['review']},fields=["_id", "helpful"])
    #meta = Query(db, selector={'asin': asin, 'type': ['metadata']})
    #meta = meta.result[0][0]
    #name = ''
    #if 'title' in meta:
    #    name = meta['title']
    #rev_id = []
    #helpful={}
    #for data in query.result:
    #    rev_id.append(data['_id'])
    #    if 'helpful' in data:
    #        helpful[data['_id']]=data['helpful'][0]
    #    else:
    #        helpful[data['_id']]=0

    #temp = {}
    #keys = []
    #local_dump = {}
    #for rev in rev_id:
    #   query_id = Query(db, selector={'review_id': rev, 'type': ['classified']})
    #   for i in query_id.result:
    #        if len(query_id.result[0]) == 0:
    #            continue
    #    for res in query_id.result[0]:
    #        text = res['review']
    #        local_dump[res['review_id']] = text
    #        for obj in text[0]:
    #            if 'Feature' in obj:
    #                feature = obj['Feature']
    #                for data in feature:
    #                    if 'name' in data:
    #                        temp = {}
    #                        temp['word'] = data['name']
    #                        if 'sentiment' in data:
    #                            temp['sentiment'] = data['sentiment']
    #                        else:
    #                            temp['sentiment'] = ['neutral']
    #                        temp['rev_id'] = res['review_id']
    #                        temp['sentence_id'] = obj['seqno']
    #                        keys.append(temp)
    #modelname = 'sample_model'
    #cwd = os.getcwd()
    #model = word2vec.Word2Vec.load_word2vec_format(cwd+ '/' + modelname+'.bin', binary=True)
    #[vecs, mapping] = generate_vectors(keys, model)
    #clusters = cluster_try(vecs)
    #cluster_data = []
    #features = create_json(clusters, cluster_data, mapping, keys, helpful, local_dump)
    #features = sorted(features, key=lambda k: k['keyword_count'], reverse=True)

    #featureDict = {}
    #featureDict['features'] = features[:10]
    #featureDict['product_name'] = name
    #return featureDict
    ###end#############
    
    
#I think it should look something like this instead
###start######
#Root logger (the same object the earlier cells configure), so this cell
#also runs on its own
logger = logging.getLogger()

#getting current directory
curdir = os.getcwd()
logger.debug(curdir)

#loading credentials from .env file
credFilePath = os.path.join(curdir,'..','.env')
config = configparser.ConfigParser()
config.read(credFilePath)
logger.debug(config.sections())

#Cloudant is imported by name at the top of this cell
client = Cloudant(config['CLOUDANT']['CLOUDANT_USERNAME'],
                  config['CLOUDANT']['CLOUDANT_PASSWORD'],
                  account=config['CLOUDANT']['CLOUDANT_USERNAME'])

client.connect()
db = client[config['CLOUDANT']['CLOUDANT_DB']]

#Steps
#Find all products
#send feature lists to w2v model for clustering
#save the clustered feature list back to db
modelname = 'sample_model'
cwd = os.getcwd()
model = word2vec.Word2Vec.load_word2vec_format(os.path.join(cwd, modelname + '.bin'), binary=True)
for doc in db:
    #Only product documents carry 'product_name'; skip everything else
    if not doc.get('product_name'):
        continue
    [vecs, mapping] = generate_vectors(doc.get('features', []), model)
    if len(vecs) == 0:
        #No feature phrase could be vectorised, so there is nothing to cluster
        continue
    clusters = cluster_try(vecs)
    #update db with the clusters
###end#######

# 5. Make Final JSON
The script that takes all of these pieces and turns them into the final JSON structure that will be used by a front end application is src/Processing/makeFinalJSON.py

In [None]:
#We might be able to skip this whole step. I did the grouping in an earlier section to make the clustering step simpler.
#If we get that formatted correctly there then I think this can be removed. -Andrew

from watson_developer_cloud import AlchemyLanguageV1
from cloudant.query import Query
from cloudant.client import Cloudant


JSON_FILE = "./Model_Clustering.JSON"

# Empty skeleton of the per-product structure produced by make_final().
outputJSON = {
    "product_name": "",
    "product_id": None,
    "features": [],
    "issues": {"percentage": 0, "review_ids": []},
    "customer_service": {
        "sentiment": {"positive": 0, "neutral": 0, "negative": 0},
    },
}

# Connection settings for this cell; all of them are left blank here.
DB_USERNAME = DB_PASSWORD = DB_ACCOUNT = DATABASE = AL_KEY = ''

# Open the Cloudant session and select the database.
client = Cloudant(DB_USERNAME, DB_PASSWORD, account=DB_ACCOUNT)
client.connect()
db = client[DATABASE]

def make_final(cluster, db):
    """Assemble the final per-product JSON from clustered feature data.

    Args:
        cluster: dict with a ``'features'`` list. Each entry carries
            ``'feature'``, ``'keyword_count'``, ``'sentiments'`` and a
            ``'keywords'`` list whose items have ``'keyword'``,
            ``'review_id'`` and ``'sentence_id'``. It is not modified.
        db: Cloudant database queried for the reviews the keywords refer to.

    Returns:
        dict: ``product_name`` and ``product_id`` (left at their defaults),
        ``features`` (share of keywords and sentiment percentages per
        group), ``issues`` (percentage of lines typed "Issue" plus their
        review ids) and ``customer_service`` sentiment percentages.
    """
    outputJSON = {
        "product_name": "",
        "product_id": None,
        "features": [],
        "issues": {"percentage": 0, "review_ids": []},
        "customer_service": {
            "sentiment": {"positive": 0, "neutral": 0, "negative": 0},
        },
    }
    alchemy = AlchemyLanguageV1(api_key=AL_KEY)

    # Every review id referenced by any keyword, without duplicates.
    reviewnums = set()
    for group in cluster['features']:
        for keywords in group['keywords']:
            reviewnums.update(keywords['review_id'])

    # Fetch the classified documents of those reviews. `result` is an
    # attribute, not a method, and the matching document itself is kept
    # rather than whatever happens to be first in the result set.
    reviews = []
    for num in reviewnums:
        q = Query(db, selector={'review_id': num})
        for found in q.result:
            if found['type'] == ['classified']:
                reviews.append(found)

    issues = outputJSON["issues"]
    service = outputJSON["customer_service"]["sentiment"]
    total = 0
    issue_lines = 0
    for review in reviews:
        for line in review["review"]:
            total = total + 1
            if isinstance(line, list):
                if not line:
                    continue
                line = line[0]
            if not isinstance(line, dict):
                # Integer (or other non-dict) entries are counted but carry
                # nothing to inspect.
                continue
            if line.get("layer3type") == "Issue":
                issue_lines += 1
                issues["review_ids"].append(review["review_id"])
            if line.get("layer2type") == "Customer Service":
                sentiment = alchemy.sentiment(text=line["sentence"])["docSentiment"]["type"]
                if sentiment in service:
                    service[sentiment] = service[sentiment] + 1

    # Guard every ratio: no reviews or no keywords must not raise.
    if total > 0:
        issues["percentage"] = issue_lines / float(total) * 100

    customer_service_total = sum(service.values())
    if customer_service_total > 0:
        for sentiment in service:
            service[sentiment] = service[sentiment] / float(customer_service_total) * 100

    total = 0
    for item in cluster["features"]:
        keyword_count = item["keyword_count"]
        total = total + keyword_count
        feature = {
            "group_name": item["feature"],
            "percentage": keyword_count,
            # Build a new dict so the caller's cluster data is left intact.
            "sentiments": {
                sent: (value / float(keyword_count) * 100 if keyword_count else 0)
                for sent, value in item["sentiments"].items()
            },
            "keywords": [
                {
                    "name": elem["keyword"],
                    "review_id": elem["review_id"],
                    "sentence_id": elem["sentence_id"],
                }
                for elem in item["keywords"]
            ],
        }
        outputJSON["features"].append(feature)

    # Turn the raw keyword counts into each group's share of all keywords.
    for item in outputJSON["features"]:
        item["percentage"] = item["percentage"] / float(total) * 100 if total else 0

    return outputJSON

# Open a new Cloudant session with the settings defined in this cell.
client = cloudant.client.Cloudant(
    DB_USERNAME,
    DB_PASSWORD,
    account=DB_ACCOUNT,
)
client.connect()
db = client[DATABASE]

# NOTE(review): the classification is assigned to the in-memory document
# only; nothing here saves it back -- confirm whether that is intended.
for doc in db:
    doc['class'] = classify(doc['reviewText'])