# Processing Flow

The processing flow is run on all of the review data once the models are all trained up and validated. This is the flow that will actually turn the raw data into processed output.

# 1. Token Replacement

This step allows Alchemy to replace words in the sentences of the reviews with their semantic types (product, customer_service, company, etc.). These semantic types were defined when the WKS model was trained and are usually specific to a given data domain.

It replaces tokens in the 'reviewText' field of the documents stored in the database and saves the tagged sentences to the 'taggedReview' field of the same document.

In [None]:
import re
import os
import logging
import configparser
import cloudant
from watson_developer_cloud import AlchemyLanguageV1

#root logger of the notebook; at INFO level the debug traces below
#stay hidden unless the level is lowered
logger = logging.getLogger()
logger.setLevel(logging.INFO)

#the credentials are read from the .env file located one level above
#the current working directory
curdir = os.getcwd()
logger.debug(curdir)
credFilePath = os.path.join(curdir, '..', '.env')
config = configparser.ConfigParser()
config.read(credFilePath)
logger.debug(config.sections())

#id of the custom WKS model and the Alchemy client used by the
#functions of this cell
model_id = config['WKS']['WKS_MODEL_ID']
alchemy_api = AlchemyLanguageV1(api_key=config['ALCHEMY']['ALCHEMY_API_KEY'])


def get_entities(review):
    """Return the Alchemy entities response for a review.

    The review is converted with ``str`` and sent to the entities endpoint
    together with the custom model ``model_id``.

    Returns the service response, or an empty string when the call fails,
    so callers can still run ``'entities' in response`` safely.
    """
    #logger.debug(review)
    response = ''
    try:
        response = alchemy_api.entities(text=str(review), model=model_id)
    except Exception:
        # Best effort: keep processing the remaining reviews, but record the
        # traceback and let KeyboardInterrupt/SystemExit propagate.
        logger.exception("Error when getting entities.")
    logger.debug("Result from entities call: "+str(response))
    return response

def get_sentiment(review):
    """Return the document sentiment type reported by Alchemy for a review.

    A single space is returned when the response carries no 'docSentiment'
    entry or that entry has no 'type'.
    """
    response = alchemy_api.sentiment(text=str(review))
    logger.debug("Result from sentiment call: "+str(response))
    if 'docSentiment' not in response:
        return " "
    if 'type' not in response['docSentiment']:
        return " "
    return response['docSentiment']['type']

def token_replacement_entities(review):
    """Replace the entities found in a review by their semantic types.

    For every entity returned by ``get_entities`` the first whole-word
    occurrence of its text is replaced by ``<type>``. The text of entities
    whose type contains 'Feature' is also collected.

    Returns a ``(tagged_review, features)`` tuple; the review is returned
    unchanged, with an empty feature list, when no entities are available.
    """
    features = []
    entities_response = get_entities(review)
    if 'entities' not in entities_response:
        return review, features

    entities = entities_response['entities']
    if len(entities) == 0:
        logger.debug("ZERO entities found, returning review: "+str(review))
        return review, features

    logger.debug("List of entities found: "+str(entities))
    for entity in entities:
        token = entity['text']
        entity_type = entity['type']
        if 'Feature' in entity_type:
            features.append(token)
        classification = "<" + entity_type + ">"
        # re.escape makes the token literal; an escaped space still matches a
        # plain space, so no un-escaping step is needed.
        pattern = r"\b%s\b" % re.escape(token)
        # A callable replacement inserts the tag verbatim: backslashes or
        # group references in the type are not interpreted as a template.
        review = re.sub(pattern, lambda match: classification, review,
                        count=1)
    return review, features
    
#Initializing Cloudant client
client = cloudant.client.Cloudant(config['CLOUDANT']['CLOUDANT_USERNAME'],
                                  config['CLOUDANT']['CLOUDANT_PASSWORD'],
                                  account=config['CLOUDANT']['CLOUDANT_USERNAME'])

#Going through all the documents and replacing the tokens by
#their semantic types. The result is saved back to the Cloudant document
#in the 'taggedReview' and 'features' fields; the sentiment of the
#review is saved in the 'sentiment' field.
client.connect()
db = client[config['CLOUDANT']['CLOUDANT_DB']]
counter = 0
for doc in db:
    counter +=1
    #logger.debug(doc)
    #documents without review text (e.g. design documents) cannot be
    #processed; skip them instead of reporting two save errors
    if 'reviewText' not in doc:
        logger.warning('Skipping document %s: no reviewText field.',
                       doc.get('_id'))
        continue
    #the two steps are saved independently so that a failure in one
    #does not lose the result of the other; "except Exception" keeps
    #the loop interruptible and the traceback is logged
    try:
        doc['taggedReview'], doc['features'] = \
            token_replacement_entities(doc['reviewText'])
        doc.save()
    except Exception:
        logger.exception('Error saving tagged review to Cloudant document.')
    try:
        doc['sentiment'] = get_sentiment(doc['reviewText'])
        doc.save()
        logger.debug(doc['sentiment'])
    except Exception:
        logger.exception('Error saving sentiment to Cloudant document.')
    #if (counter == 2):
    #    break

# 2. Classification of reviews

This step uses the Natural Language Classifier (NLC) created in the Training notebook. It classifies each review and adds the result of the classification to a new field (called 'class') of the document in the database.

In [None]:
import json
from watson_developer_cloud import NaturalLanguageClassifierV1
import sys
import os
import logging
import configparser
import cloudant

#root logger of the notebook, reporting at INFO level
logger = logging.getLogger()
logger.setLevel(logging.INFO)

#the credentials are read from the .env file located one level above
#the current working directory
curdir = os.getcwd()
logger.debug(curdir)
credFilePath = os.path.join(curdir, '..', '.env')
config = configparser.ConfigParser()
config.read(credFilePath)

#NLC credentials and the id of the classifier to use
NLC_USERNAME = config['NLC']['NLC_USERNAME']
NLC_PASSWORD = config['NLC']['NLC_PASSWORD']
NLC_CLASSIFIER = config['NLC']['NLC_CLASSIFIER']

#classifier client used by classify()
nlc = NaturalLanguageClassifierV1(username=NLC_USERNAME, password=NLC_PASSWORD)

def classify(review):
    """Classify a review with the NLC classifier ``NLC_CLASSIFIER``.

    Returns the 'classes' list of the service response when it holds more
    than one class. In every other case (call failure, malformed response,
    one class or none) the placeholder string 'no class' is returned, so
    the function never returns None.
    """
    logger.debug(review)
    #Classify sentence
    try:
        response = nlc.classify(NLC_CLASSIFIER, review)
        logger.debug(response)
        if len(response['classes']) > 1:
            return response['classes']
    except Exception:
        # Best effort: log the traceback and fall through to the placeholder.
        logger.exception('Failed at sentence classification')
    return 'no class'

#Initializing Cloudant client
client = cloudant.client.Cloudant(config['CLOUDANT']['CLOUDANT_USERNAME'],
                                  config['CLOUDANT']['CLOUDANT_PASSWORD'],
                                  account=config['CLOUDANT']['CLOUDANT_USERNAME'])

#Classifying the tagged text of every document and saving the result
#in its 'class' field
client.connect()
db = client[config['CLOUDANT']['CLOUDANT_DB']]
for doc in db:
    logger.debug(doc)
    #documents that were never tagged (e.g. design documents) have
    #nothing to classify; report them as skipped, not as a save error
    if 'taggedReview' not in doc:
        logger.warning('Skipping document %s: no taggedReview field.',
                       doc.get('_id'))
        continue
    try:
        doc['class'] = classify(doc['taggedReview'])
        doc.save()
    except Exception:
        #keeps the loop interruptible and records the traceback
        logger.exception('Error saving classification to Cloudant document.')

# 3. Grouping products

This script creates a product document in the database for each product that has been reviewed. It also attaches a list of the review IDs to the new product document.


In [54]:
import cloudant
import os
import logging
import configparser
import couchdb
import json
import ast
from sets import Set
from flask import jsonify

#the root logger is configured in this cell as well, so that the cell
#does not depend on an earlier cell having defined 'logger'
logger = logging.getLogger()
logger.setLevel(logging.INFO)

#getting current directory
curdir = os.getcwd()
logger.debug(curdir)

#loading credentials from .env file
credFilePath = os.path.join(curdir,'..','.env')
config = configparser.ConfigParser()
config.read(credFilePath)

client = cloudant.client.Cloudant(config['CLOUDANT']['CLOUDANT_USERNAME'],
                                  config['CLOUDANT']['CLOUDANT_PASSWORD'],
                                  account=config['CLOUDANT']['CLOUDANT_USERNAME'])
client.connect()
db = client[config['CLOUDANT']['CLOUDANT_DB']]

#Using couchdb instead of Cloudant
couch = couchdb.Server("https://%s.cloudant.com" % config['CLOUDANT']['CLOUDANT_USERNAME'])
couch.resource.credentials = (config['CLOUDANT']['CLOUDANT_USERNAME'], config['CLOUDANT']['CLOUDANT_PASSWORD'])

#The 'products' database is rebuilt from scratch on every run.
#Looking up a missing database with couch['products'] raises an error
#instead of returning None, so its existence is tested with 'in'.
if 'products' in couch:
    couch.delete('products')
db_output = couch.create('products')

#Creating a dictionary to store the metadata about each product,
#keyed by the product title
products = {}
#scratch set used to merge feature lists without duplicates
allFeatures = set()
#maps the sentiment label stored on a review to the counter it increments
sentimentCounters = {'positive': 'posCount',
                     'neutral': 'neuCount',
                     'negative': 'negCount'}
for doc in db:
    #documents without a title (e.g. design documents) cannot be
    #attributed to a product
    title = doc.get('title')
    if not title:
        continue
    if title not in products:
        products[title] = {
            'product_name': title,
            'customer_service': {'sentiment': {'posCount': 0,
                                               'neuCount': 0,
                                               'negCount': 0}},
            #copied so the product never shares the review's own list
            'features': list(doc.get('features') or []),
            'issues': {'percentage': 0, 'review_ids': []},
            'reviewCount': 0,
            'product_id': doc.get('asin'),
        }
    product = products[title]
    product['reviewCount'] += 1

    #'class' is missing or empty when the review was not classified; a
    #placeholder string is harmless because none of its characters
    #contains 'class_name'
    classes = doc.get('class') or []
    for classification in classes:
        if 'class_name' not in classification:
            continue
        className = classification['class_name']
        confidence = classification['confidence']
        #reviews classified as an issue above the threshold are listed
        #on the product
        if className == "Issue" and confidence > 0.1:
            product['issues']['review_ids'].append(doc['_id'])
        #features of reviews classified as feature reviews are merged,
        #without duplicates, into the features of the product
        elif className == "Feature" and confidence > 0.3 and 'features' in doc:
            allFeatures.update(doc['features'])
            allFeatures.update(product['features'])
            product['features'] = list(allFeatures)
            allFeatures.clear()

    #updating the sentiment count
    counterName = sentimentCounters.get(doc.get('sentiment'))
    if counterName is not None:
        product['customer_service']['sentiment'][counterName] += 1
    
#Storing every product in the 'products' database, after computing the
#share of its reviews that were flagged as issues (rounded to 3 decimals)
for value in products.values():
    issueCount = len(value['issues']['review_ids'])
    value['issues']['percentage'] = round(float(issueCount) / value['reviewCount'], 3)
    db_output.save(value)


INFO:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): 204f49bc-b226-413d-8dcf-aece9c16ce89-bluemix.cloudant.com


# 4. Clustering product feature sentences

The script that runs the clustering on the review data is src/Processing/clustering.py.
A sample word2vec model is provided.

In [56]:
from gensim.models import word2vec
import cloudant
import logging
import re
import os
import configparser

#the root logger is configured in this cell as well, so that the cell
#does not depend on an earlier cell having defined 'logger'
logger = logging.getLogger()
logger.setLevel(logging.INFO)

#getting current directory
curdir = os.getcwd()
logger.debug(curdir)

#loading credentials from .env file
credFilePath = os.path.join(curdir,'..','.env')
config = configparser.ConfigParser()
config.read(credFilePath)

client = cloudant.client.Cloudant(config['CLOUDANT']['CLOUDANT_USERNAME'],
                                  config['CLOUDANT']['CLOUDANT_PASSWORD'],
                                  account=config['CLOUDANT']['CLOUDANT_USERNAME'])
client.connect()
#product documents to cluster, and the database holding the reviews
products_db = client['products']
reviews_db = client[config['CLOUDANT']['CLOUDANT_DB']]

#Please provide the path to the word2vec model you created. 
#The path provided by default points to the available sample
#model file
W2V_MODEL = os.path.join(curdir,'..','data','sample_model.bin')


def generate_vectors(features, model):
    """Build one vector per feature by summing the vectors of its words.

    features -- iterable of feature strings; each one is split on whitespace.
    model    -- object supporting ``word in model`` and ``model[word]``; the
                returned vectors must support ``+``.

    A feature is skipped when it has no words or when any of its words is
    missing from the model.

    Returns ``[vecs, mapping]`` where ``vecs`` holds the summed vectors and
    ``mapping[k]`` is the position in ``features`` that produced ``vecs[k]``.
    """
    vecs = []
    mapping = []
    for position, line in enumerate(features):
        total = None
        for word in line.split():
            # Backslashes are dropped so the lookup uses the plain token.
            word = str(word).replace('\\', '')
            if word not in model:
                # One unknown word invalidates the whole feature.
                total = None
                break
            # "is None" rather than a length test, so the sum is also right
            # for 1-dimensional vectors.
            total = model[word] if total is None else total + model[word]
        if total is not None:
            vecs.append(total)
            mapping.append(position)
    return [vecs, mapping]

def cluster_try(vecs, threshold=0.8):
    """Greedily cluster vectors by cosine similarity.

    Each vector is compared with the running sum of every existing cluster.
    It joins the cluster with the highest similarity above ``threshold``, or
    starts a new cluster when none qualifies.

    vecs      -- sequence of numeric vectors of equal length.
    threshold -- minimum cosine similarity required to join a cluster.

    Returns a dict mapping a cluster number (0, 1, ...) to the list of
    positions in ``vecs`` assigned to it; empty input gives an empty dict.
    The input vectors are never modified.
    """
    # Imported here so the function does not rely on 'np' being defined
    # elsewhere in the notebook.
    import numpy as np

    clusterIdx = {}
    clusterVec = []
    for i in range(len(vecs)):
        vec = vecs[i]
        vec_norm = np.linalg.norm(vec)
        # The threshold is reset for every vector; a strong earlier match
        # must not raise the bar for later vectors.
        best_sim = threshold
        best_cluster = None
        for j, centroid in enumerate(clusterVec):
            denominator = np.linalg.norm(centroid) * vec_norm
            if denominator == 0:
                # Cosine similarity is undefined for a zero vector.
                continue
            sim = np.dot(vec, centroid) / denominator
            if sim > best_sim:
                best_sim = sim
                best_cluster = j
        if best_cluster is None:
            clusterIdx[len(clusterVec)] = [i]
            clusterVec.append(vec)
        else:
            clusterIdx[best_cluster].append(i)
            # Rebinding instead of "+=" keeps the caller's arrays intact.
            clusterVec[best_cluster] = clusterVec[best_cluster] + vec
    return clusterIdx
   
#Loading the word2vec model and clustering the features of every product.
#The result is saved in the 'clusters' field of the product document.
model = word2vec.Word2Vec.load_word2vec_format(W2V_MODEL, binary=True)
for doc in products_db:
    features = doc.get('features') or []
    if not features:
        continue
    vecs, mapping = generate_vectors(features, model)
    #nothing to cluster when none of the features is fully covered by
    #the vocabulary of the model
    if not vecs:
        continue
    clusters = cluster_try(vecs)
    #cluster_try numbers the vectors it received; 'mapping' translates
    #these numbers back to positions in doc['features'], which differ
    #as soon as one feature was skipped
    doc['clusters'] = dict((cluster_id, [mapping[i] for i in members])
                           for cluster_id, members in clusters.items())
    doc.save()


INFO:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): 204f49bc-b226-413d-8dcf-aece9c16ce89-bluemix.cloudant.com
INFO:gensim.models.word2vec:loading projection weights from /Users/priscillamoraes/git/product-intelligence/notebooks/../data/sample_model.bin
INFO:gensim.models.word2vec:loaded (71290, 200) matrix from /Users/priscillamoraes/git/product-intelligence/notebooks/../data/sample_model.bin
