## Import libraries

In [9]:
# basic libraries
import json
import nltk
import string
import os

# nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# collections
from collections import defaultdict
from collections import Counter

# elasticsearch
from elasticsearch import Elasticsearch

# alchemy for language analysis
from alchemyapi import AlchemyAPI

# initialization
# Directory holding the Yelp data files. Set the YELP_DATA_DIR environment
# variable to point at a different location without editing the notebook;
# the original hard-coded path is kept as the fallback.
DATA_DIR = os.environ.get('YELP_DATA_DIR', '/home/rt/wrk/w209/yelp/data')

## Data Cleansing - Tokenization and Lemmatization using NLTK

In [10]:
# utility function to parse and return json
def process_json(filename):
    """Parse a newline-delimited JSON file into a list of records.

    Each non-blank line of ``filename`` is decoded with ``json.loads``.
    Decoded values that contain a ``'create'`` key/member are skipped; every
    other value is kept in file order.

    Args:
        filename: path of the file to read.

    Returns:
        list of the decoded JSON values that were not skipped.

    Raises:
        IOError/OSError: if the file cannot be opened.
        ValueError: if a non-blank line is not valid JSON.
    """
    result = []

    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(filename, "r") as infile:
        for line in infile:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            record = json.loads(line)
            if 'create' in record:
                continue
            result.append(record)

    return result

In [37]:
def process_review(file_name, ids=None):
    """Normalise review text per business and hand it to ``nlp_process``.

    Reads the reviews in ``DATA_DIR/file_name`` via ``process_json``,
    lower-cases and tokenizes each review's ``'text'``, removes English stop
    words, lemmatizes the remaining tokens with WordNet, and groups the
    resulting strings by the review's ``'business_id'``. The grouped reviews
    are then passed to ``nlp_process`` together with ``ids``.

    Args:
        file_name: name of the review file, relative to ``DATA_DIR``.
        ids: optional list of business ids to analyse. Defaults to the two
            business ids this notebook was originally written for.

    Returns:
        None. Progress is printed; results are produced by ``nlp_process``.
    """
    print("processing file name {}".format(file_name))

    reviews = process_json(os.path.join(DATA_DIR, file_name))
    print("# of reviews: {}".format(len(reviews)))

    if ids is None:
        ids = ['6imLt53br7SJ3av07jjH7w', '41j3GB7M-Lwq284Pfb9zgw']

    # Groups are non-capturing so that tokenizers built on re.findall return
    # whole-match strings rather than tuples of group contents.
    # The hyphen in the last character class is escaped: unescaped, `:-_`
    # is a range (':' through '_') instead of the three literal characters.
    tokenization_pattern = r'''(?x)    # set flag to allow verbose regexps
    (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*        # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
    | \w+[\x90-\xff]  # these are escaped emojis
    | [][.,;"'?():\-_`]  # these are separate tokens
    '''
    word_tokenizer = nltk.tokenize.regexp.RegexpTokenizer(tokenization_pattern)
    wordnet_lemmatizer = WordNetLemmatizer()

    # Build the stop-word set once; looking it up per token re-read the
    # corpus list for every word of every review.
    english_stopwords = frozenset(stopwords.words('english'))

    ids_hash = defaultdict(list)
    for item in reviews:
        # Text is lower-cased before tokenizing, so tokens need no further
        # case folding before the stop-word check.
        tokens = word_tokenizer.tokenize(item['text'].lower())
        normalized = [w for w in tokens if w not in english_stopwords]
        treated_review = ' '.join(wordnet_lemmatizer.lemmatize(w) for w in normalized)
        ids_hash[item['business_id']].append(treated_review)

    # ids_hash is keyed by business id, so this is the number of businesses.
    print("Length of treated reviews = {}".format(len(ids_hash)))

    nlp_process(ids, ids_hash)

## Sentiment Analysis and Topic Modeling using Alchemy API

In [16]:
def nlp_process(ids,ids_hash):
    """Run AlchemyAPI keyword/concept extraction for the given businesses.

    For every id in ``ids`` the review strings in ``ids_hash[id]`` are joined
    into one text and sent to ``AlchemyAPI.combined`` with sentiment enabled.
    One JSON document per business is written to ``business_level.txt``
    (overwritten on each call), one document per line, with the keys
    ``business_id``, ``word_freq`` (keywords) and ``topics`` (concepts).

    Businesses with no review text are skipped without calling the API. If
    the API reports a non-OK status, the error is printed and a document
    with empty keyword/concept lists is still written.

    Args:
        ids: iterable of business ids to process.
        ids_hash: mapping of business id -> list of review strings.

    Returns:
        None.
    """
    #instantiate an alchemy client
    alchemyapi = AlchemyAPI()

    # `with` closes (and therefore flushes) the file even if a call fails.
    with open('business_level.txt', 'w') as outfile:
        for item in ids:
            # .get avoids inserting empty entries into a defaultdict and
            # also accepts a plain dict.
            data = ' '.join(ids_hash.get(item, []))
            if len(data) == 0:
                # Nothing to analyse; do not spend an API call on it.
                continue

            alchem_keywords = []
            alchem_concepts = []

            response = alchemyapi.combined('text', data, {'sentiment': 1, 'maxRetrieve': 100})

            if response.get('status') == 'OK':
                print('#Success#')
                for keyword in response.get('keywords', []):
                    # Text is kept as decoded text; json.dump escapes
                    # non-ASCII characters itself.
                    al_temp = {
                        'text': keyword['text'],
                        'relevance': keyword['relevance'],
                        'sentiment': keyword['sentiment']['type'],
                    }

                    # Only inspected when present: the original code also
                    # treated the sentiment score as optional.
                    if 'score' in keyword['sentiment']:
                        al_temp['score'] = keyword['sentiment']['score']

                    alchem_keywords.append(al_temp)

                for concept in response.get('concepts', []):
                    alchem_concepts.append({
                        'text': concept['text'],
                        'relevance': concept['relevance'],
                    })
            else:
                # Single formatted string: print with two arguments shows a
                # tuple repr under Python 2.
                print('Error in keyword extaction call: %s' % response.get('statusInfo'))
            print("{} {}".format(len(alchem_keywords), len(alchem_concepts)))

            # prepare body for insertion
            doc = {
                "business_id" : item,
                "word_freq": alchem_keywords,
                "topics": alchem_concepts
            }

            json.dump(doc, outfile)
            # One document per line so the file can be read back line by line.
            outfile.write('\n')

In [33]:
# NOTE(review): in the cells visible here, `ids` and `ids_hash` are only
# assigned as locals inside process_review(); this call presumably relies on
# names left in the kernel from an earlier run -- confirm it survives
# Restart & Run All.
nlp_process(ids, ids_hash)

#Success#
99 8
#Success#
100 8


## Results

In [35]:
def nlp_results(file_name):
    """Print a plain-text report of the per-business NLP results.

    ``file_name`` is read line by line; every non-blank line must be a JSON
    object with the keys ``business_id``, ``topics`` (list of objects with
    ``text`` and ``relevance``) and ``word_freq`` (list of objects with
    ``text``, ``sentiment``, ``relevance`` and optionally ``score``).

    For each business all concepts are listed, followed by the keywords whose
    sentiment is ``'positive'`` or ``'negative'``; a missing score is shown
    as ``0.0``.

    Args:
        file_name: path of the results file to display.

    Returns:
        None. The report is written to standard output.
    """
    def _printable(text):
        # Under Python 2, JSON text is `unicode` and must be encoded before
        # being formatted into a byte-string template, otherwise non-ASCII
        # characters raise UnicodeEncodeError. Native `str` values are
        # passed through untouched.
        if not isinstance(text, str):
            text = text.encode('utf-8')
        return text

    heavy_rule = "=" * 80
    light_rule = "-" * 80
    keyword_row = "{0: <50} | {1: <10} | {2: <10} | {3}"
    concept_row = "{0: <50} | {1}"

    # `with` guarantees the handle is closed once the report is printed.
    with open(file_name, "r") as infile:
        for raw_line in infile:
            raw_line = raw_line.strip()
            if not raw_line:
                # Tolerate blank lines between or after documents.
                continue
            record = json.loads(raw_line)

            print(heavy_rule)
            print("business_id : {}".format(record['business_id']))
            print(heavy_rule)

            print(light_rule)
            print("concepts")
            print(light_rule)
            print(concept_row.format('text', 'relevance'))
            for concept in record['topics']:
                print(concept_row.format(_printable(concept['text']), concept['relevance']))
            print(light_rule)

            print("keywords")
            print(light_rule)
            print(keyword_row.format('text', 'sentiment', 'score', 'relevance'))
            print(light_rule)
            for keyword in record['word_freq']:
                # Neutral keywords are left out of the report.
                if keyword['sentiment'] in ['positive', 'negative']:
                    print(keyword_row.format(
                        _printable(keyword['text']),
                        keyword['sentiment'],
                        keyword.get('score', 0.0),
                        keyword['relevance']))

            print(light_rule)

In [36]:
nlp_results('business_level.txt')

business_id : 6imLt53br7SJ3av07jjH7w
--------------------------------------------------------------------------------
concepts
--------------------------------------------------------------------------------
text                                               | relevance
Bread                                              | 0.907779
Bar                                                | 0.888896
Anchovy                                            | 0.803765
Pizza                                              | 0.746982
Place identity                                     | 0.746041
Italian cuisine                                    | 0.741774
Elevator                                           | 0.720353
Phoenix, Arizona                                   | 0.718297
--------------------------------------------------------------------------------
keywords
--------------------------------------------------------------------------------
text                                               | sentiment