<a href="https://colab.research.google.com/github/DawenZhang/online_review_intelligent_kano/blob/filled/product_review_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#@markdown # webpage preparation

#@markdown the webpage zip file path; if you wish to upload file, leave it blank
webpage_zip_path = "https://github.com/DawenZhang/online_review_intelligent_kano/raw/filled/provided/product_2/review_pages.zip" #@param {type:"string"}

from google.colab import files
import urllib.request
import zipfile

# Resolve the archive to a local path: an interactive upload when the form
# field is blank, otherwise a download of the given URL.
if webpage_zip_path == "":
  uploaded = {}
  # Keep prompting until at least one file has been uploaded.
  while len(uploaded) == 0:
    print("as the path field is left blank, please upload webpage zip file")
    uploaded = files.upload()
  # Only the first uploaded file is used as the archive.
  file_path = next(iter(uploaded))
else:
  # urlretrieve stores the download locally and returns (local_path, headers).
  file_path = urllib.request.urlretrieve(webpage_zip_path)[0]

print('webpage received')
# The context manager closes the archive even when extraction fails part-way.
with zipfile.ZipFile(file_path, 'r') as zip_ref:
  zip_ref.extractall('webpage/')

In [0]:
import os
import sys
import json
import re
import pandas as pd
from datetime import date
from datetime import datetime

#@markdown the folder containing webpage files, no ending slash
folder_name = "review_pages" #@param {type:"string"}

# Column order shared by every tuple in `dataset` and by `dataframe`.
_REVIEW_COLUMNS = ['review_id', 'product_id', 'review_content', 'rating', 'date', 'helpful_vote', 'verified_purchase', 'top_reviewer', 'title', 'bought_model', 'is_anomaly']

# All patterns are compiled once here rather than once per scanned line.
# Markers that open the review list, close it, and terminate a single review.
_LIST_START_RE = re.compile('cm_cr-review_list')
_LIST_END_RE = re.compile('a-form-actions a-spacing-top-extra-large')
_REVIEW_END_RE = re.compile('cr-footer-line-height')
# Field patterns; a capture group, where present, holds the field value.
_RATING_RE = re.compile('<i data-hook="review-star-rating" class="a-icon a-icon-star a-star-([0-5]*?) review-rating">')
_TITLE_RE = re.compile('<a data-hook="review-title" class="a-size-base a-link-normal review-title a-color-base a-text-bold" href=".*?">(.*?)</a>')
_TOP_REVIEWER_RE = re.compile('badge-top-1000-reviewer')
_BOUGHT_MODEL_RE = re.compile('class="a-size-mini a-link-normal a-color-secondary" href=".*?">(.*?)</a>')
_VERIFIED_PURCHASE_RE = re.compile('class="a-size-mini a-color-state a-text-bold">Verified Purchase</span>')
_REVIEW_CONTENT_RE = re.compile('<span data-hook="review-body" class="a-size-base review-text">(.*?)</span></div>')
# Accepts the plural wording and the singular "person" wording.
_HELPFUL_VOTE_RE = re.compile('<span data-hook="helpful-vote-statement" class="a-size-base a-color-tertiary cr-vote-text">(.*?) (?:people|person) found this helpful</span>')
_REVIEW_DATE_RE = re.compile('<span data-hook="review-date" class="a-size-base a-color-secondary review-date">(.*?)</span>')


def _parse_helpful_votes(vote_text):
  """Turn the count captured from a helpful-vote statement into an int.

  Thousands separators are removed before conversion. The word "One" is
  mapped to 1.
  NOTE(review): this assumes a single vote is rendered as
  "One person found this helpful" - confirm against the saved pages.

  Raises:
    ValueError: if the text is neither "One" nor a number.
  """
  cleaned = vote_text.replace(',', '').strip()
  if cleaned.lower() == 'one':
    return 1
  return int(cleaned)


def _extract_reviews(lines, first_review_id, product_id = 1):
  """Collect the reviews found in the lines of one saved review page.

  Scanning starts at the first line containing the review-list marker and
  stops at the line containing the end-of-list marker. Field values found on
  the way are accumulated until a line containing the review-end marker is
  seen, at which point one tuple is emitted and the accumulator is cleared.
  A review that never reaches its end marker is dropped.

  Args:
    lines: iterable of text lines (an open text file works).
    first_review_id: id given to the first review found; later reviews get
      consecutive ids.
    product_id: value stored in the product_id column of every review.

  Returns:
    A list of tuples ordered as `_REVIEW_COLUMNS`. `verified_purchase` and
    `top_reviewer` are True when their marker was seen and None otherwise;
    `is_anomaly` is always None.
  """
  reviews = []
  inside_review_list = False
  review = dict.fromkeys(_REVIEW_COLUMNS)

  for raw_line in lines:
    line = raw_line.strip()

    if _LIST_START_RE.search(line):
      inside_review_list = True
    if not inside_review_list:
      continue
    if _LIST_END_RE.search(line):
      # Nothing after the end of the review list is of interest.
      break

    found = _RATING_RE.search(line)
    if found:
      review['rating'] = int(found.group(1))

    found = _TITLE_RE.search(line)
    if found:
      review['title'] = found.group(1)

    if _TOP_REVIEWER_RE.search(line):
      review['top_reviewer'] = True

    found = _BOUGHT_MODEL_RE.search(line)
    if found:
      review['bought_model'] = found.group(1)

    if _VERIFIED_PURCHASE_RE.search(line):
      review['verified_purchase'] = True

    found = _REVIEW_CONTENT_RE.search(line)
    if found:
      review['review_content'] = found.group(1)

    found = _HELPFUL_VOTE_RE.search(line)
    if found:
      review['helpful_vote'] = _parse_helpful_votes(found.group(1))

    found = _REVIEW_DATE_RE.search(line)
    if found:
      review['date'] = datetime.strptime(found.group(1), '%B %d, %Y')

    # Checked last so that fields sitting on the same line as the review-end
    # marker still belong to the review being closed.
    if _REVIEW_END_RE.search(line):
      review['review_id'] = first_review_id + len(reviews)
      review['product_id'] = product_id
      reviews.append(tuple(review[column] for column in _REVIEW_COLUMNS))
      review = dict.fromkeys(_REVIEW_COLUMNS)

  return reviews


page_dir = os.path.join('webpage', folder_name)
# os.listdir gives no ordering guarantee; sorting keeps the assigned review
# ids identical from one run to the next.
webpage_names = sorted(os.listdir(page_dir))

dataset = []

# Next review id to hand out; ids run consecutively across all pages.
comment_count = 1
for webpage_name in webpage_names:
  webpage_path = os.path.join(page_dir, webpage_name)
  if not os.path.isfile(webpage_path):
    # Sub-directories cannot be opened as pages, so skip them.
    continue
  with open(webpage_path, 'r', encoding = 'utf-8') as webpage_file:
    page_reviews = _extract_reviews(webpage_file, comment_count)
  dataset.extend(page_reviews)
  comment_count += len(page_reviews)

dataframe = pd.DataFrame(data = dataset, columns = _REVIEW_COLUMNS)

In [0]:
import os
import urllib.request

#@markdown google nlp credential url; if you wish to upload file, leave it blank
#@markdown if no credential owned, please generate one from Google Cloud Console 
#@markdown **it is strongly unrecommended to expose your credential file on the web**
google_nlp_credential_web_path = "" #@param {type:"string"}

if google_nlp_credential_web_path != "":
  # A URL was given: download it; urlretrieve returns (local_path, headers).
  credential_file_path = urllib.request.urlretrieve(google_nlp_credential_web_path)[0]
else:
  from google.colab import files
  uploaded = {}
  # Keep prompting until at least one file has been uploaded.
  while len(uploaded) == 0:
    print("as the path field is left blank, please upload credential file")
    uploaded = files.upload()
  # Only the first uploaded file is used as the credential.
  credential_file_path = next(iter(uploaded))

# Export the credential path through the environment for later cells.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_file_path

In [0]:
#@markdown # Google NLP Analysis
#@markdown run to process

# Row accumulators, one list per output table, plus the ids of reviews that
# could not be analysed.
entity_dataset = []
mention_dataset = []
document_dataset = []
sentence_dataset = []
error_reviews = []

import os
import sys
import argparse
import json
from time import sleep

# NOTE(review): the `enums` and `types` submodules are believed to exist only
# in google-cloud-language releases before 2.0 - confirm the installed version.
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
from google.protobuf import json_format
from google.cloud.language_v1.types import AnalyzeEntitySentimentResponse
from google.cloud.language_v1.types import AnalyzeSentimentResponse

# Labels indexed by the numeric type value reported by the API.
entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION', 'EVENT', 'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')
mention_type = ('TYPE_UNKNOWN', 'PROPER', 'COMMON')


def _type_label(labels, type_value):
  """Return the label for a numeric type value.

  A value outside `labels` is returned as its number in text form instead of
  raising IndexError, so one unexpected type cannot discard a review's rows.
  NOTE(review): the API is believed to define entity types beyond OTHER
  (for example numbers and prices) - confirm and extend `entity_type`.
  """
  if 0 <= type_value < len(labels):
    return labels[type_value]
  return str(int(type_value))


client = language.LanguageServiceClient()

# The encoding does not depend on the review, so it is chosen once: UTF16 when
# sys.maxunicode is 65535, UTF32 otherwise.
encoding = enums.EncodingType.UTF32
if sys.maxunicode == 65535:
  encoding = enums.EncodingType.UTF16

meta_count = 0
total_count = len(dataset)
# round(total_count / 10) is 0 for five or fewer reviews, which would make the
# modulo below raise ZeroDivisionError; report at least every review instead.
progress_step = max(1, round(total_count / 10))

for meta_review in dataset:

  if meta_count % progress_step == 0:
    print("analyzing: ", str(meta_count), "/", str(total_count))
  meta_count += 1

  # meta_review is (review_id, product_id, review_content, ...).
  review_id = meta_review[0]
  product_id = meta_review[1]
  review_text = meta_review[2]

  if review_text is None:
    # No review body was extracted, so there is nothing to analyse; record it
    # as failed rather than crashing on None.encode().
    error_reviews.append((review_id, product_id))
    continue

  # Pause before each request - presumably to stay within API rate limits.
  sleep(0.2)
  document = types.Document(
      content = review_text.encode('utf-8'),
      type = enums.Document.Type.PLAIN_TEXT
  )

  try:
    entity_result = client.analyze_entity_sentiment(document, encoding)

    # entity_id is the 1-based position of the entity within this review.
    for entity_id, entity in enumerate(entity_result.entities, start = 1):
      entity_dataset.append((review_id, product_id, entity_id, entity.salience, entity.sentiment.score, entity.sentiment.magnitude, entity.name, _type_label(entity_type, entity.type)))

      for mention in entity.mentions:
        mention_dataset.append((review_id, product_id, entity_id, mention.text.begin_offset, mention.sentiment.score, mention.sentiment.magnitude, mention.text.content, _type_label(mention_type, mention.type)))

  except Exception:
    # Best effort: remember the failing review and carry on with the rest.
    error_reviews.append((review_id, product_id))

  try:
    document_result = client.analyze_sentiment(document, encoding)
    document_dataset.append((review_id, product_id, document_result.document_sentiment.score, document_result.document_sentiment.magnitude))

    for sentence in document_result.sentences:
      sentence_dataset.append((review_id, product_id, sentence.text.begin_offset, sentence.text.content, sentence.sentiment.score, sentence.sentiment.magnitude))
  except Exception:
    # A review failing both analyses is listed twice in error_reviews.
    error_reviews.append((review_id, product_id))

print('the following reviews(review_id, product_id) have got errors when being processed by Google Cloud NLP: ', error_reviews)

entity_dataframe = pd.DataFrame(data = entity_dataset, columns = ['review_id', 'product_id', 'entity_id', 'salience', 'sentiment_score', 'sentiment_magnitude', 'name', 'type'])
mention_dataframe = pd.DataFrame(data = mention_dataset, columns = ['review_id', 'product_id', 'entity_id', 'begin_offset', 'sentiment_score', 'sentiment_magnitude', 'content', 'type'])
sentence_dataframe = pd.DataFrame(data = sentence_dataset, columns = ['review_id', 'product_id', 'begin_offset', 'content', 'sentiment_score', 'sentiment_magnitude'])
document_dataframe = pd.DataFrame(data = document_dataset, columns = ['review_id', 'product_id', 'sentiment_score', 'sentiment_magnitude'])

In [0]:
#@markdown tick and run to download datasheet
download_datasheet = True #@param {type:"boolean"}
if download_datasheet == True:

  from google.colab import files

  def _save_and_download(frame, csv_name):
    """Write `frame` to `csv_name` (header, no index) and hand it to the browser."""
    frame.to_csv(csv_name, index = False, header = True)
    files.download(csv_name)

  # One CSV per table, written and downloaded in this order.
  _save_and_download(dataframe, 'customer_reviews.csv')
  _save_and_download(entity_dataframe, 'entity_sentiment.csv')
  _save_and_download(mention_dataframe, 'mention_sentiment.csv')
  _save_and_download(sentence_dataframe, 'sentence_sentiment.csv')
  _save_and_download(document_dataframe, 'document_sentiment.csv')