<a href="https://colab.research.google.com/github/DawenZhang/online_review_intelligent_kano/blob/filled/product_review_anomaly_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#@markdown # data preparation

#@markdown the product used for anomaly detection
# The value below is an int literal, so the form field is declared as an
# integer to keep it that way when edited through the form.
# NOTE(review): assumes the product_id column of the datasheets holds
# integers -- confirm against the data.
product_id = 1 #@param {type:"integer"}


import pandas as pd


#@markdown the datasheet paths; leave a path blank to upload that file instead
customer_reviews_datasheet_path = "https://github.com/DawenZhang/online_review_intelligent_kano/raw/filled/provided/product_2/customer_reviews.csv" #@param {type:"string"}
document_sentiment_datasheet_path = "https://github.com/DawenZhang/online_review_intelligent_kano/raw/filled/provided/product_2/document_sentiment.csv" #@param {type:"string"}


def _read_datasheet(datasheet_path, datasheet_name):
  """Load one CSV datasheet into a DataFrame.

  Args:
    datasheet_path: location handed to ``pd.read_csv``; an empty string
      switches to an interactive upload through ``google.colab.files``.
    datasheet_name: name shown in the upload prompt.

  Returns:
    The parsed ``pandas.DataFrame``.
  """
  if datasheet_path != "":
    return pd.read_csv(datasheet_path)

  # Imported here because these are only needed for the upload fallback.
  import io
  from google.colab import files

  uploaded = {}
  # Keep prompting until the upload dialog returns at least one file.
  while not uploaded:
    print("as the path field is left blank, please upload " + datasheet_name)
    uploaded = files.upload()

  # Only the first uploaded entry is used; its content is decoded as UTF-8.
  first_file_content = next(iter(uploaded.values()))
  return pd.read_csv(io.StringIO(first_file_content.decode('utf-8')))


customer_reviews = _read_datasheet(customer_reviews_datasheet_path, "customer_reviews_datasheet")
document_sentiment = _read_datasheet(document_sentiment_datasheet_path, "document_sentiment_datasheet")


# Join each review with its sentiment row; both sheets must share these keys.
crds = customer_reviews.merge(document_sentiment, on = ["review_id", "product_id"])


#@markdown the rating used for anomaly detection
rating_for_detection = "3" #@param [1, 2, 3, 4, 5]

# The selected value is a string literal; convert it for the numeric
# comparison against the rating column.
rating_for_detection = int(rating_for_detection)

# Columns kept for every selected review, in the order later code indexes them.
review_columns = ['review_id', 'sentiment_score', 'sentiment_magnitude']

rating_reviews = []
# The range contains only the zero-based index of the chosen rating, so the
# body runs once and rating_reviews ends up holding a single group.
for i in range(rating_for_detection - 1, rating_for_detection):
    reviews_by_date = crds.sort_values(by = ['date'])
    has_rating = crds['rating'] == i + 1
    has_product = crds['product_id'] == product_id
    # The masks are built on the unsorted frame; .loc matches them to the
    # sorted rows by index label.
    rating_review = reviews_by_date.loc[has_rating & has_product, review_columns]
    rating_reviews.append(rating_review.values.tolist())
    

import numpy as np
from scipy.stats import multivariate_normal
from scipy.stats import norm


# review_ids whose sentiment lies inside the accepted density region.
washed_reviews = []

#@markdown percentage used for normal distribution ppf
percentage_for_distribution = 1 #@param {type:"slider", min:0.01, max:1, step:0.01}


#@markdown update the is_anomaly field of the datasheet and download
download_updated_datasheet = True #@param {type:"boolean"}

for single_rating_reviews in rating_reviews:

    # An empty selection cannot be sliced or fitted; fail with an explicit
    # message instead of an opaque IndexError from the column slice below.
    if len(single_rating_reviews) == 0:
        raise ValueError("no reviews match the selected product_id and rating; "
                         "cannot fit a sentiment distribution")

    single_rating_array = np.array(single_rating_reviews)
    # Columns 1 and 2 of each row are sentiment_score and sentiment_magnitude.
    single_sentiment_array = single_rating_array[:, [1, 2]]
    sentiment_values = single_sentiment_array.astype(float)
    mean_score_magnitude = np.mean(sentiment_values, axis = 0)
    std_score_magnitude = np.std(sentiment_values, axis = 0)

    mu_score = mean_score_magnitude[0]
    std_score = std_score_magnitude[0]
    variance_score = std_score ** 2

    mu_magnitude = mean_score_magnitude[1]
    std_magnitude = std_score_magnitude[1]
    variance_magnitude = std_magnitude ** 2

    # Score and magnitude are modelled as independent (diagonal covariance).
    distribution = multivariate_normal([mu_score, mu_magnitude], [[variance_score, 0], [0, variance_magnitude]])

    # Score value that bounds the central `percentage_for_distribution` share
    # of the score marginal on its upper side.
    distribution_interval_score = norm.ppf((1 + percentage_for_distribution) / 2, loc = mu_score, scale = std_score)

    # At 100 % the ppf is +inf. Evaluating the joint pdf at an infinite
    # coordinate can yield NaN (inf * 0 inside the computation), and every
    # comparison against NaN is False, which would reject all reviews. The
    # density at infinity is 0, so use that as the boundary instead.
    if np.isfinite(distribution_interval_score):
        distribution_height_boundary = distribution.pdf([distribution_interval_score, mu_magnitude])
    else:
        distribution_height_boundary = 0.0

    # Keep the reviews whose density is at least the boundary density; the
    # boundary is computed once above rather than once per review.
    washed_reviews.extend(
        single_review[0]
        for single_review in single_rating_reviews
        if distribution_height_boundary <= distribution.pdf([single_review[1], single_review[2]])
    )
            

# Only reviews that went through the detection above may be labelled. Without
# this restriction, reviews of the same product with a different rating would
# be written as anomalies although they were never examined.
examined_reviews = [review[0] for reviews in rating_reviews for review in reviews]

is_selected_product = customer_reviews['product_id'] == product_id
is_examined = customer_reviews['review_id'].isin(examined_reviews)
is_washed = customer_reviews['review_id'].isin(washed_reviews)

# Examined reviews inside the accepted region are normal, the rest anomalous;
# rows that were not examined keep whatever is_anomaly value they had.
customer_reviews.loc[is_selected_product & is_examined & is_washed, 'is_anomaly'] = False
customer_reviews.loc[is_selected_product & is_examined & ~is_washed, 'is_anomaly'] = True


if download_updated_datasheet:
  # Write the updated sheet to the working directory, then hand it to the
  # browser through the Colab download helper.
  customer_reviews.to_csv("customer_reviews.csv", index = False, header = True)
  from google.colab import files
  files.download('customer_reviews.csv')

In [0]:
#@markdown # plotting

#@markdown axis ranges
score_axis_range_max = 1 #@param {type: "slider", min: 0, max: 1, step: 0.1}
# The score axis is symmetric around zero.
score_axis_range_min = -score_axis_range_max

magnitude_axis_range_min = 0
magnitude_axis_range_max = 15 #@param {type: "slider", min: 0, max: 99, step: 0.1}

density_axis_range_min = 0
density_axis_range_max = 0.4 #@param {type: "slider", min: 0, max: 1, step: 0.01}

# 500 evenly spaced samples along each of the two horizontal axes.
x = np.linspace(score_axis_range_min, score_axis_range_max, 500)
y = np.linspace(magnitude_axis_range_min, magnitude_axis_range_max, 500)
X, Y = np.meshgrid(x, y)

# Stack both coordinate grids on a trailing axis so that pos[r, c] is the
# (score, magnitude) pair of grid cell (r, c).
pos = np.stack((X, Y), axis = -1)


import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
import matplotlib.cm as cm
import matplotlib.ticker as ticker


fig = plt.figure(figsize=(16, 12))
# Figure.gca() no longer accepts keyword arguments in current Matplotlib
# releases; add_subplot is the supported way to request a 3D projection.
ax = fig.add_subplot(111, projection = '3d')


#@markdown the plot title and axis labels
plot_title = "normal distribution" #@param {type:"string"}
score_label = "Sentiment Score" #@param {type:"string"}
magnitude_label = "Sentiment Magnitude" #@param {type:"string"}
density_label = "Probability Density" #@param {type:"string"}

ax.set_title(plot_title)
ax.set_xlabel(score_label)
ax.set_ylabel(magnitude_label)
ax.set_zlabel(density_label)

# Major ticks every 0.1 on the score axis and every 1 on the magnitude axis.
ax.xaxis.set_major_locator(ticker.MultipleLocator(0.10))
ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

ax.tick_params(axis = 'both', which = 'major', labelsize = 6)
ax.tick_params(axis = 'both', which = 'minor', labelsize = 6)
ax.set_xlim(score_axis_range_min, score_axis_range_max)
ax.set_ylim(magnitude_axis_range_min, magnitude_axis_range_max)
ax.set_zlim(density_axis_range_min, density_axis_range_max)

# Evaluate the fitted density on the grid once and reuse it for the contour,
# the colour mapping and the surface.
density_grid = distribution.pdf(pos)

# Single contour line at the boundary density, drawn at that same height.
ax.contour(X, Y, density_grid, levels = [distribution_height_boundary],
           linewidths = 3, colors = '#ff0077', linestyles = "solid",
           offset = distribution_height_boundary)

# Named color_norm so that scipy.stats.norm, imported under the name `norm`
# in the data preparation cell, is not overwritten in the shared namespace.
color_norm = plt.Normalize(density_grid.min(), density_grid.max())
colors = cm.plasma(color_norm(density_grid))

surface = ax.plot_surface(X, Y, density_grid, rcount = 30, ccount = 40,
                          facecolors = colors, shade = False, linewidth = 0.8)
# Face colour is then set to RGBA (1, 1, 1, 0), i.e. fully transparent faces.
surface.set_facecolors((1, 1, 1, 0))


#@markdown mark a point on the graph; drag to max values to hide the point
mark_point_score = 1 #@param {type: "slider", min: -1, max: 1, step: 0.1}
mark_point_magnitude = 99 #@param {type: "slider", min: 0, max: 99, step: 0.1}

# Density of the fitted distribution at the marked coordinates; used as the
# height of the marker.
marked_coordinates = [mark_point_score, mark_point_magnitude]
mark_point_distribution = distribution.pdf(marked_coordinates)

# The marker is suppressed as soon as either slider sits at its maximum
# (score 1 or magnitude 99).
point_is_hidden = mark_point_score >= 1 or mark_point_magnitude >= 99
if not point_is_hidden:
  ax.scatter(
      np.array([mark_point_score]),
      np.array([mark_point_magnitude]),
      np.array([mark_point_distribution]),
      c = '#ff0077',
      marker = 'o',
      s = 30,
      depthshade = False,
  )


#@markdown the visibility of the tickers
ticker_visibility = True #@param {type:"boolean"}

if not ticker_visibility:
  # Hide the tick labels of all three axes, one axis at a time.
  for get_tick_labels in (ax.get_xticklabels, ax.get_yticklabels, ax.get_zticklabels):
    plt.setp(get_tick_labels(), visible = False)


#@markdown angle x-y
view_angle_xy = 50 #@param {type: "slider", min: 0, max: 90, step: 0.1}
#@markdown angle z
view_angle_z = 150 #@param {type: "slider", min: 0, max: 360, step: 1}

# view_init takes the elevation first and the azimuth second.
ax.view_init(elev = view_angle_xy, azim = view_angle_z)

plt.show()