<a href="https://colab.research.google.com/github/DawenZhang/online_review_intelligent_kano/blob/filled/product_review_correlation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#@markdown # product feature selection

from collections import OrderedDict
import numpy as np

# Colab form fields: one product feature per field. Aliases that should be
# treated as the same feature are written in a single field, separated by "/".
product_feature_1 = "filter" #@param {type:"string"}

product_feature_2 = "warranty" #@param {type:"string"}

product_feature_3 = "taste" #@param {type:"string"}

product_feature_4 = "customer service" #@param {type:"string"}

product_feature_5 = "carafe/pitcher" #@param {type:"string"}

product_feature_6 = "travel mug" #@param {type:"string"}

product_feature_7 = "chamber/tank" #@param {type:"string"}

product_feature_8 = "cleaning" #@param {type:"string"}

product_feature_9 = "pump" #@param {type:"string"}

product_feature_10 = "water reservoir" #@param {type:"string"}

# Ordered mapping: raw form value -> list of its "/"-separated aliases.
# The order of the entries follows the order of the form fields above.
product_features = OrderedDict(
    (selected, selected.split("/"))
    for selected in (
        product_feature_1,
        product_feature_2,
        product_feature_3,
        product_feature_4,
        product_feature_5,
        product_feature_6,
        product_feature_7,
        product_feature_8,
        product_feature_9,
        product_feature_10,
    )
)

In [0]:
#@markdown # data preparation

# Flat list of every alias of every product feature, in mapping order.
feature_vocabulary = [
    alias
    for aliases in product_features.values()
    for alias in aliases
]
       
      
def check_features(something, product_features):
    """Return the product features that an entity name refers to.

    Args:
        something: the entity name to look up. Values that are not strings
            (for example NaN read from an empty CSV cell) match nothing.
        product_features: mapping of feature name -> list of alias strings.

    Returns:
        A list of the keys of ``product_features`` that have at least one
        alias equal to ``something``, compared case-insensitively. Keys are
        returned in mapping order and each key appears at most once.
    """
    if not isinstance(something, str):
        return []

    wanted = something.lower()
    # Lowercase the aliases too, so a tag typed with capitals still matches;
    # any() stops at the first hit so a feature is never reported twice.
    return [
        feature
        for feature, aliases in product_features.items()
        if any(wanted == alias.lower() for alias in aliases)
    ]
  
  
#@markdown the product used for anomaly detection
product_id = 1 #@param {type:"integer"}

import pandas as pd

#@markdown the datasheet path; if you wish to upload files, leave them blank
customer_reviews_datasheet_path = "https://github.com/DawenZhang/online_review_intelligent_kano/raw/filled/provided/product_2/customer_reviews.csv" #@param {type:"string"}
entity_sentiment_datasheet_path = "https://github.com/DawenZhang/online_review_intelligent_kano/raw/filled/provided/product_2/entity_sentiment.csv" #@param {type:"string"}

# Read the customer reviews either from the configured path or, when the
# path is blank, from a file uploaded through the Colab upload widget.
if customer_reviews_datasheet_path != "":
  customer_reviews = pd.read_csv(customer_reviews_datasheet_path)
else:
  import io
  from google.colab import files
  uploaded = {}
  # Keep prompting until at least one file has been uploaded.
  while not uploaded:
    print("as the path field is left blank, please upload customer_reviews_datasheet")
    uploaded = files.upload()
  # Only the first uploaded file is read; it is decoded as UTF-8 text.
  customer_reviews = pd.read_csv(io.StringIO(next(iter(uploaded.values())).decode('utf-8')))

# Force the review text to str so later string operations work on every row.
customer_reviews['review_content'] = customer_reviews['review_content'].astype(str)

# Reviews of the selected product only, ordered by review_id.
product_reviews = (
    customer_reviews
    .loc[customer_reviews['product_id'] == product_id, ['review_id', 'review_content', 'rating']]
    .sort_values(by = ['review_id'])
)

# Rewrite each review so that every secondary alias of a feature is replaced
# by that feature's first alias (e.g. "pitcher" -> "carafe" for
# "carafe/pitcher"). The result is one string per row of product_reviews,
# in the same order.
replaced_product_reviews = []
# Read the text by column label: integer keys on a label-indexed row Series
# (the former `review[1]`) are deprecated positional lookups in pandas.
for review_text in product_reviews['review_content']:
    replaced_review = review_text
    for aliases in product_features.values():
        canonical_alias = aliases[0]
        # Features with a single alias have nothing to replace.
        for secondary_alias in aliases[1:]:
            replaced_review = replaced_review.replace(secondary_alias, canonical_alias)
    replaced_product_reviews.append(replaced_review)

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vocabulary: the first alias of each product feature, in mapping
# order, so column k of the matrix belongs to the k-th feature.
tf_features = [aliases[0] for aliases in product_features.values()]

# Uni- and bi-grams are counted so two-word features can be matched.
vectorizer = TfidfVectorizer(vocabulary = tf_features, lowercase=True, ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(replaced_product_reviews)

# feature name -> list of per-review records, filled in by the loop below.
feature_entities = {}


# Read the entity sentiment sheet either from the configured path or, when
# the path is blank, from a file uploaded through the Colab upload widget.
if entity_sentiment_datasheet_path != "":
  entity_sentiment = pd.read_csv(entity_sentiment_datasheet_path)
else:
  import io
  from google.colab import files
  uploaded = {}
  # Keep prompting until at least one file has been uploaded.
  while not uploaded:
    print("as the path field is left blank, please upload entity_sentiment_datasheet")
    uploaded = files.upload()
  # Only the first uploaded file is read; it is decoded as UTF-8 text.
  entity_sentiment = pd.read_csv(io.StringIO(next(iter(uploaded.values())).decode('utf-8')))

# Build feature_entities: for every review and every product feature that
# is mentioned in it, append one record
#     [mean TF-IDF weight, mean sentiment score, summed sentiment magnitude, rating]
# computed over the entities of that review that match the feature.

# Feature names in mapping order; the position of a feature in this list is
# used as its column index in the tfidf matrix.
feature_order = [*product_features]

for review_num in range(len(replaced_product_reviews)):
    # Row of the current review; fields are read by label because integer
    # keys on a label-indexed Series are deprecated positional lookups.
    review_row = product_reviews.iloc[review_num]

    current_all_entities = entity_sentiment.loc[
        (entity_sentiment['review_id'] == review_row['review_id'])
        & (entity_sentiment['product_id'] == product_id),
        ['name', 'sentiment_score', 'sentiment_magnitude']
    ]
    current_entities = {}

    for e_index, entity in current_all_entities.iterrows():

        features = check_features(entity['name'], product_features)

        if len(features) > 0:
            # Entities with zero sentiment magnitude are kept, not skipped.
            # The TF-IDF weight is taken from the first matching feature and
            # shared by every feature the entity matches.
            pending_entity = [
                tfidf[review_num, feature_order.index(features[0])],
                entity['sentiment_score'],
                entity['sentiment_magnitude'],
            ]

            for single_feature in features:
                if single_feature not in current_entities:
                    current_entities[single_feature] = []
                current_entities[single_feature].append(pending_entity)

    for feature_name in current_entities:
        if feature_name not in feature_entities:
            feature_entities[feature_name] = []
        adding_entity = [0, 0, 0, review_row['rating']]
        for meta_feature_entity in current_entities[feature_name]:
            adding_entity[0] += meta_feature_entity[0]
            adding_entity[1] += meta_feature_entity[1]
            adding_entity[2] += meta_feature_entity[2]

        # Weight and score are averaged over the matching entities; the
        # magnitude is left as a sum.
        adding_entity[0] = adding_entity[0] / len(current_entities[feature_name])
        adding_entity[1] = adding_entity[1] / len(current_entities[feature_name])

        feature_entities[feature_name].append(adding_entity)

In [0]:
#@markdown # correlation analysis

from sklearn.metrics import matthews_corrcoef
from scipy.stats import *
correlation_method = "Pearson" #@param ["Pearson", "Spearman"]

from scipy import stats

# One row per feature:
#     [mean positive weighted score, mean negative weighted score (sign flipped),
#      mean weighted magnitude, correlation result, sample size, feature name]
# where "weighted" means multiplied by the record's TF-IDF weight.
features_average = []

for feature_name in feature_entities:
    positive_sum = 0
    positive_count = 0
    negative_sum = 0
    negative_count = 0
    magnitude_sum = 0
    magnitude_count = 0
    scores = []
    ratings = []

    for feature_entity in feature_entities[feature_name]:
        # feature_entity is [tf-idf weight, sentiment score, magnitude, rating].
        weighted_score = feature_entity[1] * feature_entity[0]

        if feature_entity[1] > 0:
            positive_sum += weighted_score
            positive_count += 1
        elif feature_entity[1] < 0:
            negative_sum += -weighted_score
            negative_count += 1
        magnitude_sum += feature_entity[2] * feature_entity[0]
        magnitude_count += 1

        scores.append(weighted_score)
        ratings.append(feature_entity[3])

    # Correlate once per feature, after all samples are collected. Doing it
    # inside the loop recomputed it for every prefix, and the first call had
    # a single sample, which pearsonr rejects with ValueError.
    if len(scores) < 2:
        # A correlation is undefined for fewer than two samples.
        correlation = (float("nan"), float("nan"))
    elif correlation_method == "Pearson":
        correlation = stats.pearsonr(scores, ratings)
    else:
        correlation = stats.spearmanr(scores, ratings)

    features_average.append([
                             0 if positive_count == 0 else positive_sum/positive_count,
                             0 if negative_count == 0 else negative_sum/negative_count,
                             0 if magnitude_count == 0 else magnitude_sum/magnitude_count,
                             correlation,
                             len(scores),
                             feature_name
                            ])
    
# Print the report as four space-separated columns, each left-justified
# and padded to 30 characters.
header_cells = ["feature name", "coefficient", "p value", "evaluation sample size"]
print(*(cell.ljust(30, ' ') for cell in header_cells))
print()
for fa in features_average:
  # fa[3] is the correlation result: index 0 is the coefficient, 1 the p value.
  row_cells = [
      str(fa[5]),
      "{:.20f}".format(fa[3][0]),
      "{:.20f}".format(fa[3][1]),
      str(fa[4]),
  ]
  print(*(cell.ljust(30, ' ') for cell in row_cells))
print()