In [None]:
# Add the parent directory to the module search path.
# NOTE(review): presumably this is what makes the project-local modules imported
# in the next cell (utils, liwc_data, google_reviews) importable -- confirm.
import sys
sys.path.append("..")

In [None]:
# import default module
import os
import time
import math 
import pickle
import pandas as pd
import numpy as np
import tqdm

# import self module
import utils
import liwc_data
import matplotlib.pyplot as plt
import google_reviews

In [None]:
# define a shorthand for loading a pickle file
def read_pickle(path):
    """Load and return the object stored in a ``.pkl`` file.

    Parameters
    ----------
    path : str, bytes or os.PathLike
        Location of the pickle file. Its name must end with ``.pkl``.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Raises
    ------
    ValueError
        If ``path`` does not end with ``.pkl``.
    """
    # Normalise to text so pathlib.Path (and bytes) paths can be checked too.
    name = os.fsdecode(path)
    if not name.endswith('.pkl'):
        raise ValueError(
            "expected a path ending in '.pkl', got {!r}".format(name))

    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(path, 'rb') as fd:
        return pickle.load(fd)

In [None]:
# origin_data = read_pickle("../../cleaned_data/tainan/1010 Hunan Cuisine.pkl")
# ws_data = read_pickle("../../cleaned_data/tainan/word_segments/1010 Hunan Cuisine.pkl")

In [None]:
# categories = []
# for words in ws_data['ws']:
#     category = set()
#     for word in words:
#         if word not in four_aspect:
#             continue
#         category.update(four_aspect[word])
#     categories.append(category)

In [None]:
# Paths of the LIWC dictionary files to load (relative to the working directory).
# NOTE(review): the second file looks like a project-specific extension of the
# LIWC2015 dictionary -- confirm.
dictionaries = [
    '../liwc_dictionary/LIWC2015.txt',
    '../liwc_dictionary/new_add_liwc_dic.txt',
]

In [None]:
# Build the LIWC lookup object and feed it every file listed in `dictionaries`.
# `liwc_dic` is read as a global by get_review_scores() to look up word categories.
liwc_dic = liwc_data.LingusticsInquiry()
liwc_dic.add_dictionaries(*dictionaries)

In [None]:
def get_review_scores(words, return_details=False):
    """Score a word-segmented review, one score per sentiment segment.

    Each word gets a polarity from ``get_score`` based on its LIWC categories.
    The word sequence is then cut at sentence delimiters and wherever
    ``utils.label_change`` reports a change of the "sentiment-bearing" label,
    and every resulting segment is reduced with ``get_segment_score``.

    Parameters
    ----------
    words : sequence of str
        Word segments of one review. A whitespace-only string is treated as a
        sentence delimiter.
    return_details : bool, optional
        When true, also return the intermediate arrays (see Returns).

    Returns
    -------
    review_scores : list
        One score per segment. Empty when ``words`` is empty.
    details : tuple, only if ``return_details`` is true
        ``(arr, sentence_splits, splits)`` where ``arr`` pairs every word with
        its per-word score, ``sentence_splits`` holds the positions of the
        sentence delimiters and ``splits`` holds all segment boundaries
        (sentence delimiters included).
    """
    # An empty review has nothing to score; without this guard the array
    # operations below fail on empty, float-typed arrays.
    if len(words) == 0:
        if return_details:
            no_splits = np.array([], dtype=int)
            return [], (np.empty((0, 2), dtype=str), no_splits, no_splits.copy())
        return []

    categories = [liwc_dic.get_categories(word) for word in words]
    scores = np.array([get_score(category) for category in categories])

    # assume a string containing space only is a delimiter of sentences
    sentence_masks = np.array([len(word.strip()) == 0 for word in words])
    sentence_splits = np.where(sentence_masks)[0]

    key = 'negate (Negations)'
    negate_masks = np.array([key in category for category in categories])

    # a negation word that carries no polarity of its own counts as -1
    scores[(scores == 0) & negate_masks] = -1

    # per sentence, flag the neutral words that inpaint_scores() bridges
    inpaint_score_masks = [
        inpaint_scores(i) for i in np.split(scores, sentence_splits) if len(i) != 0]
    inpaint_score_masks = np.hstack(inpaint_score_masks)

    # a word belongs to a sentiment run if it negates, has polarity, or is bridged
    labels = negate_masks | (scores != 0) | inpaint_score_masks
    # NOTE(review): label_change presumably returns the positions where
    # `labels` flips -- confirm against utils.
    score_splits = utils.label_change(labels)

    splits = np.hstack([sentence_splits, score_splits])
    splits = np.unique(splits)

    # np.split returns views of `scores`; hand out copies so the reducer
    # cannot overwrite the per-word scores reported in `arr` below.
    review_scores = [
        get_segment_score(segment_scores.copy())
        for segment_scores in np.split(scores, splits)]

    if return_details:
        arr = np.vstack([words, scores]).T
        return review_scores, (arr, sentence_splits, splits)

    return review_scores

def get_review_score(words, return_details=False):
    """Return the overall score of a review: the sum of its segment scores.

    With ``return_details=True`` the details tuple produced by
    ``get_review_scores`` is returned alongside the total.
    """
    segment_scores, details = get_review_scores(words, return_details=True)
    total = np.sum(segment_scores)
    return (total, details) if return_details else total

def review_scores_based_on_sentence(words):
    """Return one score per sentence of a review, each clamped to [-1, 1].

    The segment scores from ``get_review_scores`` are regrouped by sentence,
    summed within each sentence, and the sum is limited to the range -1..1.
    """
    segment_scores, details = get_review_scores(words, return_details=True)
    _, sentence_splits, segment_splits = details

    # Every sentence delimiter is also a segment boundary; its rank among the
    # boundaries (+1) is where the list of segment scores has to be cut.
    cut_points = np.searchsorted(segment_splits, sentence_splits) + 1

    clamped = []
    for chunk in np.split(segment_scores, cut_points):
        sentence_total = np.sum(chunk)
        clamped.append(min(max(sentence_total, -1), 1))
    return clamped

def score_to_star(numerator, denominator):
    """Map a sentiment ratio onto a star rating centred on 3.

    Computes ``2 * numerator / denominator + 3``; when ``|numerator|`` does not
    exceed ``denominator`` the result lies between 1 and 5. A zero numerator
    yields the neutral rating 3 without dividing, so ``denominator`` may be 0
    in that case.
    """
    if numerator != 0:
        return 2 * numerator / denominator + 3
    # neutral review: skip the division (the denominator may be zero here)
    return 2 * numerator + 3

def get_score(categories):
    """Return the polarity of one word given its LIWC categories.

    A positive-emotion category counts +1 and a negative-emotion category
    counts -1; every other category counts 0. Duplicate category names are
    counted once. A word can carry both emotions at the same time, in which
    case they cancel out to 0.
    """
    polarity = {
        'posemo (Positive Emotions)': 1,
        'negemo (Negative Emotions)': -1,
    }

    total = 0
    for name in set(categories):
        total += polarity.get(name, 0)
    return total

# expand scores
def inpaint_scores(nums):
    """Return a boolean mask of positions whose local mean magnitude exceeds 0.6.

    Intended effect (per the original note): in ``[1, 0, 1]`` the neutral
    middle word becomes True because it is surrounded by scored words.
    NOTE(review): the exact window that ``utils.sliding_mean(..., 1)`` uses is
    not visible here -- confirm against utils.
    """
    magnitudes = np.abs(nums)
    local_mean = utils.sliding_mean(magnitudes, 1)
    return local_mean > 0.6

# scores, sentence splits, segment splits(including sentence splits)
def get_segment_score(scores):
    """Reduce the per-word scores of one segment to a single score.

    Neutral words (score 0) are ignored and the remaining scores are
    multiplied together, so a -1 flips the sign of the segment.

    Parameters
    ----------
    scores : array-like of numbers
        Per-word scores of the segment. The input is never modified.

    Returns
    -------
    int or numpy scalar
        ``0`` when the segment is empty or entirely neutral, otherwise the
        product of its non-zero scores.
    """
    values = np.asarray(scores)
    # Multiplying only the non-zero entries equals treating zeros as 1, but
    # does not write into the caller's array (which may be a view).
    informative = values[values != 0]
    if informative.size == 0:
        return 0
    return np.prod(informative)

In [None]:
# senti_star = []
# for n, (reviews, star) in enumerate(zip(ws_data["ws"], ws_data['data'].stars)):
#     scores = review_scores_based_on_sentence(reviews)
#     numerator = np.sum(scores)
#     denominator = len(np.nonzero(scores)[0])
#     estimated_star = round(score_to_star(numerator, denominator), 2)
#     senti_star.append(estimated_star)   

In [None]:
# category = []
# scores = [np.sum(review_scores_based_on_sentence(words)) for words in ws_data['ws']]
# for score in scores:
#     if score >= 1:
#         category.append('postive')
#     if score <= -1:
#         category.append('negative')
#     if score == 0:
#         category.append('neutral')

In [None]:
# Load the word table from Excel; the cell that builds `four_aspect` reads its
# 'word' and 'category' columns.
dic = pd.read_excel("../../dic.xlsx")

In [None]:
# s = pd.read_excel("../../dic.xlsx", sheet_name="label")

In [None]:
# Map every word in `dic` to the set of category values listed for it
# (a word may appear on several rows with different categories).
four_aspect = {}

for word, category in zip(dic['word'], dic['category']):
    four_aspect.setdefault(word, set()).add(category)

In [None]:
# four_aspect = dic.set_index(['word'])["category"].to_dict()

In [None]:
# df = pd.DataFrame()
# df['time'] = origin_data.strftime()
# df['star'] = origin_data.stars
# df['username'] = origin_data.usernames
# df['review'] = origin_data.reviews

In [None]:
# df['senti_star'] = np.nan
# df['category'] = np.nan

In [None]:
# df['food'] = np.nan
# df['service'] = np.nan
# df['atmosphere'] = np.nan
# df['value'] = np.nan

In [None]:
# a = ws_data['data']
# index = np.searchsorted(origin_data.index, a.index)

In [None]:
# df['senti_star'].loc[index] = senti_star
# df['category'].loc[index] = category

In [None]:
# df['food'].loc[index] = [1 if 1 in i else np.nan for i in categories]
# df['service'].loc[index] = [1 if 3 in i else np.nan for i in categories]
# df['atmosphere'].loc[index] = [1 if 2 in i else np.nan for i in categories]
# df['value'].loc[index] = [1 if 4 in i else np.nan for i in categories]

In [None]:
# Input folders: `origin_path` holds the review pickles and `ws_path` the
# word-segmentation pickles, which are looked up under the same file names.
origin_path = '../../cleaned_data/taipei/'
ws_path = '../../cleaned_data/taipei/word_segments/'

In [None]:
# Build one DataFrame per review file (raw fields + estimated sentiment star,
# sentiment label and aspect flags) and stack them into `total`.

# Output column -> category id as stored in `four_aspect`.
ASPECT_CODES = {'food': 1, 'service': 3, 'atmosphere': 2, 'value': 4}

frames = []
# sorted(): os.listdir order is arbitrary, so sort for a reproducible row order
for filename in tqdm.tqdm(sorted(os.listdir(origin_path))):
    if not filename.endswith('.pkl'):
        continue
    origin_data = read_pickle(os.path.join(origin_path, filename))
    ws_data = read_pickle(os.path.join(ws_path, filename))
    a = ws_data['data']
    # Rows of the full review list that have word segments.
    index = np.searchsorted(origin_data.index, a.index)

    senti_star = []   # estimated star rating per segmented review
    category = []     # 'positive' / 'negative' / 'neutral' per segmented review
    categories = []   # set of aspect ids mentioned per segmented review
    for words in ws_data['ws']:
        # Score each review once and derive both the star and the label from it.
        scores = review_scores_based_on_sentence(words)
        numerator = np.sum(scores)
        denominator = np.count_nonzero(scores)
        senti_star.append(round(score_to_star(numerator, denominator), 2))

        # if/elif/else guarantees exactly one label per review
        if numerator > 0:
            category.append('positive')
        elif numerator < 0:
            category.append('negative')
        else:
            category.append('neutral')

        cat = set()
        for word in words:
            if word in four_aspect:
                cat.update(four_aspect[word])
        categories.append(cat)

    df = pd.DataFrame()
    df['time'] = origin_data.strftime()
    df['star'] = origin_data.stars
    df['username'] = origin_data.usernames
    df['review'] = origin_data.reviews
    df['senti_star'] = np.nan
    df['category'] = 'no_review'
    # Single-step .loc assignment: chained forms such as df[col][index] = ...
    # may write to a temporary copy and are ignored under copy-on-write.
    df.loc[index, 'senti_star'] = senti_star
    df.loc[index, 'category'] = category
    for column, code in ASPECT_CODES.items():
        df[column] = np.nan
        df.loc[index, column] = [1 if code in i else np.nan for i in categories]
    # Strip only the trailing extension, not every ".pkl" inside the name.
    df['filename'] = os.path.splitext(filename)[0]
    frames.append(df)

# DataFrame.append was removed in pandas 2.0; concatenate once at the end.
total = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Put the file name first, then the raw review fields, then the derived columns.
column_order = [
    "filename", "time", "star", "username", "review",
    'senti_star', 'category',
    'food', 'service', 'atmosphere', 'value',
]
total = total[column_order]
total

In [None]:
# total.to_csv(os.path.join('../../Google_review_code/result/review_with_sentiments/', 
#                           'taipei_review.csv'), index=False)