In [None]:
import sys
# Put the parent directory on the import path — presumably where the local
# `utils` and `liwc_data` modules live (TODO confirm).
sys.path.append("..")

In [None]:
# import default module
import os
import time
import math 
import pickle
import pandas as pd
import numpy as np

# import self module
import utils
import liwc_data
import matplotlib.pyplot as plt

In [None]:
import tqdm

### preprocess

In [None]:
# LIWC dictionary files (relative paths) that get loaded into the lookup object.
dictionaries = [
    '../liwc_dictionary/LIWC2015.txt',
    '../liwc_dictionary/new_add_liwc_dic.txt',
]

In [None]:
# Create the LIWC lookup object and register every file in `dictionaries` with it.
liwc_dic = liwc_data.LingusticsInquiry()
liwc_dic.add_dictionaries(*dictionaries)

In [None]:
# define a shorthand for loading a pickle file
def read_pickle(path):
    """Load and return the object stored in a ``.pkl`` file.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the pickle file; its name must end with ``.pkl``.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Raises
    ------
    ValueError
        If ``path`` does not end with ``.pkl``.
    """
    # Accept pathlib.Path and friends; slicing the raw argument only worked for str.
    path = os.fspath(path)
    if not isinstance(path, str) or not path.endswith('.pkl'):
        raise ValueError("expected a path ending in '.pkl', got %r" % (path,))

    # NOTE: unpickling can execute arbitrary code -- only load files you trust.
    with open(path, 'rb') as fd:
        return pickle.load(fd)

### sample data

In [None]:
# Sample input: a pickled dictionary. The original note lists the keys {ws, pos, ner};
# this notebook itself reads the 'ws' entry (one word list per review) and the 'data' entry.
test_data = read_pickle('../../cleaned_data/taichung/word_segments/Akuan Hot Pot.pkl')

In [None]:
# check_sets = {'posemo (Positive Emotions)', 'negemo (Negative Emotions)', 'negate (Negations)'}

# for wss in test_data['ws']:
#     for ws in wss:
#         categories = get_categories(ws.strip())
#         if len(categories.intersection(check_sets)) > 1:
#             print (ws)
#             print (categories.intersection(check_sets))

#### scoring a sentence

In [None]:
def get_score(categories):
    """Return the net emotion polarity of one word's LIWC categories.

    A positive-emotion category contributes +1 and a negative-emotion category
    contributes -1. A word may carry both at once, in which case they cancel.
    Repeated category names are counted once; every other category is ignored.

    Parameters
    ----------
    categories : iterable of str
        Category names attached to a single word.

    Returns
    -------
    int
        -1, 0 or 1.
    """
    unique_categories = set(categories)

    polarity = 0
    if 'posemo (Positive Emotions)' in unique_categories:
        polarity += 1
    if 'negemo (Negative Emotions)' in unique_categories:
        polarity -= 1
    return polarity

# expand scores
def inpaint_scores(nums):
    """Flag positions whose smoothed absolute score is above 0.6.

    The absolute values of ``nums`` are passed to ``utils.sliding_mean`` with a
    second argument of 1 (presumably a window radius -- TODO confirm against
    ``utils``), and the result is compared element-wise with 0.6.

    Parameters
    ----------
    nums : array-like of numbers
        Word scores of one sentence.

    Returns
    -------
    Boolean result of the ``> 0.6`` comparison.
    """
    magnitudes = np.abs(nums)
    smoothed = utils.sliding_mean(magnitudes, 1)
    # intended effect per the original note: [1, 0, 1] -> [?, True, ?]
    return smoothed > 0.6

# scores, sentence splits, segment splits(including sentence splits)
def get_segment_score(scores):
    """Collapse the word scores of one segment into a single score.

    Zero entries are neutral and are ignored; the remaining entries are
    multiplied together, so an odd number of -1 values yields -1 and an even
    number yields 1. A segment with no non-zero entry (including an empty
    segment) scores 0.

    The input is never modified. This matters because callers pass pieces
    produced by ``np.split``, which are views into the full score array;
    overwriting zeros in place would silently alter that array.

    Parameters
    ----------
    scores : array-like of int
        Word scores of the segment.

    Returns
    -------
    int
        0 when every entry is zero, otherwise the product of the non-zero entries.
    """
    scores = np.asarray(scores)

    # Dropping the zeros is equivalent to replacing them by 1 before the product.
    nonzero_scores = scores[scores != 0]
    if nonzero_scores.size == 0:
        return 0

    return np.prod(nonzero_scores)

In [None]:
# Walk through the scoring steps by hand on the first review of the sample file.
words  = np.r_[test_data['ws'][0]]
# LIWC categories of every word, then one polarity score per word.
categories = [liwc_dic.get_categories(word) for word in words]
scores = np.r_[[get_score(category) for category in categories]]

# Show the words and their scores as two stacked rows.
print (np.vstack([words, scores]))

In [None]:
# Assume that a token consisting only of whitespace is a sentence delimiter.
sentence_masks = np.r_[[len(word.strip()) == 0 for word in words]]
# Indices of those delimiter tokens; they serve as split points for np.split.
sentence_splits = np.where(sentence_masks)[0]

In [None]:
key = 'negate (Negations)'
# True for every word whose categories include the negation key.
negate_masks = np.r_[[key in category for category in categories]]

# A negation word that has no emotion score of its own gets the score -1.
scores[(scores == 0) & negate_masks] = -1

In [None]:
# Apply inpaint_scores to each sentence-sized chunk separately (empty chunks are
# skipped), then join the per-chunk masks back into one flat array.
inpaint_score_masks = [inpaint_scores(i) for i in np.split(scores, sentence_splits) if len(i) != 0]
inpaint_score_masks = np.hstack(inpaint_score_masks)

In [None]:
# A word is labelled True when it is a negation, has a non-zero score,
# or was flagged by the inpaint mask.
labels = negate_masks | (scores != 0) | inpaint_score_masks
# utils.label_change presumably returns the indices where the label flips -- TODO confirm.
score_splits = utils.label_change(labels)

In [None]:
# Merge sentence boundaries and score boundaries into one sorted list of
# split points without duplicates.
splits = np.hstack([sentence_splits, score_splits])
splits = np.unique(splits)

In [None]:
# The sample result: one (word, score) row per token, cut at every split point.
arr = np.vstack([words, scores]).T
np.split(arr, splits)

In [None]:
# Score each segment, then add the segment scores up into one review score.
review_score = [get_segment_score(segment_scores) for segment_scores in np.split(scores, splits)]
review_score = np.sum(review_score)
review_score

#### scoring all sentences

In [None]:
def get_review_scores(words, return_details=False):
    """Score every segment of one tokenised review.

    Steps:

    1. Look up the LIWC categories of each word and turn them into a word
       score with ``get_score``.
    2. Treat whitespace-only tokens as sentence delimiters.
    3. Give negation words without an emotion score the score -1.
    4. Label words that are negations, carry a score, or are flagged by
       ``inpaint_scores``; cut the review where the label changes
       (``utils.label_change``) and at every sentence delimiter.
    5. Score each resulting segment with ``get_segment_score``.

    Parameters
    ----------
    words : sequence of str
        The tokens of one review.
    return_details : bool, optional
        When true, also return the intermediate arrays (see Returns).

    Returns
    -------
    review_scores : list
        One score per segment.
    details : tuple, only when ``return_details`` is true
        ``(arr, sentence_splits, splits)`` where ``arr`` stacks the words and
        their word scores (one row per word), ``sentence_splits`` holds the
        indices of the sentence delimiters and ``splits`` the sorted union of
        sentence and segment boundaries.
    """
    categories = [liwc_dic.get_categories(word) for word in words]
    scores = np.r_[[get_score(category) for category in categories]]

    # assume a string containing space only is a delimiter of sentences
    sentence_masks = np.r_[[len(word.strip()) == 0 for word in words]]
    sentence_splits = np.where(sentence_masks)[0]

    key = 'negate (Negations)'
    negate_masks = np.r_[[key in category for category in categories]]

    # a negation word with no emotion score of its own counts as -1
    scores[(scores == 0) & negate_masks] = -1

    # smooth per sentence; empty chunks are skipped so the pieces add up to len(scores)
    inpaint_score_masks = np.hstack([
        inpaint_scores(chunk)
        for chunk in np.split(scores, sentence_splits)
        if len(chunk) != 0
    ])

    labels = negate_masks | (scores != 0) | inpaint_score_masks
    score_splits = utils.label_change(labels)

    splits = np.unique(np.hstack([sentence_splits, score_splits]))

    # np.split returns views into `scores`. Score copies so that the word scores
    # reported in `arr` below cannot be altered as a side effect of segment scoring.
    review_scores = [
        get_segment_score(segment_scores.copy())
        for segment_scores in np.split(scores, splits)]

    if return_details:
        arr = np.vstack([words, scores]).T
        return review_scores, (arr, sentence_splits, splits)

    return review_scores

def get_review_score(words, return_details=False):
    """Return the sum of all segment scores of one review.

    Parameters
    ----------
    words : sequence of str
        The tokens of one review.
    return_details : bool, optional
        When true, return ``(score, details)`` with the details tuple produced
        by ``get_review_scores``; otherwise return the score alone.
    """
    segment_scores, details = get_review_scores(words, return_details=True)
    total = np.sum(segment_scores)
    return (total, details) if return_details else total

def review_scores_based_on_sentence(words):
    """Return one score per sentence of a review, each limited to [-1, 1].

    The segment scores from ``get_review_scores`` are grouped by sentence,
    summed within each sentence, and the sum is clamped to the range -1..1.

    Parameters
    ----------
    words : sequence of str
        The tokens of one review.

    Returns
    -------
    list
        The clamped score of every sentence, in order.
    """
    segment_scores, details = get_review_scores(words, return_details=True)
    _, sentence_splits, segment_splits = details

    # Position of each sentence delimiter among the segment boundaries, shifted
    # by one to obtain the index of the first segment after that delimiter.
    boundaries = np.searchsorted(segment_splits, sentence_splits) + 1

    sentence_scores = []
    for chunk in np.split(segment_scores, boundaries):
        total = np.sum(chunk)
        sentence_scores.append(min(max(total, -1), 1))
    return sentence_scores

In [None]:
# The second column of `arr` carries the word scores; cut it at `splits` and
# cast each piece to int64 so it can be scored.
score = arr[:, 1]
scores = np.split(score, splits)
scores = [s.astype(np.int64) for s in scores]
# Number of pieces whose segment score is non-zero.
len(np.nonzero(np.array([get_segment_score(s) for s in scores]) != 0)[0])

In [None]:
# Total score of every review in the sample file.
print ([get_review_score(words) for words in test_data['ws']])

In [None]:
# Same per-review total, but built from the clamped per-sentence scores.
print ([np.sum(review_scores_based_on_sentence(words)) for words in test_data['ws']])

In [None]:
# Inspect a single review (index 121) in detail.
score, details = get_review_scores(test_data['ws'][121], return_details=True)
print (score)
# details is (arr, sentence_splits, splits): here `splits` receives the sentence
# boundaries and `seg_splits` the combined sentence + segment boundaries.
arr, splits, seg_splits = details
# Word/score rows grouped by sentence.
np.split(arr, splits)

In [None]:
# Cut the score column of `arr` at the current `splits`, cast each piece to
# int64, and count the pieces whose segment score is non-zero.
score = arr[:, 1]
scores = np.split(score, splits)
scores = [s.astype(np.int64) for s in scores]
len(np.nonzero(np.array([get_segment_score(s) for s in scores]) != 0)[0])

In [None]:
# Per-sentence scores of the first review (note: this rebinds `arr` to a plain list).
arr = review_scores_based_on_sentence(test_data['ws'][0])
# Number of sentences with a non-zero score.
len(np.nonzero(np.array(arr) != 0)[0])

In [None]:
def score_to_star(numerator, denominator):
    """Map a sentiment ratio onto a star rating centred on 3.

    The rating is ``2 * numerator / denominator + 3``. A zero numerator yields
    3 without dividing, so a zero denominator is harmless in that case.

    Parameters
    ----------
    numerator : number
        Sum of the sentence scores.
    denominator : number
        Number of sentences with a non-zero score.

    Returns
    -------
    number
        The estimated star value.
    """
    if numerator != 0:
        return 2 * numerator / denominator + 3
    # nothing to divide: the neutral rating
    return 2 * numerator + 3

In [None]:
def check(reviews):
    """Estimate the star rating of one review and describe each sentence.

    Parameters
    ----------
    reviews : sequence of str
        The tokens of a single review.

    Returns
    -------
    estimated_star : number
        Star value derived from the per-sentence scores via ``score_to_star``.
    sentences : list of tuple
        ``(sentence_score, text)`` pairs, where ``text`` lists the
        ``(word, score)`` rows of that sentence.
    """
    def describe(rows):
        # render every (word, score) row and join the pieces with commas
        return ', '.join('(%s, %s)' % (word, value) for word, value in rows)

    # star estimate: sum of sentence scores over the count of non-zero sentences
    sentence_scores = review_scores_based_on_sentence(reviews)
    nonzero_count = len(np.nonzero(sentence_scores)[0])
    estimated_star = score_to_star(np.sum(sentence_scores), nonzero_count)

    # only the word/score table and the sentence boundaries are needed here
    _, (arr, sentence_splits, _) = get_review_scores(reviews, return_details=True)

    sentence_info = [describe(chunk) for chunk in np.split(arr, sentence_splits)]
    return estimated_star, list(zip(sentence_scores, sentence_info))
    # arrs = np.split(arr, segment_splits)
    # segment_info = [f(i) for i in arrs]

    # I = np.searchsorted(segment_splits, sentence_splits) + 1
    # sentence_info = np.split(segment_info, I)
    # sentence_info = ['  '.join(i.tolist()) for i in sentence_info]
    # for i, j in zip(sentence_scores, sentence_info):
    #     print (i, j)

In [None]:
# Write every review whose estimated star differs from the original star to
# ./test_data, together with its per-sentence breakdown.
# The encoding is fixed to UTF-8 so that writing review text does not depend on
# the platform's default locale encoding.
with open('./test_data', 'w', encoding='utf-8') as fd:

    for n, (reviews, star, raw_review) in enumerate(
            zip(test_data["ws"], test_data['data'].stars, test_data['data'].reviews)):

        e_star, sentence_info = check(reviews)
        if e_star == star:
            # the estimate agrees with the original star: nothing to report
            continue

        lines = ['--- %d ---\n' % n, raw_review + '\n']
        lines.extend('%d => %s\n' % (i, j) for i, j in sentence_info)
        lines.append('estima star: %f\n' % e_star)
        lines.append('origin star: %f\n\n' % star)

        # write into file
        fd.writelines(lines)

In [None]:
# A = []

# for n, (reviews, star) in enumerate(zip(test_data["ws"], test_data['data'].stars)):
    
#     # numerator
#     scores = review_scores_based_on_sentence(reviews)
    
#     numerator = np.sum(scores)
# #     print(numerator)
# #     print(scores)
#     denominator = len(np.nonzero(scores)[0])
#     estimated_star = score_to_star(numerator, denominator)
#     A.append(estimated_star)
    
#     if estimated_star != star:
    
#         print ('--- %d ---' % n)
#         print (reviews)
#         print (scores)
#         print (estimated_star, star)

In [None]:
# Review text of entry 66 in the sample file.
test_data['data'].reviews[66]

In [None]:
# Segment scores and the split details for entry 66.
get_review_scores(test_data['ws'][66], return_details=True)

In [None]:
# NOTE(review): no active cell before this one assigns `A` (only a commented-out
# cell and loops further down do), so this fails on a fresh top-to-bottom run.
# Sort order of the estimated stars, and an x-axis with one position per entry.
I = np.argsort(A)
X = range(len(A))

In [None]:
# Estimated stars (line) and original stars (dots), both ordered by estimated star.
plt.plot(X, np.array(A)[I], X, test_data['data'].stars[I], '.')

In [None]:
# Write one CSV of estimated vs. original stars for every pickle found in `path`.
# NOTE(review): `path` must already be defined when this cell runs; in this
# notebook it is assigned in a later cell. The output folder is named "Tainan"
# regardless of what `path` points to -- confirm this is intended.
for filename in os.listdir(path):
    # Only pickles hold review data. Without this guard an empty frame was
    # written to "<filename>.csv" for every other directory entry.
    if not filename.endswith(".pkl"):
        continue

    data = read_pickle(os.path.join(path, filename))

    # estimated star of every review, rounded to two decimals
    A = []
    for reviews, star in zip(data["ws"], data['data'].stars):
        scores = review_scores_based_on_sentence(reviews)
        numerator = np.sum(scores)
        denominator = len(np.nonzero(scores)[0])
        estimated_star = round(score_to_star(numerator, denominator), 2)
        A.append(estimated_star)

    B = {
        "senti_star": A,
        "time": data["data"].datetime,
        "username": data["data"].usernames,
        "origin_star": data["data"].stars,
        "review": data["data"].reviews,
    }

    df = pd.DataFrame.from_dict(B)
    #df = df[["time", "username", "origin_star", "senti_star", "review"]]
    # the ".pkl" suffix stays in the output name, i.e. "<name>.pkl.csv"
    df.to_csv(os.path.join("../../Google_review_code/result/sentiments/Tainan", filename +".csv"), index=False)

In [None]:
# For every pickle in `path`, print the reviews whose estimated star differs
# from the original star.
for filename in os.listdir(path):
    if not filename.endswith(".pkl"):
        continue

    data = read_pickle(os.path.join(path, filename))
    A = []
    for n, (reviews, star) in enumerate(zip(data["ws"], data['data'].stars)):
        scores = review_scores_based_on_sentence(reviews)
        numerator = np.sum(scores)
        denominator = len(np.nonzero(scores)[0])
        estimated_star = score_to_star(numerator, denominator)
        A.append(estimated_star)

        if estimated_star == star:
            # estimate matches: nothing to show
            continue

        print ('--- %d ---' % n)
        print (reviews)
        print (scores)
        print (estimated_star, star)
            

In [None]:
# total = {}

# for filename in os.listdir(path):
#     three_type = {}
#     pos = []
#     neg = []
#     neu = []
#     if filename.endswith(".pkl"):
#         data = read_pickle(os.path.join(path, filename))
#         scores = [np.sum(review_scores_based_on_sentence(words)) for words in data['ws']]
#         for score in scores:
#             if score >= 1:
#                 pos.append(score)
#             if score <= -1:
#                 neg.append(score)
#             if score == 0:
#                 neu.append(score)
#         three_type["postive"] = len(pos)
#         three_type["negative"] = len(neg)
#         three_type["neutral"] = len(neu)
#         three_type["total"] = len(data["ws"])
#     filename = filename.replace(".pkl", "")
#     total[filename] = three_type

# df = pd.DataFrame.from_dict(total)
# df = df.T
# df.to_csv(os.path.join("../../Google_review_code/result/sentiments/", "tainan_sentiment.csv"))

In [None]:
# Directory that the loops in this notebook scan for .pkl files.
# NOTE(review): cells earlier in the notebook already read `path`, so they only
# work if this cell has been run first.
path = "../../cleaned_data/taichung/word_segments/"

In [None]:
# For every pickle in `path`, write a CSV with the estimated star, a sentiment
# category and the original review fields.
for filename in tqdm.tqdm(os.listdir(path)):

    if not filename.endswith(".pkl"):
        continue

    data = read_pickle(os.path.join(path, filename))
    meta = data['data']

    senti_star = []
    category = []
    # Sentence scores are computed once per review and reused for both the star
    # estimate and the category (previously every review was scored twice).
    for reviews, star in zip(data["ws"], meta.stars):
        sentence_scores = review_scores_based_on_sentence(reviews)
        numerator = np.sum(sentence_scores)
        denominator = len(np.nonzero(sentence_scores)[0])
        senti_star.append(round(score_to_star(numerator, denominator), 2))

        # Exactly one label per review keeps `category` aligned with `senti_star`.
        # NOTE(review): 'postive' is misspelled but ends up in the CSV; kept
        # unchanged so existing outputs stay comparable.
        if numerator >= 1:
            category.append('postive')
        elif numerator <= -1:
            category.append('negative')
        else:
            category.append('neutral')

    rows = list(zip(senti_star, category, meta.strftime(), meta.usernames, meta.stars, meta.reviews))

    df = pd.DataFrame(rows, columns=['senti_star', 'category', 'time', 'username', 'origin_star', 'review'])
    stem = filename.replace(".pkl", "")
    f = os.path.join("../../Google_review_code/result/sentiments/taichung", stem +".csv")
    df.to_csv(f, index=False)

In [None]:
# R = []

# for filename in tqdm.tqdm(os.listdir(path)):
#     if not filename.endswith(".pkl"):
#         continue
    
#     data = read_pickle(os.path.join(path, filename))
    
#     A = []
#     for n, (reviews, star) in enumerate(zip(data["ws"], data['data'].stars)):
#         scores = review_scores_based_on_sentence(reviews)
#         numerator = np.sum(scores)
#         denominator = len(np.nonzero(scores)[0])
#         estimated_star = score_to_star(numerator, denominator)
#         A.append(estimated_star)
    
#     name =  filename.replace('.pkl', '')
#     senti_star = np.mean(A)
#     orgin_star = np.mean(data["data"].stars)
#     R.append((name, senti_star, orgin_star))

In [None]:
# A = pd.DataFrame(R, columns=['filename', 'senti_star', 'origin_star'])
# A['senti_star'] = np.round(A['senti_star'], decimals=2)
# A['origin_star'] = np.round(A['origin_star'], decimals=2)
# A.to_csv(os.path.join("../../Google_review_code/result/sentiments/", "tainan_senti_star.csv"), index=False)