In [None]:
import sys
sys.path.append("..")

In [None]:
import os
import csv
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import google_reviews
import utils

In [None]:
# Directory that is scanned further down for the pickled word-segmentation
# results (every "*.pkl" file found in it is loaded).
path = "../../cleaned_data/taichung/word_segments/"

In [None]:
# define function to read pickle
def read_pickle(path):
    """Load and return the object stored in a ``.pkl`` file.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the pickle file; its name must end with ``.pkl``.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Raises
    ------
    ValueError
        If ``path`` does not end with ``.pkl``.
    """
    # Accept pathlib.Path objects as well as plain strings.
    path = os.fspath(path)
    if not path.endswith('.pkl'):
        raise ValueError("expected a '.pkl' file, got {!r}".format(path))
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(path, 'rb') as fd:
        return pickle.load(fd)

In [None]:
# define function to keep only the words whose POS tag is wanted
def clear(ckip):
    """Return a copy of ``ckip`` keeping only words tagged ``Na`` or ``Nb``.

    ``ckip["ws"]`` and ``ckip["pos"]`` are walked in parallel: each entry of
    ``ws`` is a sequence of words and the matching entry of ``pos`` holds the
    tag of every word. Words whose tag is not in the kept set are removed from
    both sequences; the input object itself is not modified.
    """
    wanted_tags = {"Na", "Nb"}
    filtered = ckip.copy()

    kept_words = []
    kept_tags = []
    for words, tags in zip(filtered["ws"], filtered["pos"]):
        # Keep word/tag pairs together so both outputs stay in step.
        pairs = [(word, tag) for word, tag in zip(words, tags) if tag in wanted_tags]
        kept_words.append([word for word, _ in pairs])
        kept_tags.append([tag for _, tag in pairs])

    filtered["ws"] = kept_words
    filtered["pos"] = kept_tags
    return filtered

In [None]:
# define function to merge all word-segment results of one shop
def union_ws_result(ckip):
    """Flatten ``ckip["ws"]`` (a sequence of word sequences) into one list.

    The words are returned in their original order, one list for the whole
    shop.
    """
    return [word for segment in ckip["ws"] for word in segment]

Get analysis data

In [None]:
# Maps each pickle's base name (file name without ".pkl") to the flat list of
# words kept by clear() for that file.
ws_all = {}

# read all pickle files
for filename in os.listdir(path):
    if not filename.endswith(".pkl"):
        continue  # ignore anything that is not a pickle
    ckip = read_pickle(os.path.join(path, filename))
    ckip_ = clear(ckip)
    # merge the per-review word lists and store them under the shop's name
    result = union_ws_result(ckip_)
    ws_all[filename.replace(".pkl", "")] = result

In [None]:
# f = os.path.join("../../Google_review_code/all_data/" + 'Tainan.pkl')
# with open(f, "wb") as fd:
#     pickle.dump(ws_all, fd)

In [None]:
# get the top N keywords (by term frequency)
def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent terms of ``corpus`` with their counts.

    ``corpus`` is an iterable of strings; each string is handed to
    ``CountVectorizer`` as one document. The result is a list of
    ``(term, count)`` tuples sorted by descending count. With ``n=None`` every
    term is returned.
    """
    vectorizer = CountVectorizer()
    term_counts = vectorizer.fit_transform(corpus)
    # Column-wise totals: how often each vocabulary term occurs overall.
    totals = term_counts.sum(axis=0)
    ranked = sorted(
        ((term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

In [None]:
# #save term frequency to csv file
# output_folder = "../../Google_review_code/result/keywords/Tainan/"
# for key in ws_all:
#     common_words = get_top_n_words(ws_all[key], 20)
#     filename = pd.DataFrame(common_words, columns = ['keywords' , 'counts'])
#     filename.to_csv(os.path.join(output_folder, key+".csv"), index=False)

In [None]:
# Turn ws_all into a new dictionary whose values are single space-separated
# strings instead of lists of words.
new_dict = {shop: " ".join(words) for shop, words in ws_all.items()}

In [None]:
# One row per restaurant, indexed by its name, with the joined keyword string.
df = (
    pd.DataFrame(new_dict.items(), columns=["restaurant", "keywords"])
    .set_index('restaurant')
)

In [None]:
df

In [None]:
df.shape

In [None]:
# arr = 1. * cv_mx.toarray()

In [None]:
# A = np.sum(arr[0] * arr, axis=-1) / np.sqrt(np.sum(arr[0] ** 2)) / np.sqrt(np.sum(arr ** 2, axis=-1))

In [None]:
# np.all(np.isclose(A, cosine_sim[0]))

In [None]:
# Build the term-frequency (bag-of-words) matrix from the keyword strings in `df`.
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens of two
# or more word characters, so single-character segments are presumably dropped
# here -- confirm this is intended for this corpus.
cv = CountVectorizer()
# Learn the vocabulary and count each term per row of df["keywords"].
cv_mx = cv.fit_transform(df["keywords"])
# (disabled) cosine similarity computed on the raw term counts only:
# cosine_sim = cosine_similarity(cv_mx, cv_mx)

Adding additional features to the term-count (bag-of-words) matrix

In [None]:
# Review table with sentiment results. The cells below read its 'filename',
# 'star', 'senti_star', 'food', 'service', 'atmosphere' and 'value' columns.
data = pd.read_csv('../../Google_review_code/result/review_with_sentiments/taichung_review.csv')
data

In [None]:
# Split points handed to np.split(data, splits) in the next cell.
# NOTE(review): presumably the row positions where the 'filename' value changes,
# so that every chunk holds the reviews of one restaurant -- confirm against
# utils.label_change.
splits = utils.label_change(data['filename'])

In [None]:
# Per-restaurant aggregates: one entry is appended to every list for each chunk
# of `data` produced by np.split below.
filenames = []
stars = []
senti_stars = []
foods = []
services = []
atmospheres = []
values = []

# NOTE(review): np.split on a DataFrame appears to go through DataFrame.swapaxes,
# which recent pandas releases deprecate -- consider slicing with data.iloc if
# this starts to warn or fail.
for d in np.split(data, splits):
    filename = d['filename'].to_numpy()
    star = d['star'].to_numpy()
    senti_star = d['senti_star'].to_numpy()

    filenames.append(filename[0])
    stars.append(np.mean(star))

    # M marks the reviews that have a sentiment score; N (their count) is the
    # shared denominator for all aspect scores below.
    M = ~np.isnan(senti_star)
    N = int(np.count_nonzero(M))

    # With no scored review the averages are undefined. Store 0.0 rather than
    # NaN/inf, which cosine_similarity would reject later on.
    senti_stars.append(np.mean(senti_star[M]) if N else 0.0)

    for column, bucket in (
        ('food', foods),
        ('service', services),
        ('atmosphere', atmospheres),
        ('value', values),
    ):
        aspect = d[column].to_numpy()
        total = np.sum(aspect[~np.isnan(aspect)])
        bucket.append(1. * total / N if N else 0.0)

In [None]:
# Lookup table: restaurant name -> its row position in `df`.
M = {name: position for position, name in enumerate(df.index)}

In [None]:
# Print the restaurant names of `df` next to the names collected from the review
# table, row by row, so their order can be compared by eye.
# zip() stops at the shorter of the two sequences.
for n, (i, j) in enumerate(zip(df.index, filenames)):
    print(n, i, j)

In [None]:
# Align the per-restaurant aggregates with the row order of `df` (and therefore
# of the term-count matrix) by restaurant name, instead of relying on both
# collections happening to be in the same order.
filenames = np.asarray(filenames)
row_of = {name: row for row, name in enumerate(filenames)}

missing = [name for name in df.index if name not in row_of]
if missing:
    raise KeyError("no review aggregates found for: {}".format(missing))

# I[k] is the position, in the aggregate lists, of the restaurant in df row k.
I = np.array([row_of[name] for name in df.index], dtype=int)

filenames = filenames[I]
stars = np.asarray(stars)[I]
senti_stars = np.asarray(senti_stars)[I]
foods = np.asarray(foods)[I]
services = np.asarray(services)[I]
atmospheres = np.asarray(atmospheres)[I]
values = np.asarray(values)[I]

In [None]:
# Dense float64 copy of the sparse term-count matrix.
cv_mx_ = cv_mx.toarray().astype(np.float64)
cv_mx_

In [None]:
# One row per restaurant, one column per extra feature
# (stars, senti_stars, foods, services, atmospheres, values).
new_add = np.vstack([stars, senti_stars, foods, services, atmospheres, values]).T
# Start again from the sparse matrix instead of from cv_mx_, so that re-running
# this cell does not keep appending another copy of the extra columns.
cv_mx_ = np.hstack([cv_mx.toarray().astype(np.float64), new_add])

In [None]:
# Pairwise cosine similarity between all rows of the combined feature matrix.
cosine_sim = cosine_similarity(cv_mx_)

---

In [None]:
cosine_sim

In [None]:
cosine_sim.shape

In [None]:
# Restaurant names by row position: indices[i] is the name belonging to row i of
# the similarity matrix.
indices = pd.Series(df.index)

def recommendations(name, cosine_similarities=None, top_n=5):
    """Return the names of the restaurants most similar to ``name``.

    Parameters
    ----------
    name : str
        Restaurant to look up; must be one of the values in ``indices``.
    cosine_similarities : array-like, optional
        Square similarity matrix whose rows follow the order of ``indices``.
        Defaults to the current global ``cosine_sim``, looked up at call time
        so a recomputed matrix is picked up without redefining this function.
    top_n : int, optional
        Number of recommendations to return (default 5).

    Returns
    -------
    list
        Up to ``top_n`` restaurant names, most similar first, never including
        ``name`` itself.

    Raises
    ------
    IndexError
        If ``name`` is not a known restaurant.
    """
    if cosine_similarities is None:
        cosine_similarities = cosine_sim

    # Row position of the restaurant that matches the name.
    matches = indices[indices == name].index
    if len(matches) == 0:
        raise IndexError("unknown restaurant: {!r}".format(name))
    idx = matches[0]

    # Remove the restaurant itself explicitly. Relying on it sorting first
    # breaks when another restaurant ties with it on similarity.
    scores = pd.Series(cosine_similarities[idx]).drop(idx)

    # nlargest returns the scores in descending order and resolves ties by
    # first occurrence, so the result is deterministic.
    best_rows = scores.nlargest(top_n).index
    return [indices.iloc[row] for row in best_rows]

In [None]:
# Recommendations for every restaurant: name -> list returned by recommendations().
recommend = {key: recommendations(key) for key in new_dict}

In [None]:
recommend

In [None]:
# Each key of `recommend` (a restaurant) becomes a column; the rows are the
# positions inside its recommendation list.
top_N = pd.DataFrame(recommend)

In [None]:
# Rebuild from `recommend` rather than transposing top_N in place: a second run
# of `top_N = top_N.T` would flip the table back and the column assignment
# below would then fail.
top_N = pd.DataFrame(recommend).T
top_N.columns = ["top_1", "top_2", "top_3", "top_4", "top_5"]

In [None]:
# Write one row per restaurant with its five recommendations.
output_folder2 = "../../Google_review_code/result/recommendations/"
# `index` is only a boolean switch; the header text for the index column has to
# be passed as `index_label`.
top_N.to_csv(os.path.join(output_folder2, "taichung_recommend.csv"), index_label="店名")