In [1]:
import pandas as pd
import numpy as np
import base64
import re

from sklearn.metrics.pairwise import cosine_similarity

import random

In [5]:
def str_to_vec(str):
    """Decode a base85 text encoding back into a float32 NumPy vector.

    Args:
        str: base85-encoded representation of raw float32 bytes, as
            produced by vec_to_str. (The parameter name shadows the
            builtin ``str`` inside this function.)

    Returns:
        A 1-D numpy array of dtype float32 viewing the decoded bytes.
    """
    raw_bytes = base64.b85decode(str)
    return np.frombuffer(raw_bytes, dtype=np.float32)
def vec_to_str(vec):
    """Encode a NumPy array's raw bytes as a base85 text string.

    Args:
        vec: object exposing ``tobytes()`` (e.g. a numpy array).

    Returns:
        The base85 encoding of ``vec.tobytes()``, decoded to ``str``.
    """
    encoded = base64.b85encode(vec.tobytes())
    return encoded.decode()

In [8]:
# Load the importance vectors: the file content is evaluated to an iterable of
# encoded strings, and each one is decoded with str_to_vec.
# NOTE(review): eval() executes whatever Python code the file contains. Only
# use it on trusted files; if the file is a plain literal (e.g. a list of
# strings), ast.literal_eval would be a safer replacement -- confirm the format.
# The `with` block closes the file handle, which was previously left open.
with open('txt/importance_vectors.txt', 'r') as f:
    importance_vectors = list(map(str_to_vec, eval(f.read())))

In [10]:
# Load the category vectors: the file content is evaluated to an iterable of
# encoded strings, and each one is decoded with str_to_vec.
# NOTE(review): eval() executes whatever Python code the file contains. Only
# use it on trusted files; if the file is a plain literal (e.g. a list of
# strings), ast.literal_eval would be a safer replacement -- confirm the format.
# The `with` block closes the file handle, which was previously left open.
with open('txt/category_vectors.txt', 'r') as f:
    category_vectors = list(map(str_to_vec, eval(f.read())))

In [12]:
def calc_user_iv(importance_ratings):
    """Compute the rating-weighted average of the importance vectors.

    ``importance_ratings[i]`` is the weight applied to
    ``importance_vectors[i]``; only the first ``len(importance_ratings)``
    vectors take part. The weighted sum is divided by the sum of the ratings.

    Args:
        importance_ratings: indexable sequence of numeric weights.

    Returns:
        The weighted-average vector.

    Raises:
        ValueError: if the ratings sum to zero. Dividing by zero would
            otherwise produce a NaN/inf vector without raising.
    """
    weighted_sum = importance_vectors[0] * importance_ratings[0]
    for i in range(1, len(importance_ratings)):
        weighted_sum = weighted_sum + importance_vectors[i] * importance_ratings[i]

    rating_total = sum(importance_ratings)
    if rating_total == 0:
        # Fail loudly here instead of handing a NaN vector to later steps.
        raise ValueError(
            "importance_ratings must not sum to zero; "
            "cannot compute a weighted average"
        )
    return weighted_sum / rating_total

In [14]:
def calc_user_cv(category_ratings):
    """Compute the rating-weighted average of the category vectors.

    ``category_ratings[i]`` is the weight applied to ``category_vectors[i]``;
    only the first ``len(category_ratings)`` vectors take part. The weighted
    sum is divided by the sum of the ratings.

    Args:
        category_ratings: indexable sequence of numeric weights.

    Returns:
        The weighted-average vector.

    Raises:
        ValueError: if the ratings sum to zero. Dividing by zero would
            otherwise produce a NaN/inf vector without raising.
    """
    weighted_sum = category_vectors[0] * category_ratings[0]
    for i in range(1, len(category_ratings)):
        weighted_sum = weighted_sum + category_vectors[i] * category_ratings[i]

    rating_total = sum(category_ratings)
    if rating_total == 0:
        # Fail loudly here instead of handing a NaN vector to later steps.
        raise ValueError(
            "category_ratings must not sum to zero; "
            "cannot compute a weighted average"
        )
    return weighted_sum / rating_total

In [16]:
def calc_user_vector(importance_ratings, category_ratings):
    """Blend the importance and category user vectors with equal weight.

    Args:
        importance_ratings: ratings passed to calc_user_iv.
        category_ratings: ratings passed to calc_user_cv.

    Returns:
        0.5 * importance vector + 0.5 * category vector.
    """
    importance_part = calc_user_iv(importance_ratings)
    category_part = calc_user_cv(category_ratings)
    return 0.5*importance_part + 0.5*category_part

In [19]:
# Load the place table. Later cells read its 'place_vector' column (encoded
# vectors, decoded with str_to_vec) and its 'naver_id' column.
place_df = pd.read_csv('data/places_ML.csv')

In [21]:
def read_place_vectors(place_df_vector):
    """Decode every encoded place vector with str_to_vec.

    Args:
        place_df_vector: iterable of encoded vector strings (one per
            document/place).

    Returns:
        A list holding the decoded vector for each entry, in iteration order.
    """
    # One decoded document vector per entry.
    return [str_to_vec(encoded) for encoded in place_df_vector]

In [24]:
def pick_places(user_vector, top_k=10, sample_size=5):
    """Return naver_ids randomly drawn from the places most similar to a user vector.

    Every entry of ``place_df['place_vector']`` is decoded and compared with
    ``user_vector`` by cosine similarity. The ``top_k`` highest-scoring rows
    become candidates, and ``sample_size`` of them are drawn at random without
    replacement.

    Args:
        user_vector: the query vector; it is wrapped as a single-row matrix
            for cosine_similarity.
        top_k: how many of the most similar places to keep as candidates.
        sample_size: how many candidates to draw. Clamped to the number of
            candidates, so a small place table yields fewer ids where it
            previously made random.sample raise ValueError.

    Returns:
        A list of int 'naver_id' values for the drawn places.
    """
    document_embedding_list = read_place_vectors(place_df['place_vector'])
    similarities = cosine_similarity([user_vector], document_embedding_list)[0]

    # Rank row positions by similarity, best first. sorted() is stable, so
    # equally scored places keep their original row order.
    ranked_positions = sorted(
        range(len(similarities)),
        key=lambda pos: similarities[pos],
        reverse=True,
    )

    # Positions of the top_k most similar places.
    candidate_positions = ranked_positions[:top_k]

    # random.sample rejects a sample larger than the population, so clamp.
    chosen_positions = random.sample(
        candidate_positions, min(sample_size, len(candidate_positions))
    )

    # The positions are 0-based offsets into the vector list, so look them up
    # positionally; .loc would only be right when the index labels are 0..n-1.
    naver_ids = place_df['naver_id']
    return [int(naver_ids.iloc[pos]) for pos in chosen_positions]

In [27]:
def test(importance_ratings, category_ratings):
    """Run the whole pipeline: build the user vector, then pick places for it.

    Args:
        importance_ratings: ratings forwarded to calc_user_vector.
        category_ratings: ratings forwarded to calc_user_vector.

    Returns:
        Whatever pick_places returns for the computed user vector.
    """
    user_vector = calc_user_vector(importance_ratings, category_ratings)
    return pick_places(user_vector)

In [31]:
# Example run with 5 importance ratings and 18 category ratings.
# pick_places draws its result with random.sample, so the ids shown below can
# differ between runs unless the random module is seeded first.
test([0,0,0,0,1],[1,3,2,3,2,1,0,0,0,1,2,3,2,1,2,3,2,1])

[19411645, 1110186348, 20597120, 20315029, 1434781920]