# Hybrid Recommandation System 

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings
import tensorflow as tf
from keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate, BatchNormalization, Dropout
from keras.models import Model
from keras.optimizers import Adam

2023-03-29 11:46:56.456556: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-29 11:46:56.612475: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


## 데이터 불러오기

In [4]:
table_df = pd.read_csv('../data/ulsan_rest_table_ver3.csv')
table_df

Unnamed: 0,p_id,place_id,u_id,user_id,score,comment
0,0,225토마토스트릿,24506,힘내라힘,2,분위기에 비해 맛은 쏘쏘...
1,0,225토마토스트릿,24493,히둥이,5,
2,0,225토마토스트릿,24328,황영하,5,파스타 너무 맛있게 잘 먹었어요 태화동에 맛집이 별로 없어서 아쉬웠는데 맛집을 발견...
3,0,225토마토스트릿,24247,황규현,5,스테이크 부위가 바뀐것같은데 바뀐고기가 훨씬 좋은것 같아요..!.!!! 육향도 좋고...
4,0,225토마토스트릿,24022,호두과자,5,
...,...,...,...,...,...,...
54495,491,효정밥상,497,bbui bbui,4,가성비 좋은 간장게장 집. 비록 가격이 꾸준히 상승하고있긴 하지만 부담스럽진 않은 ...
54496,491,효정밥상,447,b suwan,5,가성비최고 간정게장집이라고 생각해요. 게장 직접담구는 모습도 볼수있고 직접담그는 만...
54497,491,효정밥상,328,Alex Ha,5,가성비는 대박입니다. 솔직히 맛도 좋음. (간장게장 기준)\n\n좀만 더 깨끗한 ...
54498,491,효정밥상,149,0o0o,5,여긴 진짜 간장게장에 진리다 사장님 이제 포장 안해주셔요 참고하세요 진짜 jmt


## 데이터 전처리

In [5]:
print(table_df.isnull().sum())
table_df.drop(columns='comment', inplace=True)
table_df.dropna(axis=0, how='any', inplace=True)

p_id            0
place_id        0
u_id            0
user_id         0
score           0
comment     14275
dtype: int64


In [6]:
table_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54500 entries, 0 to 54499
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   p_id      54500 non-null  int64 
 1   place_id  54500 non-null  object
 2   u_id      54500 non-null  int64 
 3   user_id   54500 non-null  object
 4   score     54500 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 2.1+ MB


In [7]:
data_df = table_df[['u_id', 'p_id', 'score']]
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54500 entries, 0 to 54499
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   u_id    54500 non-null  int64
 1   p_id    54500 non-null  int64
 2   score   54500 non-null  int64
dtypes: int64(3)
memory usage: 1.2 MB


In [8]:
data_df.p_id.nunique()

492

In [9]:
data_df.u_id.value_counts()[10:20]

9433     40
15086    39
23120    39
5484     37
10005    36
4422     36
330      32
13875    32
12142    31
6587     31
Name: u_id, dtype: int64

In [11]:
uid_list = list(data_df.u_id.value_counts()[10:20])

In [15]:
uid_list = [10556, 53, 13875, 8349, 5484, 9601, 6823, 16187, 4422, 12681]
uid_list = np.array(uid_list)

In [16]:
pid_list = np.array(data_df.u_id.unique())

## 협업필터링

In [14]:
loaded_model = tf.keras.models.load_model("./model/MLP.h5")

2023-03-29 11:46:59.637315: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-29 11:47:00.326507: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 46695 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:af:00.0, compute capability: 8.6


## 컨텐츠기반필터링

In [17]:
from sklearn.metrics.pairwise import cosine_similarity
item_Table = pd.read_csv('../data/kmeans_item_Table.csv')
item_Matrix = item_Table.groupby('p_id').mean()
item_Matrix = item_Matrix.to_numpy()
similarity = cosine_similarity(item_Matrix, item_Matrix)
# print('코사인 유사도 연산 결과 :',similarity.shape)

In [18]:
def get_recommendations(p_id, sim=similarity):
    sim_scores = list(enumerate(sim[p_id]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:4]

    return sim_scores

## 협업필터링 Hit Rate

### Hit Rate 계산 함수

In [19]:
def calc_hit(model, uid_list, data_df, top_n):
    hit = []
    # pid_list = np.array(data_df.p_id.unique())
    for u_id in uid_list:
        p_id_list = np.array(list(set(data_df['p_id'][data_df['u_id']==u_id])))
        pred_df = pd.DataFrame(columns=['u_id', 'p_id', 'score'])
        actual_df = data_df[data_df.u_id == u_id].sort_values(by=['score'], ascending=False)
        actual_df = actual_df[actual_df.score>=2.5][:top_n]
            
        for p_id in p_id_list:
            pred = model.predict([np.array([u_id]), np.array([p_id])])
            new_data = pd.DataFrame({'u_id': [u_id], 'p_id': [p_id], 'score': [pred]})
            pred_df = pd.concat([pred_df, new_data], axis=0, names=['u_id', 'p_id', 'rating'], ignore_index=True)
        
        recom_df = pred_df.sort_values(by=['score'], ascending=False)[:top_n]
        for item in set(recom_df['p_id']):
            if int(item) in list(set(actual_df['p_id'])):
                hit.append(1)
            else:
                hit.append(0)
    return np.mean(hit)

### Hit Rate

In [20]:
hit_rate = calc_hit(loaded_model, uid_list, data_df, 10)



2023-03-29 11:47:15.403555: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:630] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.




In [21]:
hit_rate

0.48

## 하이브리드 추천시스템 Hit Rate

### Hit Rate 계산 함수

기존 협업필터링 값에 컨텐츠기반필터링에서 추천되어지는 목록에 포함되는 장소가 있다면 가중치를 더해줌

In [36]:
def hy_calc_hit(model, uid_list, data_df, top_n, cbf_p_id_candidate):
    hit = []
    # pid_list = np.array(data_df.p_id.unique())
    for u_id in uid_list:
        pred_df = pd.DataFrame(columns=['u_id', 'p_id', 'score'])
        p_id_list = np.array(list(set(data_df['p_id'][data_df['u_id']==u_id])))
        
        actual_df = data_df[data_df.u_id == u_id].sort_values(by=['score'], ascending=False)
        actual_df = actual_df[actual_df.score>=2.5][:top_n]
            
        for p_id in p_id_list:
            pred = model.predict([np.array([u_id]), np.array([p_id])]) #협업필터링
            if p_id in cbf_p_id_candidate: #컨텐츠기반필터링 추천 목록에 포함된다면..
                pred = pred + pred*0.05     # 가중치 연산
            new_data = pd.DataFrame({'u_id': [u_id], 'p_id': [p_id], 'score': [pred]})
            pred_df = pd.concat([pred_df, new_data], axis=0, names=['u_id', 'p_id', 'rating'], ignore_index=True)
        
        recom_df = pred_df.sort_values(by=['score'], ascending=False)[:top_n]    
        for item in set(recom_df['p_id']):
            if int(item) in list(set(actual_df['p_id'])):
                hit.append(1)
            else:
                hit.append(0)
    return np.mean(hit)

### 컨텐츠기반필터링

유저의 기존 방문 목록을 기반으로 컨텐츠기반필터링

In [22]:
for u_id in uid_list:
    cbf_p_id_list = list(data_df[(data_df.u_id==u_id) & (data_df.p_id > 150)].p_id)
    cbf_p_id_candidate = []
    for cbf_p_id in cbf_p_id_list:
        for item in get_recommendations(cbf_p_id):
            cbf_p_id_candidate.append(item[0])
cbf_p_id_candidate = set(cbf_p_id_candidate)

In [23]:
len(cbf_p_id_candidate)

98

### Hit Rate

In [37]:
hit_rate = hy_calc_hit(loaded_model, uid_list, data_df, 10, cbf_p_id_candidate)



In [38]:
hit_rate

0.5