# Hybrid Recommandation System 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings
import tensorflow as tf
from keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate, BatchNormalization, Dropout
from keras.models import Model
from keras.optimizers import Adam

## 데이터 불러오기

In [2]:
table_df = pd.read_csv('../../data/ulsan_attraction_table.csv')
table_df

Unnamed: 0,place_id,u_id,user_id,score,comment,p_id
0,가지산 입석대,0,김호영(황소바위),5,,248
1,가지산 입석대,1,기회란,4,,248
2,가지산 입석대,2,손영진(산동무),5,,248
3,가지산,0,김호영(황소바위),5,,16
4,가지산,3,여름햇살,5,계단과 가파른 길이 많아 힘은 들지만 정상에서 바라보는 영남알프스의 풍경 값으로는 ...,16
...,...,...,...,...,...,...
15400,해파랑길 4코스,8870,👏👏👏,5,중간중간 길 표시가 잘안되어있어요ㅠㅠ 동해 해안가 따라 걷는 길 좋아요,130
15401,해파랑길 7코스,8877,김봉수,3,"해파랑길7코스, 상행길의 마지막부분인 아신길 코스가 있다. 아산길은 자전거길과 도보...",124
15402,해파랑길 8코스,8877,김봉수,5,울산지역 해파랑길 코스중 가장 좋은 코스입니다. 특히 염포산의 벗꽃은 정말 매력적입...,156
15403,해파랑길 9코스,8877,김봉수,4,현대중공업 인근의 도심과 봉대산을 거쳐 해안가를 걷는 코스입니다. 울산지역은 이정표...,199


## 데이터 전처리

In [3]:
print(table_df.isnull().sum())
table_df.drop(columns='comment', inplace=True)
table_df.dropna(axis=0, how='any', inplace=True)

place_id       0
u_id           0
user_id        0
score          0
comment     4857
p_id           0
dtype: int64


In [4]:
table_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15405 entries, 0 to 15404
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   place_id  15405 non-null  object
 1   u_id      15405 non-null  int64 
 2   user_id   15405 non-null  object
 3   score     15405 non-null  int64 
 4   p_id      15405 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 601.9+ KB


In [5]:
data_df = table_df[['u_id', 'p_id', 'score']]
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15405 entries, 0 to 15404
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   u_id    15405 non-null  int64
 1   p_id    15405 non-null  int64
 2   score   15405 non-null  int64
dtypes: int64(3)
memory usage: 361.2 KB


In [8]:
data_df.p_id.nunique()

274

In [9]:
data_df.u_id.value_counts()[10:20]

37      16
8       15
455     15
1198    15
836     14
1146    14
892     14
1596    13
56      13
1137    13
Name: u_id, dtype: int64

In [11]:
uid_list = list(data_df.u_id.value_counts()[10:20])

In [12]:
uid_list = [1137, 56, 1596, 892, 1146, 836, 1198, 455, 8, 37]
uid_list = np.array(uid_list)

In [15]:
pid_list = np.array(data_df.u_id.unique())

## 협업필터링

In [16]:
loaded_model = tf.keras.models.load_model("../model/attraction_MLP.h5")

## 컨텐츠기반필터링

In [17]:
from sklearn.metrics.pairwise import cosine_similarity
item_Table = pd.read_csv('../../data/ulsan_attraction_title.csv')
item_Matrix = item_Table.groupby('p_id').mean()
item_Matrix = item_Matrix.to_numpy()
similarity = cosine_similarity(item_Matrix, item_Matrix)
# print('코사인 유사도 연산 결과 :',similarity.shape)

In [18]:
def get_recommendations(p_id, sim=similarity):
    sim_scores = list(enumerate(sim[p_id]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:4]

    return sim_scores

## 협업필터링 Hit Rate

### Hit Rate 계산 함수

In [19]:
def calc_hit(model, uid_list, data_df, top_n):
    hit = []
    # pid_list = np.array(data_df.p_id.unique())
    for u_id in uid_list:
        p_id_list = np.array(list(set(data_df['p_id'][data_df['u_id']==u_id])))
        pred_df = pd.DataFrame(columns=['u_id', 'p_id', 'score'])
        actual_df = data_df[data_df.u_id == u_id].sort_values(by=['score'], ascending=False)
        actual_df = actual_df[actual_df.score>=2.5][:top_n]
            
        for p_id in p_id_list:
            pred = model.predict([np.array([u_id]), np.array([p_id])])
            new_data = pd.DataFrame({'u_id': [u_id], 'p_id': [p_id], 'score': [pred]})
            pred_df = pd.concat([pred_df, new_data], axis=0, names=['u_id', 'p_id', 'rating'], ignore_index=True)
        
        recom_df = pred_df.sort_values(by=['score'], ascending=False)[:top_n]
        for item in set(recom_df['p_id']):
            if int(item) in list(set(actual_df['p_id'])):
                hit.append(1)
            else:
                hit.append(0)
    return np.mean(hit)

### Hit Rate

In [20]:
hit_rate = calc_hit(loaded_model, uid_list, data_df, 10)



In [21]:
hit_rate

0.81

## 하이브리드 추천시스템 Hit Rate

### Hit Rate 계산 함수

기존 협업필터링 값에 컨텐츠기반필터링에서 추천되어지는 목록에 포함되는 장소가 있다면 가중치를 더해줌

In [22]:
def hy_calc_hit(model, uid_list, data_df, top_n, cbf_p_id_candidate):
    hit = []
    # pid_list = np.array(data_df.p_id.unique())
    for u_id in uid_list:
        pred_df = pd.DataFrame(columns=['u_id', 'p_id', 'score'])
        p_id_list = np.array(list(set(data_df['p_id'][data_df['u_id']==u_id])))
        
        actual_df = data_df[data_df.u_id == u_id].sort_values(by=['score'], ascending=False)
        actual_df = actual_df[actual_df.score>=2.5][:top_n]
            
        for p_id in p_id_list:
            pred = model.predict([np.array([u_id]), np.array([p_id])]) #협업필터링
            if p_id in cbf_p_id_candidate: #컨텐츠기반필터링 추천 목록에 포함된다면..
                pred = pred + pred*0.05     # 가중치 연산
            new_data = pd.DataFrame({'u_id': [u_id], 'p_id': [p_id], 'score': [pred]})
            pred_df = pd.concat([pred_df, new_data], axis=0, names=['u_id', 'p_id', 'rating'], ignore_index=True)
        
        recom_df = pred_df.sort_values(by=['score'], ascending=False)[:top_n]    
        for item in set(recom_df['p_id']):
            if int(item) in list(set(actual_df['p_id'])):
                hit.append(1)
            else:
                hit.append(0)
    return np.mean(hit)

### 컨텐츠기반필터링

유저의 기존 방문 목록을 기반으로 컨텐츠기반필터링

In [23]:
for u_id in uid_list:
    cbf_p_id_list = list(data_df[(data_df.u_id==u_id) & (data_df.p_id > 150)].p_id)
    cbf_p_id_candidate = []
    for cbf_p_id in cbf_p_id_list:
        for item in get_recommendations(cbf_p_id):
            cbf_p_id_candidate.append(item[0])
cbf_p_id_candidate = set(cbf_p_id_candidate)

In [24]:
len(cbf_p_id_candidate)

3

### Hit Rate

In [25]:
hit_rate = hy_calc_hit(loaded_model, uid_list, data_df, 10, cbf_p_id_candidate)



In [26]:
hit_rate

0.81