# 콘텐츠 기반 추천 시스템

아이템 평점 기반 추천

In [1]:
# Plotting imports for the notebook.
import matplotlib
import matplotlib.pyplot as plt
 
# Ask the inline backend to render figures in 'retina' format.
%config InlineBackend.figure_format = 'retina'
import matplotlib.font_manager as fm
# Path to the NanumBarunGothic TrueType file; assumes the Nanum fonts are
# installed at this system location — TODO confirm on the target machine.
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
# Font properties built from that file at size 8. Not referenced again in the
# cells visible here; pass it explicitly (fontproperties=fontprop) where needed.
fontprop = fm.FontProperties(fname=fontpath, size=8) 

In [2]:
# Set the default font family used by matplotlib.
# NOTE(review): this names 'NanumGothic', while the file loaded into `fontprop`
# is NanumBarunGothic — confirm which family is intended/installed.
plt.rcParams["font.family"] = 'NanumGothic'

# Set the default font size (currently disabled).
#plt.rcParams["font.size"] = 20

## 데이터 전처리

In [3]:
import konlpy
import urllib.request
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# The bundled seaborn styles were renamed to 'seaborn-v0_8-*' in matplotlib 3.6;
# the old 'seaborn-white' name is deprecated there and missing from later
# releases. Prefer the new name when this matplotlib provides it, and fall back
# to the old one on older installations.
_white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(_white_style)

from konlpy.tag import Mecab
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

  plt.style.use('seaborn-white')


In [4]:
import pandas as pd
import numpy as np
# Read the Ulsan attraction review table from disk, then print its
# column/dtype/non-null summary.
data_df = pd.read_csv(filepath_or_buffer='../../data/ulsan_attraction_table.csv')
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15405 entries, 0 to 15404
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   place_id  15405 non-null  object
 1   u_id      15405 non-null  int64 
 2   user_id   15405 non-null  object
 3   score     15405 non-null  int64 
 4   comment   10548 non-null  object
 5   p_id      15405 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 722.2+ KB


In [5]:
# Disabled text cleanup, kept for reference: strip characters outside the listed
# ranges from each comment, then turn comments that became empty into NaN.
# NOTE(review): the character class "ㄱ-하-ㅣ" looks like a typo for "ㄱ-ㅎㅏ-ㅣ" —
# confirm before re-enabling.
# data_df['comment'] = data_df['comment'].str.replace("[^ㄱ-하-ㅣ가-힣 ]", "")
# data_df['comment'].replace('', np.nan, inplace=True)

# Number of rows, followed by the count of missing values in every column.
print(data_df.shape[0])
print(data_df.isna().sum())

15405
place_id       0
u_id           0
user_id        0
score          0
comment     4857
p_id           0
dtype: int64


In [6]:
# print(data_df.isnull().sum())

# Keep only the rows in which no column is missing; rows with at least one
# NaN are removed and the surviving rows keep their original index labels.
data_df = data_df.dropna(axis='index', how='any')

In [7]:
# Print the column summary again to check the frame after rows with missing values were dropped.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10548 entries, 4 to 15404
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   place_id  10548 non-null  object
 1   u_id      10548 non-null  int64 
 2   user_id   10548 non-null  object
 3   score     10548 non-null  int64 
 4   comment   10548 non-null  object
 5   p_id      10548 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 576.8+ KB


In [8]:
# Disabled step, kept for reference: convert empty-string comments to NaN.
# data_df['comment'].replace('', np.nan, inplace=True)

# Row count and per-column missing-value counts for the cleaned frame.
print(data_df.shape[0])
print(data_df.isna().sum())

10548
place_id    0
u_id        0
user_id     0
score       0
comment     0
p_id        0
dtype: int64


## Item Matrix

In [9]:
# Largest user id and largest place id present in the table. Despite the
# names, these are maximum ids rather than counts of distinct users/places.
n_users = data_df['u_id'].max()
n_places = data_df['p_id'].max()

# Ids are used directly as row/column positions, so each axis needs one more
# slot than its largest id. Rows are places, columns are users.
shape = (1 + n_places, 1 + n_users)
print(shape)

(490, 9100)


In [10]:
# Place x user rating matrix. Allocate with np.zeros rather than the low-level
# np.ndarray constructor: the latter leaves memory uninitialized, so any
# (place, user) pair without a review could hold an arbitrary value instead of 0
# and silently distort the cosine similarities computed from this matrix.
item_Matrix = np.zeros(shape, dtype=float)

# Row = place id, column = user id. If the same (place, user) pair appears more
# than once, the score from the last matching row is the one that is kept.
for p_id, u_id, rating in zip(data_df.p_id, data_df.u_id, data_df.score):
    item_Matrix[p_id, u_id] = rating
item_Matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Cosine Similarity

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
# Pairwise cosine similarity between the rows (places) of the rating matrix.
# With a single argument, cosine_similarity compares the matrix against itself.
similarity = cosine_similarity(item_Matrix)
print('코사인 유사도 연산 결과 :', similarity.shape)

코사인 유사도 연산 결과 : (490, 490)


In [13]:
def get_recommendations(p_id, sim=None, top_n=10):
    """Return the items most similar to ``p_id``, best match first.

    Parameters
    ----------
    p_id : int
        Row index of the query item in ``sim``.
    sim : 2-D indexable of similarity scores, optional
        ``sim[i][j]`` is the similarity between items ``i`` and ``j``.
        When omitted, the notebook-level ``similarity`` matrix is looked up at
        call time, so re-running the similarity cell is picked up without
        having to re-run this definition.
    top_n : int, default 10
        Maximum number of neighbours to return.

    Returns
    -------
    list of (int, float)
        ``(item_index, similarity)`` pairs sorted by descending similarity.
        Items with equal similarity keep ascending index order. The query item
        itself is never included.
    """
    if sim is None:
        sim = similarity

    # Drop the query item by index. Slicing off the first sorted entry is not
    # reliable: when another item ties with it at the top score, or when the
    # query row is all zeros, the query item is not guaranteed to sort first.
    candidates = [
        (idx, score) for idx, score in enumerate(sim[p_id]) if idx != p_id
    ]

    # Stable sort, highest similarity first.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    return candidates[:top_n]

In [14]:
# Example query: show the recommendation list for the place with p_id 400.
get_recommendations(400)

[(235, 0.14294833797837264),
 (147, 0.04700901061054319),
 (116, 0.029159104504526554),
 (328, 0.019153292760879636),
 (27, 0.014881545566625427),
 (150, 0.014287347218712214),
 (6, 0.013885996274281054),
 (219, 0.012133920952907254),
 (0, 0.0),
 (1, 0.0)]

In [15]:
# Show every review row that belongs to the place with p_id 400.
data_df.loc[data_df['p_id'] == 400]

Unnamed: 0,place_id,u_id,user_id,score,comment,p_id
15016,다운동고분군,1541,생활다락방,1,여기가 왜 고분군인지 막상 보면 모르겠습니다. 흔적도 없이 밭만 무성한 그런 곳인데...,400
15017,다운동고분군,1542,세은,3,"저희집 근처인데...하.하.하 정말 그냥 산 인줄알았어요. 하하\r\n가끔 새소리,...",400
15018,다운동고분군,1543,김재화,1,관리가 되지않아서 보기싫어요,400
15019,다운동고분군,1544,강영송,4,옛것이 사라져가니 안타카울뿐이죠,400


In [16]:
# Show every review row written by the user with u_id 987.
data_df.loc[data_df['u_id'] == 987]

Unnamed: 0,place_id,u_id,user_id,score,comment,p_id
730,강동화암주상절리,987,HoLlOlLoLlU,5,작은 주상절리가 맵댜 캠핑하기 좋은듯,44
