# 협업 필터링(Collaborative Filtering)

Surprise 패키지를 활용한 협업 필터링

In [1]:
import pandas as pd

c:\Users\82107\anaconda3\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
c:\Users\82107\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


## 데이터 불러오기

In [7]:
table_df = pd.read_csv('../../data/ulsan_attraction_table.csv')
table_df

Unnamed: 0,place_id,u_id,user_id,score,comment,p_id
0,가지산 입석대,0,김호영(황소바위),5,,248
1,가지산 입석대,1,기회란,4,,248
2,가지산 입석대,2,손영진(산동무),5,,248
3,가지산,0,김호영(황소바위),5,,16
4,가지산,3,여름햇살,5,계단과 가파른 길이 많아 힘은 들지만 정상에서 바라보는 영남알프스의 풍경 값으로는 ...,16
...,...,...,...,...,...,...
15400,해파랑길 4코스,8870,👏👏👏,5,중간중간 길 표시가 잘안되어있어요ㅠㅠ 동해 해안가 따라 걷는 길 좋아요,130
15401,해파랑길 7코스,8877,김봉수,3,"해파랑길7코스, 상행길의 마지막부분인 아신길 코스가 있다. 아산길은 자전거길과 도보...",124
15402,해파랑길 8코스,8877,김봉수,5,울산지역 해파랑길 코스중 가장 좋은 코스입니다. 특히 염포산의 벗꽃은 정말 매력적입...,156
15403,해파랑길 9코스,8877,김봉수,4,현대중공업 인근의 도심과 봉대산을 거쳐 해안가를 걷는 코스입니다. 울산지역은 이정표...,199


## 데이터 전처리

In [8]:
print(table_df.isnull().sum())

table_df.dropna(axis=0, how='any', subset=['u_id', 'p_id'], inplace=True)

place_id       0
u_id           0
user_id        0
score          0
comment     4857
p_id           0
dtype: int64


In [9]:
table_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15405 entries, 0 to 15404
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   place_id  15405 non-null  object
 1   u_id      15405 non-null  int64 
 2   user_id   15405 non-null  object
 3   score     15405 non-null  int64 
 4   comment   10548 non-null  object
 5   p_id      15405 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 722.2+ KB


In [5]:
# CountStatus = pd.value_counts(review_df['user_id'], sort=True)
# CountStatus.plot.bar()

In [10]:
data_df = table_df[['p_id', 'u_id', 'score']]
data_df.tail(10)

Unnamed: 0,p_id,u_id,score
15395,308,7877,5
15396,81,8065,1
15397,203,8258,5
15398,403,8502,2
15399,223,8869,5
15400,130,8870,5
15401,124,8877,3
15402,156,8877,5
15403,199,8877,4
15404,288,8878,4


In [7]:
# import numpy as np

# n_places = np.max(data_df['p_id'])
# n_users = np.max(data_df['u_id'])
# shape = (n_users+1, n_places+1)
# shape

In [8]:
# adj_matrix = np.ndarray(shape, dtype=int)
# for _, row in data_df.iterrows():
#     adj_matrix[row['u_id']][row['p_id']] = row['score']

# adj_matrix

## Surprise 라이브러리 활용

In [11]:
from surprise import KNNWithMeans, SVD, SVDpp, NMF
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import cross_validate, train_test_split

In [12]:
reader = Reader(rating_scale=(0.0, 5.0))
data = Dataset.load_from_df(data_df[['u_id', 'p_id', 'score']], reader)

In [13]:
train, test = train_test_split(data, test_size=0.2, shuffle=True, random_state=42)

## Hit Rate 계산 함수

일반적인 hit_rate 계산과는 다르게 User의 평가가 3.5이상인 Item만 Hit 판단

In [14]:
def calc_hit(model, u_id_list, data_df, top_n):
    hit_list = []
    for u_id in u_id_list:
        hit_count = 0
        p_id_list = list(set(data_df['p_id'][data_df['u_id']==u_id]))
        pred_df = pd.DataFrame(columns=['u_id', 'p_id', 'score'])
        for p_id in p_id_list:
            pred = model.predict(u_id, p_id, None)
            new_data = pd.DataFrame({'u_id': [pred.uid], 'p_id': [pred.iid], 'score': [pred.est]})
            pred_df = pd.concat([pred_df, new_data], axis=0, names=['u_id', 'p_id', 'rating'], ignore_index=True)
            recom_df = pred_df.sort_values(by=['score'], ascending=False)[:top_n]
            actual_df = data_df[data_df.u_id == u_id].sort_values(by=['score'], ascending=False)
            actual_df = actual_df[actual_df.score>=3.5][:top_n]
            
            
        for item in set(recom_df['p_id']):
            if int(item) in list(set(actual_df['p_id'])):
                hit_count += 1
                    
        hit_list.append(hit_count/top_n)
    return sum(hit_list)/len(u_id_list)

## GridSearch 활용한 최적 HP 탐색

### KNN

In [15]:
from surprise.model_selection import GridSearchCV

params = {'k': range(5, 30, 5),	# 이웃 크기
          'sim_options': {'name' : ['cosine']}}

gs = GridSearchCV(KNNWithMeans, params, measures=['rmse'], cv=5)
gs.fit(data)


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing th

In [16]:
print('최적의 조합', gs.best_params['rmse'])
print('최적의 조합의 RMSE', gs.best_score['rmse'])

최적의 조합 {'k': 15, 'sim_options': {'name': 'cosine', 'user_based': True}}
최적의 조합의 RMSE 0.9557500433159027


In [17]:
model = KNNWithMeans(k=15)
model.fit(train)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x2d6d92549a0>

In [18]:
pred = model.test(test)
accuracy.rmse(pred)

RMSE: 0.9759


0.9758727995896452

In [20]:
uid_list = [1263, 53, 4282, 8349, 5484, 5231, 6823, 7130, 4422, 3723]

hit_rate = calc_hit(model, uid_list, data_df, 10)
print(hit_rate)

0.13


### SVD

In [21]:
from surprise.model_selection import GridSearchCV
params = {'n_epochs': [20, 40, 60, 100],	# 연산 반복 횟수
          'lr_all': [0.005, 0.008, 0.001],	# 학습률
          'n_factors': [5, 10, 30, 50, 100], # 잠재요인
}

gs = GridSearchCV(SVD, params, measures=['rmse'], cv=5)
gs.fit(data)
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.8708396658086507
{'n_epochs': 100, 'lr_all': 0.008, 'n_factors': 100}


In [22]:
model = SVD(n_epochs=100, lr_all=0.008, n_factors=100)
model.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2d6d9d2f550>

In [23]:
pred = model.test(test)
accuracy.rmse(pred)

RMSE: 0.8872


0.8872441800499213

In [24]:
uid_list = [1263, 53, 4282, 8349, 5484, 5231, 6823, 7130, 4422, 3723]

hit_rate = calc_hit(model, uid_list, data_df, 10)
print(hit_rate)

0.13


### SVD++

In [34]:
params = {'n_epochs': [20, 40, 60, 100],	# 연산 반복 횟수
          'lr_all': [0.005, 0.008, 0.001],	# 학습률
          'n_factors': [5, 10, 30, 50, 100], # 잠재요인
}

gs = GridSearchCV(SVDpp, params, measures=['rmse', 'mae'], cv=5)
gs.fit(data)
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.8833816387119775
{'n_epochs': 100, 'lr_all': 0.008, 'n_factors': 30}


In [35]:
model = SVDpp(n_epochs=100, lr_all=0.008, n_factors=30)
model.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x2d6da0f2c40>

In [36]:
pred = model.test(test)
accuracy.rmse(pred)

RMSE: 0.9032


0.9032079843010333

In [37]:
uid_list = [1263, 53, 4282, 8349, 5484, 5231, 6823, 7130, 4422, 3723]

hit_rate = calc_hit(model, uid_list, data_df, 10)
print(hit_rate)

0.13


### NMF(Non-negative matrix factorization)

In [38]:
params = {'n_epochs': [20, 40, 60, 100],	# 연산 반복 횟수
          'n_factors': [5, 10, 30, 50, 100], # 잠재요인
}

gs = GridSearchCV(NMF, params, measures=['rmse', 'mae'], cv=5)
gs.fit(data)
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.9428859556252357
{'n_epochs': 60, 'n_factors': 50}


In [39]:
model = NMF(n_epochs=60, n_factors=50)
model.fit(train)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x2d6d9d2f940>

In [40]:
pred = model.test(test)
accuracy.rmse(pred)

RMSE: 0.9702


0.9702104846185261

In [41]:
uid_list = [1263, 53, 4282, 8349, 5484, 5231, 6823, 7130, 4422, 3723]

hit_rate = calc_hit(model, uid_list, data_df, 10)
print(hit_rate)

0.13
