# 협업 필터링(Collaborative Filtering)

Surprise 패키지를 활용한 협업 필터링

In [1]:
import pandas as pd

## 데이터 불러오기

In [2]:
table_df = pd.read_csv('../data/ulsan_rest_table_ver3.csv')
table_df

Unnamed: 0,p_id,place_id,u_id,user_id,score,comment
0,0,225토마토스트릿,24506,힘내라힘,2,분위기에 비해 맛은 쏘쏘...
1,0,225토마토스트릿,24493,히둥이,5,
2,0,225토마토스트릿,24328,황영하,5,파스타 너무 맛있게 잘 먹었어요 태화동에 맛집이 별로 없어서 아쉬웠는데 맛집을 발견...
3,0,225토마토스트릿,24247,황규현,5,스테이크 부위가 바뀐것같은데 바뀐고기가 훨씬 좋은것 같아요..!.!!! 육향도 좋고...
4,0,225토마토스트릿,24022,호두과자,5,
...,...,...,...,...,...,...
54495,491,효정밥상,497,bbui bbui,4,가성비 좋은 간장게장 집. 비록 가격이 꾸준히 상승하고있긴 하지만 부담스럽진 않은 ...
54496,491,효정밥상,447,b suwan,5,가성비최고 간정게장집이라고 생각해요. 게장 직접담구는 모습도 볼수있고 직접담그는 만...
54497,491,효정밥상,328,Alex Ha,5,가성비는 대박입니다. 솔직히 맛도 좋음. (간장게장 기준)\n\n좀만 더 깨끗한 ...
54498,491,효정밥상,149,0o0o,5,여긴 진짜 간장게장에 진리다 사장님 이제 포장 안해주셔요 참고하세요 진짜 jmt


## 데이터 전처리

In [3]:
print(table_df.isnull().sum())

table_df.dropna(axis=0, how='any', subset=['u_id', 'p_id'], inplace=True)

p_id            0
place_id        0
u_id            0
user_id         0
score           0
comment     14275
dtype: int64


In [4]:
table_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54500 entries, 0 to 54499
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   p_id      54500 non-null  int64 
 1   place_id  54500 non-null  object
 2   u_id      54500 non-null  int64 
 3   user_id   54500 non-null  object
 4   score     54500 non-null  int64 
 5   comment   40225 non-null  object
dtypes: int64(3), object(3)
memory usage: 2.5+ MB


In [5]:
# CountStatus = pd.value_counts(review_df['user_id'], sort=True)
# CountStatus.plot.bar()

In [6]:
data_df = table_df[['p_id', 'u_id', 'score']]
data_df.tail(10)

Unnamed: 0,p_id,u_id,score
54490,491,764,4
54491,491,731,4
54492,491,704,1
54493,491,555,5
54494,491,549,4
54495,491,497,4
54496,491,447,5
54497,491,328,5
54498,491,149,5
54499,491,53,4


In [7]:
# import numpy as np

# n_places = np.max(data_df['p_id'])
# n_users = np.max(data_df['u_id'])
# shape = (n_users+1, n_places+1)
# shape

In [8]:
# adj_matrix = np.ndarray(shape, dtype=int)
# for _, row in data_df.iterrows():
#     adj_matrix[row['u_id']][row['p_id']] = row['score']

# adj_matrix

## Surprise 라이브러리 활용

In [9]:
from surprise import KNNWithMeans, SVD, SVDpp, NMF
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import cross_validate, train_test_split

In [10]:
reader = Reader(rating_scale=(0.0, 5.0))
data = Dataset.load_from_df(data_df[['u_id', 'p_id', 'score']], reader)

In [11]:
train, test = train_test_split(data, test_size=0.2, shuffle=True, random_state=42)

## Hit Rate 계산 함수

일반적인 hit_rate 계산과는 다르게 User의 평가가 3.5이상인 Item만 Hit 판단

In [103]:
def calc_hit(model, u_id_list, data_df, top_n):
    hit_list = []
    for u_id in u_id_list:
        hit_count = 0
        p_id_list = list(set(data_df['p_id'][data_df['u_id']==u_id]))
        pred_df = pd.DataFrame(columns=['u_id', 'p_id', 'score'])
        for p_id in p_id_list:
            pred = model.predict(u_id, p_id, None)
            new_data = pd.DataFrame({'u_id': [pred.uid], 'p_id': [pred.iid], 'score': [pred.est]})
            pred_df = pd.concat([pred_df, new_data], axis=0, names=['u_id', 'p_id', 'rating'], ignore_index=True)
            recom_df = pred_df.sort_values(by=['score'], ascending=False)[:top_n]
            actual_df = data_df[data_df.u_id == u_id].sort_values(by=['score'], ascending=False)
            actual_df = actual_df[actual_df.score>=3.5][:top_n]
            
            
        for item in set(recom_df['p_id']):
            if int(item) in list(set(actual_df['p_id'])):
                hit_count += 1
                    
        hit_list.append(hit_count/top_n)
    return sum(hit_list)/len(u_id_list)

## GridSearch 활용한 최적 HP 탐색

### KNN

In [12]:
from surprise.model_selection import GridSearchCV

params = {'k': range(5, 30, 5),	# 이웃 크기
          'sim_options': {'name' : ['cosine']}}

gs = GridSearchCV(KNNWithMeans, params, measures=['rmse'], cv=5)
gs.fit(data)


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing th

In [13]:
print('최적의 조합', gs.best_params['rmse'])
print('최적의 조합의 RMSE', gs.best_score['rmse'])

최적의 조합 {'k': 25, 'sim_options': {'name': 'cosine', 'user_based': True}}
최적의 조합의 RMSE 1.1644049210178153


In [104]:
model = KNNWithMeans(k=25)
model.fit(train)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f2cd14d8b20>

In [105]:
pred = model.test(test)
accuracy.rmse(pred)

RMSE: 1.1443


1.144272038506518

In [106]:
uid_list = [10556, 53, 13875, 8349, 5484, 9601, 6823, 16187, 4422, 12681]

hit_rate = calc_hit(model, uid_list, data_df, 10)
print(hit_rate)

0.37


### SVD

In [12]:
from surprise.model_selection import GridSearchCV
params = {'n_epochs': [20, 40, 60, 100],	# 연산 반복 횟수
          'lr_all': [0.005, 0.008, 0.001],	# 학습률
          'n_factors': [5, 10, 30, 50, 100], # 잠재요인
}

gs = GridSearchCV(SVD, params, measures=['rmse'], cv=5)
gs.fit(data)
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

1.0419677036015726
{'n_epochs': 100, 'lr_all': 0.008, 'n_factors': 100}


In [107]:
model = SVD(n_epochs=100, lr_all=0.008, n_factors=100)
model.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f2cd20f9b20>

In [108]:
pred = model.test(test)
accuracy.rmse(pred)

RMSE: 1.0263


1.0263439765111317

In [109]:
uid_list = [10556, 53, 13875, 8349, 5484, 9601, 6823, 16187, 4422, 12681]

hit_rate = calc_hit(model, uid_list, data_df, 10)
print(hit_rate)

0.44000000000000006


### SVD++

In [19]:
params = {'n_epochs': [20, 40, 60, 100],	# 연산 반복 횟수
          'lr_all': [0.005, 0.008, 0.001],	# 학습률
          'n_factors': [5, 10, 30, 50, 100], # 잠재요인
}

gs = GridSearchCV(SVDpp, params, measures=['rmse', 'mae'], cv=5)
gs.fit(data)
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

1.047925049183156
{'n_epochs': 100, 'lr_all': 0.008, 'n_factors': 100}


In [115]:
model = SVDpp(n_epochs=100, lr_all=0.008, n_factors=100)
model.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7f2cd1497760>

In [116]:
pred = model.test(test)
accuracy.rmse(pred)

RMSE: 1.0353


1.035329499891297

In [117]:
uid_list = [10556, 53, 13875, 8349, 5484, 9601, 6823, 16187, 4422, 12681]

hit_rate = calc_hit(model, uid_list, data_df, 10)
print(hit_rate)

0.43000000000000005


### NMF(Non-negative matrix factorization)

In [22]:
params = {'n_epochs': [20, 40, 60, 100],	# 연산 반복 횟수
          'n_factors': [5, 10, 30, 50, 100], # 잠재요인
}

gs = GridSearchCV(NMF, params, measures=['rmse', 'mae'], cv=5)
gs.fit(data)
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

1.1423084580054166
{'n_epochs': 60, 'n_factors': 50}


In [112]:
model = NMF(n_epochs=60, n_factors=50)
model.fit(train)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7f2cd14784f0>

In [113]:
pred = model.test(test)
accuracy.rmse(pred)

RMSE: 1.1214


1.121391764976302

In [114]:
uid_list = [10556, 53, 13875, 8349, 5484, 9601, 6823, 16187, 4422, 12681]

hit_rate = calc_hit(model, uid_list, data_df, 10)
print(hit_rate)

0.41
