In [0]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the tab-separated ratings file `u.data`.
# Passing `names=` supplies the column labels, so the first line of the
# file is read as data rather than as a header row.
cols_name = [
    'user_id',
    'item_id',
    'rating',
    'timestamp',
]
data_movie = pd.read_csv('u.data', sep='\t', names=cols_name)
print(data_movie.head(5))

   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596


In [5]:
# Build a dense ratings matrix: one row per user_id, one column per item_id,
# each cell holding that user's rating for that item.
# (user, item) pairs with no rating come out of the pivot as NaN and are
# replaced with 0, so 0 means "not rated" in everything downstream.
#
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0; both return the same NumPy array.
movie_rating = data_movie.pivot(
    index='user_id', columns='item_id', values='rating'
).fillna(0).values
print(movie_rating[0:5])
print(movie_rating.shape)

[[5. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [4. 3. 0. ... 0. 0. 0.]]
(943, 1682)


  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
# Pairwise cosine similarity between every pair of rows (users) of the
# ratings matrix. The same matrix is passed as both operands, so the
# result is square with one row and one column per user.
cos_sim = cosine_similarity(X=movie_rating, Y=movie_rating)
print(cos_sim[0:5])
print(cos_sim.shape)

[[1.         0.16693098 0.04745954 ... 0.14861694 0.17950788 0.39817474]
 [0.16693098 1.         0.11059132 ... 0.16148478 0.17226781 0.10579788]
 [0.04745954 0.11059132 1.         ... 0.10124256 0.13341615 0.02655587]
 [0.06435782 0.17812119 0.34415072 ... 0.15204088 0.17008611 0.05875214]
 [0.37847518 0.07297896 0.02124453 ... 0.13959513 0.15249741 0.31394084]]
(943, 943)


In [7]:
# Similarity of user 1 (row 0 of the matrix) to every user, itself included.
cos_sim_for_user_1 = cos_sim[0]
# Indices of the 10 users most similar to user 1, in ascending order of
# similarity (the most similar user is last).
# Row 0 is removed explicitly instead of assuming it sorts to the very end:
# if another user ties at the top similarity, argsort may place user 1's
# own index before the tied one, and a plain [-11:-1] slice would then keep
# user 1 as their own neighbour and drop a genuine one.
ranked_users = np.argsort(cos_sim_for_user_1)
similar_user = ranked_users[ranked_users != 0][-10:]
print(similar_user)

[275 302 428 737 456 434  91 267 863 915]


In [8]:
# Similarity scores of the selected neighbours, in the same order as
# `similar_user`.
print(np.take(cos_sim_for_user_1, similar_user))

[0.52452252 0.52571773 0.52594993 0.52703107 0.53847598 0.53866453
 0.54053356 0.54207705 0.54754826 0.56906573]


In [10]:
# Rating rows of the most similar users: one row per selected neighbour,
# all item columns kept.
movie_rating_of_similar_user = movie_rating[similar_user, :]
print(movie_rating_of_similar_user)

[[5. 4. 3. ... 0. 0. 0.]
 [5. 3. 3. ... 0. 0. 0.]
 [3. 3. 2. ... 0. 0. 0.]
 ...
 [3. 2. 1. ... 0. 0. 0.]
 [5. 4. 0. ... 0. 0. 0.]
 [4. 3. 3. ... 0. 0. 3.]]


In [11]:
# Weight each neighbour's ratings by that neighbour's similarity to user 1.
# The similarities are turned into a column vector so that each one
# broadcasts across every item column of the matching neighbour row.
weighted_movie_rating = (
    cos_sim_for_user_1[similar_user][:, np.newaxis]
    * movie_rating_of_similar_user
)
print(weighted_movie_rating)

[[2.62261261 2.09809009 1.57356757 ... 0.         0.         0.        ]
 [2.62858867 1.5771532  1.5771532  ... 0.         0.         0.        ]
 [1.57784978 1.57784978 1.05189985 ... 0.         0.         0.        ]
 ...
 [1.62623114 1.0841541  0.54207705 ... 0.         0.         0.        ]
 [2.73774131 2.19019305 0.         ... 0.         0.         0.        ]
 [2.27626293 1.70719719 1.70719719 ... 0.         0.         1.70719719]]


In [12]:
# Recommendation score per movie: the mean of the similarity-weighted
# ratings over the selected neighbours (axis 0), one value per item column.
# Missing ratings were filled with 0 when the matrix was built, so a
# neighbour who has not rated a movie contributes 0 to this mean rather
# than being skipped, and the sum is not divided by the total similarity.
mean_weighted_movie_rating = np.mean(weighted_movie_rating, axis=0)
print(mean_weighted_movie_rating)

[2.31138026 1.55919894 0.80678885 ... 0.         0.         0.17071972]


In [13]:
# Put user 1's own ratings next to the per-movie recommendation score.
# The frame gets the default 0-based index, i.e. the column position in the
# ratings matrix -- NOTE(review): this is presumably item_id - 1 rather than
# the item_id itself; confirm before mapping rows back to movie titles.
recommend_values = pd.DataFrame({
    'user_1_score': movie_rating[0],
    'recommend_value': mean_weighted_movie_rating,
})

# Among the movies user 1 has not rated (score 0), show the 10 with the
# highest recommendation score.
(
    recommend_values
    .loc[recommend_values['user_1_score'] == 0]
    .sort_values(by='recommend_value', ascending=False)
    .head(10)
)

Unnamed: 0,user_1_score,recommend_value
317,0.0,2.199688
473,0.0,2.100667
654,0.0,1.988316
422,0.0,1.985781
402,0.0,1.978832
356,0.0,1.974216
432,0.0,1.938533
384,0.0,1.875676
567,0.0,1.830542
469,0.0,1.770764
