## 1. Dataset

In [29]:
# Load the ratings table and discard the timestamp column, which is not used below.
r_df = pd.read_csv("ratings_small.csv").drop(columns="timestamp")
r_df.tail()

Unnamed: 0,userId,movieId,rating
99999,671,6268,2.5
100000,671,6269,4.0
100001,671,6365,4.0
100002,671,6385,2.5
100003,671,6565,3.5


## 2. Check Value Distributions

In [30]:
# Distinct user ids that appear in the ratings table.
uni_user = pd.unique(r_df["userId"])
len(uni_user)

671

In [31]:
# Distinct movie ids that appear in the ratings table.
uni_movie = pd.unique(r_df["movieId"])
len(uni_movie)

9066

In [32]:
# Number of rows per rating value, ordered by the rating value itself
# (sort_index decides the final order, so the frequency sort is skipped).
uni_rating = r_df["rating"].value_counts(sort=False).sort_index()
uni_rating

0.5     1101
1.0     3326
1.5     1687
2.0     7271
2.5     4449
3.0    20064
3.5    10538
4.0    28750
4.5     7723
5.0    15095
Name: rating, dtype: int64

In [33]:
# Rating distribution: number of rows for each rating value.

r_df.groupby("rating").size().rename("rating_counts").reset_index()

Unnamed: 0,rating,rating_counts
0,0.5,1101
1,1.0,3326
2,1.5,1687
3,2.0,7271
4,2.5,4449
5,3.0,20064
6,3.5,10538
7,4.0,28750
8,4.5,7723
9,5.0,15095


In [43]:
# User distribution: how many ratings each user submitted, most active first.

user_counts_df = (
    r_df.groupby("userId")
    .size()
    .reset_index(name="user_rating_counts")
    .sort_values("user_rating_counts", ascending=False)
)
user_counts_df.head()

Unnamed: 0,userId,user_rating_counts
546,547,2391
563,564,1868
623,624,1735
14,15,1700
72,73,1610


In [51]:
# Movie distribution: how many ratings each movie received, most rated first.

movie_counts_df = (
    r_df.groupby("movieId")
    .size()
    .reset_index(name="movie_rating_counts")
    .sort_values("movie_rating_counts", ascending=False)
)
movie_counts_df.head()

Unnamed: 0,movieId,movie_rating_counts
321,356,341
266,296,324
284,318,311
525,593,304
232,260,291


## 3. Preprocessing

In [71]:
# Rating-count thresholds; the filtering cells below keep only users / movies
# whose count is strictly greater than these values.
u_limit = 365  # per-user threshold
m_limit = 100  # per-movie threshold

In [72]:
# Ids of the users whose rating count exceeds u_limit (order of user_counts_df is kept).
filtered_userId = list(
    user_counts_df.loc[user_counts_df["user_rating_counts"] > u_limit, "userId"]
)
len(filtered_userId), filtered_userId[:5]

(59, [547, 564, 624, 15, 73])

In [73]:
# Ids of the movies whose rating count exceeds m_limit (order of movie_counts_df is kept).
filtered_movieId = list(
    movie_counts_df.loc[movie_counts_df["movie_rating_counts"] > m_limit, "movieId"]
)
len(filtered_movieId), filtered_movieId[:5]

(149, [356, 296, 318, 593, 260])

In [78]:
# Keep only the ratings given by the selected users to the selected movies.
# (The original cell measured `filterd_df`, a misspelled name that only exists
# as leftover kernel state; on a fresh kernel it raises NameError.)
filtered_df = r_df[r_df["userId"].isin(filtered_userId)]
filtered_df = filtered_df[filtered_df["movieId"].isin(filtered_movieId)]
len(filtered_df)

5570

## 4. Pivot

In [84]:
# User x movie rating matrix; combinations without a rating are filled with 0.
u_df = pd.pivot_table(
    filtered_df,
    values="rating",
    index=["userId"],
    columns=["movieId"],
    aggfunc=np.average,
    fill_value=0,
    dropna=False,
)
u_df.tail()

movieId,1,2,6,10,25,32,34,36,39,47,...,6377,6539,6874,7153,7361,7438,8961,33794,58559,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
615,4.0,0.0,3.5,0.0,0.0,3.0,0.0,0.0,0.0,4.5,...,4.5,3.5,4.5,4.0,4.0,4.0,4.0,4.0,4.5,4.0
624,5.0,3.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0,4.0,...,3.5,3.5,4.0,4.5,0.0,3.5,4.5,3.5,3.5,4.0
654,5.0,3.0,0.0,4.0,0.0,5.0,4.5,4.5,0.0,4.5,...,5.0,4.5,4.5,5.0,4.0,4.0,5.0,4.5,0.0,0.0
664,3.5,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,4.5,...,0.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.5,5.0
665,0.0,3.0,0.0,0.0,0.0,4.0,2.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 5. Function

In [92]:
def cosine_similarity(v_1, v_2):
    """Cosine similarity of two rating vectors over their co-rated positions.

    A value of 0 is treated as "no rating": only positions where BOTH vectors
    are non-zero take part in the computation.

    Parameters
    ----------
    v_1, v_2 : array-like of numbers (list, ndarray, pandas Series)
        One-dimensional vectors of equal length.

    Returns
    -------
    float or int
        The cosine similarity of the co-rated entries, or -1 when there is no
        position that is non-zero in both vectors.

    Raises
    ------
    ValueError
        If the two vectors do not have the same shape.
    """
    # Convert up front: lists have no .nonzero(), and Series.nonzero() no longer
    # exists in pandas >= 1.0, so calling it on the raw argument is not portable.
    a = np.asarray(v_1, dtype=float)
    b = np.asarray(v_2, dtype=float)
    if a.shape != b.shape:
        raise ValueError("v_1 and v_2 must have the same shape")

    # Positions rated in both vectors (same result as filtering by v_1, then v_2).
    both = (a != 0) & (b != 0)
    if not both.any():
        return -1

    # scipy returns the cosine *distance*; similarity is its complement.
    return 1 - sp.spatial.distance.cosine(a[both], b[both])

In [93]:
# Test cosine_similarity on a small hand-made example:
# only positions 0 and 4 are non-zero in both vectors.
v_1, v_2 = np.array([1, 0, 3, 0, 5]), np.array([5, 3, 0, 1, 5])

cosine_similarity(v_1, v_2)

0.8320502943378437

In [94]:
# Same check on real data: similarity between the rating rows of users 15 and 19.
cosine_similarity(v_1=u_df.loc[15], v_2=u_df.loc[19])

0.9501250301799182

In [97]:
def similarity_matrix(u_df, similarity_func):
    """Build the pairwise similarity table between all rows of ``u_df``.

    Parameters
    ----------
    u_df : pandas.DataFrame
        One row per user; the row is handed to ``similarity_func`` as a Series.
    similarity_func : callable
        Called as ``similarity_func(row_a, row_b)`` for every ordered pair of
        rows (including a row with itself).

    Returns
    -------
    pandas.DataFrame
        Square frame indexed and labelled by ``u_df.index`` whose cell
        ``[a, b]`` holds ``similarity_func(row_a, row_b)``.
    """
    users = u_df.index

    # After transposing, every column is one user's rating vector.
    per_user = [ratings for _, ratings in u_df.T.items()]

    scores = [
        [similarity_func(left, right) for right in per_user]
        for left in per_user
    ]

    return pd.DataFrame(scores, index=users, columns=users)

In [103]:
# Test similarity_matrix: user-by-user cosine similarity table.
sm_df = similarity_matrix(u_df=u_df, similarity_func=cosine_similarity)
sm_df.tail()

userId,15,19,23,30,48,56,73,102,105,119,...,580,587,596,605,607,615,624,654,664,665
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
615,0.913701,0.986879,0.977971,0.976351,0.990701,0.986844,0.977249,0.977529,0.981156,0.975449,...,0.98418,0.982973,0.984113,0.945157,0.979301,1.0,0.977023,0.991736,0.991745,0.97756
624,0.933455,0.973056,0.974726,0.970812,0.977897,0.964744,0.968863,0.967962,0.974071,0.964382,...,0.967887,0.974954,0.970422,0.949962,0.968329,0.977023,1.0,0.978762,0.977336,0.959331
654,0.917356,0.979269,0.981476,0.978836,0.987746,0.977249,0.97678,0.976687,0.983877,0.977521,...,0.983758,0.97794,0.981081,0.955068,0.98024,0.991736,0.978762,1.0,0.993751,0.973894
664,0.930106,0.979273,0.985208,0.974926,0.993049,0.976032,0.982777,0.976597,0.982344,0.982795,...,0.98368,0.97312,0.988273,0.956637,0.988647,0.991745,0.977336,0.993751,1.0,0.974557
665,0.903008,0.95424,0.967124,0.951942,0.97652,0.94866,0.962269,0.937813,0.964187,0.967379,...,0.962619,0.955417,0.954341,0.92442,0.960054,0.97756,0.959331,0.973894,0.974557,1.0


In [105]:
def mean_score(u_df, sm_df, target, closer_count):
    """Average the ratings of the users most similar to ``target``.

    Parameters
    ----------
    u_df : pandas.DataFrame
        Rating matrix, one row per user and one column per movie.
    sm_df : pandas.DataFrame
        User-by-user similarity table (rows and columns labelled by user).
    target : label
        The user for whom the neighbours are selected.
    closer_count : int or None
        Number of most-similar users to average over; ``None`` uses all others.

    Returns
    -------
    pandas.DataFrame
        Two float rows over ``u_df.columns``: ``"user"`` is the target's own
        ratings, ``"mean"`` is the column-wise mean of the neighbours' rows.
        Every neighbour value is averaged as-is, so if ``u_df`` stores missing
        ratings as 0 those zeros pull the mean down.

    Raises
    ------
    ValueError
        If ``closer_count`` is smaller than 1 (a non-positive count would
        silently select no neighbours or an all-but-last slice).
    """
    if closer_count is not None and closer_count < 1:
        raise ValueError("closer_count must be at least 1")

    # Similarity of every other user to the target, best first.
    neighbours = sm_df[target].drop(target).sort_values(ascending=False)
    neighbours = neighbours.iloc[:closer_count]

    user_row = u_df.loc[target]
    mean_row = u_df.loc[neighbours.index].mean().reindex(u_df.columns)

    # Build the frame in one go; growing an empty frame row by row with .loc
    # can leave object-dtype columns depending on the pandas version.
    values = np.vstack([
        user_row.to_numpy(dtype=float),
        mean_row.to_numpy(dtype=float),
    ])
    return pd.DataFrame(values, index=["user", "mean"], columns=u_df.columns)

In [112]:
# Test mean_score: user 48 versus the mean of the 10 most similar users.
ms_df = mean_score(u_df=u_df, sm_df=sm_df, target=48, closer_count=10)
ms_df

movieId,1,2,6,10,25,32,34,36,39,47,...,6377,6539,6874,7153,7361,7438,8961,33794,58559,79132
user,4.0,3.5,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,4.0,4.0,0.0,4.0,4.5,0.0,3.5,0.0,4.0,4.0
mean,3.6,1.65,1.5,1.5,1.2,2.1,1.55,0.95,1.55,2.95,...,2.6,2.85,2.4,3.05,2.35,2.35,3.0,2.6,1.4,1.75


In [113]:
#recommend
def recomment(ms_df):
    """Rank the movies the user has not rated by the neighbours' mean score.

    Parameters
    ----------
    ms_df : pandas.DataFrame
        Frame with a ``"user"`` row and a ``"mean"`` row, one column per movie.

    Returns
    -------
    tuple
        ``(frame, ids)`` where ``frame`` has one row per movie whose ``"user"``
        value equals 0, sorted by ``"mean"`` in descending order, and ``ids``
        is the list of that frame's index labels in the same order.
    """
    by_movie = ms_df.T
    unseen = by_movie.loc[by_movie["user"] == 0]
    ranked = unseen.sort_values(by="mean", ascending=False)

    return ranked, list(ranked.index)

In [114]:
# Test recomment: ranked frame plus the plain list of recommended movie ids.
recomment_df, recomment_list = recomment(ms_df=ms_df)
print(recomment_list)
recomment_df.head()

[260, 1198, 1291, 318, 1036, 1196, 1210, 1265, 1136, 2028, 589, 380, 2502, 50, 457, 1197, 1704, 2916, 2791, 2918, 377, 733, 780, 2858, 47, 1617, 2396, 597, 1213, 590, 608, 2797, 1968, 592, 1517, 2628, 33794, 2683, 2716, 919, 165, 4963, 1393, 1259, 6874, 1387, 292, 1961, 7438, 5445, 1089, 454, 32, 316, 349, 4896, 1200, 1073, 500, 587, 736, 3996, 588, 1193, 539, 150, 1221, 912, 253, 185, 329, 339, 750, 153, 39, 586, 10, 6, 2174, 1208, 208, 25, 357, 36, 434]


Unnamed: 0_level_0,user,mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
260,0.0,4.4
1198,0.0,4.35
1291,0.0,4.25
318,0.0,4.0
1036,0.0,3.95


In [156]:
# MAE
def mae(value, pre):
    """Mean absolute error between actual and predicted ratings.

    A value of 0 is treated as "missing" on either side: only positions where
    both ``value`` and ``pre`` are non-zero are compared.

    Parameters
    ----------
    value : array-like of numbers (list, ndarray, pandas Series)
        Actual ratings.
    pre : array-like of numbers
        Predicted ratings, same length as ``value``.

    Returns
    -------
    float or int
        The mean of ``|value - pre|`` over the compared positions, or -1 when
        there is no position that is non-zero in both inputs.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Convert up front: lists have no .nonzero(), and Series.nonzero() no longer
    # exists in pandas >= 1.0, so calling it on the raw argument is not portable.
    actual = np.asarray(value, dtype=float)
    predicted = np.asarray(pre, dtype=float)
    if actual.shape != predicted.shape:
        raise ValueError("value and pre must have the same shape")

    both = (actual != 0) & (predicted != 0)
    if not both.any():
        return -1

    return float(np.mean(np.abs(actual[both] - predicted[both])))

In [157]:
# Test mae: error between user 48's own ratings and the neighbours' mean.
mae(value=ms_df.loc["user"], pre=ms_df.loc["mean"])

1.2492187500000003

In [158]:
def evaluate(u_df, sm_df, closer_count, algorithm):
    """Average an error metric over every user in ``u_df``.

    For each user the neighbours' mean ratings are computed with
    ``mean_score`` and compared with the user's own ratings via ``algorithm``.

    Parameters
    ----------
    u_df : pandas.DataFrame
        Rating matrix, one row per user.
    sm_df : pandas.DataFrame
        User-by-user similarity table.
    closer_count : int
        Number of most-similar users passed on to ``mean_score``.
    algorithm : callable
        Called as ``algorithm(actual_row, predicted_row)``; expected to return
        a number, with -1 meaning "nothing to compare" (as ``mae`` does).

    Returns
    -------
    float
        Mean of the per-user results, ignoring -1 sentinels; ``nan`` when no
        user produced a usable result.
    """
    scores = []

    for target in u_df.index:
        pre_df = mean_score(u_df, sm_df, target, closer_count)
        score = algorithm(pre_df.loc["user"], pre_df.loc["mean"])

        # -1 is the "no comparable ratings" marker, not an error value;
        # averaging it in would make the overall error look smaller.
        if np.isscalar(score) and score == -1:
            continue
        scores.append(score)

    if not scores:
        return float("nan")
    return np.average(scores)

In [159]:
# Test evaluate: MAE averaged over all users, using 10 neighbours each.
evaluate(u_df=u_df, sm_df=sm_df, closer_count=10, algorithm=mae)

1.4807110633380403

In [162]:
# Sweep the neighbourhood size (both ends inclusive) and print the averaged MAE for each.
start, end = 2, 10

for closer_count in range(start, end + 1):
    score = evaluate(u_df, sm_df, closer_count, mae)
    print(closer_count, score)

2 1.304871232477446
3 1.489161802164799
4 1.5568822047461943
5 1.5484965993474271
6 1.5375646916686692
7 1.5316502433082961
8 1.5144641478552179
9 1.4995695256828276
10 1.4807110633380403
