In [13]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [14]:
df_ratings = pd.read_csv("file\\ml-1m\\ratings.dat", sep="::", header=None, names=["UserID", "MovieID", "Rating", "Timestamp"])
df_movies = pd.read_csv("file\\ml-1m\\movies.dat", sep="::", header=None, names=["MovieID", "Title", "Genres"])

In [15]:
df_ratings = df_ratings[df_ratings['Rating'] > 2]
df_ratings['Rating'] = 1
df_ratings

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,1,978300760
1,1,661,1,978302109
2,1,914,1,978301968
3,1,3408,1,978300275
4,1,2355,1,978824291
...,...,...,...,...
1000203,6040,1090,1,956715518
1000205,6040,1094,1,956704887
1000206,6040,562,1,956704746
1000207,6040,1096,1,956715648


In [16]:
x = df_ratings.copy()
x_train, x_test = train_test_split(x, test_size=0.05)

In [17]:
df_user_movie_ratings = x_train.pivot(index='UserID', columns='MovieID', values='Rating').fillna(0)
df_user_movie_ratings

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


k=13으로 하여 svd 실시

In [18]:
U, sigma, Vt = svds(df_user_movie_ratings, k=13)

In [19]:
print(U.shape)
print(sigma.shape)
print(Vt.shape)

(6039, 13)
(13,)
(13, 3615)


In [20]:
sigma = np.diag(sigma)

In [21]:
sigma

array([[ 58.135029  ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ],
       [  0.        ,  61.94239762,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ],
       [  0.        ,   0.        ,  65.21828571,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ],
       [  0.        ,   0.        ,   0.        ,  71.51117848,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ],
       [  0.        ,   0.        ,   0.        ,   0.        ,
         74.91148833,   0.        ,   0.        ,   0.        ,
          0.

In [22]:
svd_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt)
df_svd_preds = pd.DataFrame(svd_user_predicted_ratings, columns=df_user_movie_ratings.columns)
df_svd_preds.index += 1
df_svd_preds

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
1,0.438751,0.107435,0.026836,0.008029,0.023614,-0.059650,0.027179,0.028121,-0.022184,0.006450,...,-0.000979,0.000398,0.004352,-0.006525,-0.015879,0.062727,0.016806,-0.006146,0.002499,0.002047
2,0.246456,0.087400,0.077198,0.025193,0.059011,0.273449,0.134614,0.006462,0.033644,0.291373,...,-0.000775,0.001448,-0.001603,0.010070,0.004577,0.123379,-0.000025,0.007302,0.002041,0.081459
3,0.278588,0.030764,0.039403,-0.009388,0.013927,0.063570,0.016718,0.005851,0.003072,0.109615,...,-0.001437,-0.000202,0.001008,0.001963,-0.005300,0.095821,0.003695,-0.007927,-0.004012,0.004731
4,0.072668,0.001673,-0.024424,-0.017007,-0.033882,0.075727,-0.028482,0.000361,-0.001071,0.064464,...,-0.006018,-0.000964,-0.000770,-0.002009,-0.002580,-0.005518,-0.007532,-0.003912,-0.003312,-0.006587
5,0.416770,-0.015963,-0.061281,0.010903,-0.055785,0.309547,-0.009795,-0.006800,-0.021297,0.020761,...,0.033293,-0.000746,-0.001608,-0.001019,0.011341,0.131254,0.129207,0.013452,0.017110,0.100096
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,0.663826,0.275314,-0.114196,0.094367,-0.070905,0.467607,0.235539,0.002061,-0.035925,0.161457,...,0.069394,0.000049,-0.004339,-0.008599,0.063777,0.035069,0.256635,0.068506,0.055116,0.244908
6036,0.301247,-0.013040,0.022310,-0.013292,-0.016955,0.102624,0.009647,-0.013581,-0.007088,-0.060703,...,0.010008,0.001465,-0.002124,0.002439,0.028448,0.090853,0.042753,0.018736,0.004690,0.067351
6037,0.103321,0.004657,0.005268,0.002521,0.002050,-0.021391,0.025250,0.000617,-0.006394,-0.004663,...,-0.001689,0.000054,-0.000222,-0.003009,-0.001422,0.002447,-0.009693,-0.003740,-0.000113,-0.008726
6038,0.530533,0.125941,0.062544,0.007416,0.038348,-0.139440,0.114044,0.025047,-0.012237,0.065508,...,0.011464,0.004196,0.002825,0.002640,0.020450,0.045772,-0.016111,-0.000264,0.005554,-0.030582


적절한 값 기준으로 하여 봤다고 예측, recall 계산

In [24]:
df_svd_preds.index

RangeIndex(start=1, stop=6040, step=1)

In [26]:
# 0.5 기준
cnt = 0
for i in x_test.index:
    if x_test['MovieID'][i] in df_svd_preds.columns:
        if x_test['UserID'][i] in df_svd_preds.index:
            if (df_svd_preds[x_test['MovieID'][i]][x_test['UserID'][i]] >= 0.5):
                cnt += 1

cnt/len(x_test)

0.16913733741392503

In [27]:
# 해당 user의 평균 기준
cnt = 0
for i in x_test.index:
    if x_test['MovieID'][i] in df_svd_preds.columns:
        if x_test['UserID'][i] in df_svd_preds.index:
            if (df_svd_preds[x_test['MovieID'][i]][x_test['UserID'][i]] >= np.mean(df_svd_preds.loc[x_test['UserID'][i]])):
                cnt += 1

cnt/len(x_test)

0.7783330145371079

In [28]:
# 해당 user의 중앙값 기준
cnt = 0
for i in x_test.index:
    if x_test['MovieID'][i] in df_svd_preds.columns:
        if x_test['UserID'][i] in df_svd_preds.index:
            if (df_svd_preds[x_test['MovieID'][i]][x_test['UserID'][i]] >= np.median(df_svd_preds.loc[x_test['UserID'][i]])):
                cnt += 1

cnt/len(x_test)

0.8696920428462127

k=20으로 하여 svd 실시

In [30]:
U2, sigma2, Vt2 = svds(df_user_movie_ratings, k=20)
sigma2 = np.diag(sigma2)

svd_user_predicted_ratings2 = np.dot(np.dot(U2, sigma2), Vt2)
df_svd_preds2 = pd.DataFrame(svd_user_predicted_ratings2, columns=df_user_movie_ratings.columns)
df_svd_preds2.index += 1

x2 = df_ratings.copy()
x2_train, x2_test = train_test_split(x2, test_size=0.05)

적절한 값 기준으로 하여 봤다고 예측, recall 계산

In [31]:
# 0.5 기준
cnt = 0
for i in x2_test.index:
    if x2_test['MovieID'][i] in df_svd_preds2.columns:
        if x2_test['UserID'][i] in df_svd_preds2.index:
            if (df_svd_preds2[x2_test['MovieID'][i]][x2_test['UserID'][i]] >= 0.5):
                cnt += 1

cnt/len(x2_test)

0.2047149961744453

In [32]:
# 해당 user의 평균 기준
cnt = 0
for i in x2_test.index:
    if x2_test['MovieID'][i] in df_svd_preds2.columns:
        if x2_test['UserID'][i] in df_svd_preds2.index:
            if (df_svd_preds2[x2_test['MovieID'][i]][x2_test['UserID'][i]] >= np.mean(df_svd_preds2.loc[x2_test['UserID'][i]])):
                cnt += 1

cnt/len(x2_test)

0.7866296863045141

In [33]:
# 해당 user의 중앙값 기준
cnt = 0
for i in x2_test.index:
    if x2_test['MovieID'][i] in df_svd_preds2.columns:
        if x2_test['UserID'][i] in df_svd_preds2.index:
            if (df_svd_preds2[x2_test['MovieID'][i]][x2_test['UserID'][i]] >= np.median(df_svd_preds2.loc[x2_test['UserID'][i]])):
                cnt += 1

cnt/len(x2_test)

0.8744022570772763

k가 클수록 recall 높음, 기준을 평균/중앙값으로 했더니 recall 크게 증가(90% 이상)

In [204]:
print(np.mean(df_svd_preds.loc[520]))
print(np.median(df_svd_preds.loc[520]))

0.09855228856881962
0.043140503246548834


평균/중앙값 대부분 0.1 이하로 매우 작음 -> 0에 가까운데 봤다고 하는 기준으로 삼아도 되는지 고민
(precision 계산해보고 결정? 영화 추천 시 recall만 높으면 상관 없는지?)