In [1]:
import numpy as np
import scipy
import implicit

print(np.__version__)
print(scipy.__version__)
print(implicit.__version__)

1.21.4
1.7.1
0.4.8


In [2]:
import pandas as pd
import os
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
using_cols = ['user_id', 'movie_id', 'ratings']
ratings = ratings[using_cols]
ratings.tail(10)

Unnamed: 0,user_id,movie_id,ratings
1000199,6040,2022,5
1000200,6040,2028,5
1000201,6040,1080,4
1000202,6040,1089,4
1000203,6040,1090,3
1000204,6040,1091,1
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4
1000208,6040,1097,4


In [4]:
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [5]:
ratings.rename(columns={'ratings':'counts'}, inplace=True)

In [6]:
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [7]:
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
movies['title'] = movies['title'].str.lower() 
movies.head(10)

Unnamed: 0,movie_id,title,genre
0,1,toy story (1995),Animation|Children's|Comedy
1,2,jumanji (1995),Adventure|Children's|Fantasy
2,3,grumpier old men (1995),Comedy|Romance
3,4,waiting to exhale (1995),Comedy|Drama
4,5,father of the bride part ii (1995),Comedy
5,6,heat (1995),Action|Crime|Thriller
6,7,sabrina (1995),Comedy|Romance
7,8,tom and huck (1995),Adventure|Children's
8,9,sudden death (1995),Action
9,10,goldeneye (1995),Action|Adventure|Thriller


In [9]:
movie_ratings = pd.merge(ratings, movies)
movie_ratings = movie_ratings[movie_ratings.counts > 2]

In [10]:
movie_ratings.head()

Unnamed: 0,user_id,movie_id,counts,title,genre
0,1,1193,5,one flew over the cuckoo's nest (1975),Drama
1,2,1193,5,one flew over the cuckoo's nest (1975),Drama
2,12,1193,4,one flew over the cuckoo's nest (1975),Drama
3,15,1193,4,one flew over the cuckoo's nest (1975),Drama
4,17,1193,5,one flew over the cuckoo's nest (1975),Drama


In [11]:
condition = (movie_ratings['user_id']== movie_ratings.loc[0, 'user_id'])
movie_ratings.loc[condition]

Unnamed: 0,user_id,movie_id,counts,title,genre
0,1,1193,5,one flew over the cuckoo's nest (1975),Drama
1680,1,661,3,james and the giant peach (1996),Animation|Children's|Musical
2123,1,914,3,my fair lady (1964),Musical|Romance
2734,1,3408,4,erin brockovich (2000),Drama
3957,1,2355,5,"bug's life, a (1998)",Animation|Children's|Comedy
5556,1,1197,3,"princess bride, the (1987)",Action|Adventure|Comedy|Romance
7808,1,1287,5,ben-hur (1959),Action|Adventure|Drama
8474,1,2804,5,"christmas story, a (1983)",Comedy|Drama
9764,1,594,4,snow white and the seven dwarfs (1937),Animation|Children's|Musical
10471,1,919,4,"wizard of oz, the (1939)",Adventure|Children's|Drama|Musical


In [12]:
movie_ratings['user_id'].nunique()

6039

In [13]:
movie_ratings['title'].nunique()

3628

In [14]:
movie_count = movie_ratings.groupby('title')['user_id'].count()
movie_count.sort_values(ascending=False).head(30)

title
american beauty (1999)                                   3211
star wars: episode iv - a new hope (1977)                2910
star wars: episode v - the empire strikes back (1980)    2885
star wars: episode vi - return of the jedi (1983)        2716
saving private ryan (1998)                               2561
terminator 2: judgment day (1991)                        2509
silence of the lambs, the (1991)                         2498
raiders of the lost ark (1981)                           2473
back to the future (1985)                                2460
matrix, the (1999)                                       2434
jurassic park (1993)                                     2413
sixth sense, the (1999)                                  2385
fargo (1996)                                             2371
braveheart (1995)                                        2314
men in black (1997)                                      2297
schindler's list (1993)                                  2257
pr

In [15]:
user_count = movie_ratings.groupby('user_id')['title'].count()
user_count.describe()

count    6039.000000
mean      138.512668
std       156.241599
min         1.000000
25%        38.000000
50%        81.000000
75%       177.000000
max      1968.000000
Name: title, dtype: float64

In [16]:
user_median = movie_ratings.groupby('user_id')['counts'].median()
user_median.describe()

count    6039.000000
mean        4.055970
std         0.432143
min         3.000000
25%         4.000000
50%         4.000000
75%         4.000000
max         5.000000
Name: counts, dtype: float64

In [17]:
movie_ratings[movie_ratings['title'] == 'matrix, the (1999)']

Unnamed: 0,user_id,movie_id,counts,title,genre
122219,2,2571,4,"matrix, the (1999)",Action|Sci-Fi|Thriller
122220,5,2571,5,"matrix, the (1999)",Action|Sci-Fi|Thriller
122221,7,2571,5,"matrix, the (1999)",Action|Sci-Fi|Thriller
122222,8,2571,5,"matrix, the (1999)",Action|Sci-Fi|Thriller
122223,9,2571,5,"matrix, the (1999)",Action|Sci-Fi|Thriller
...,...,...,...,...,...
124648,6030,2571,5,"matrix, the (1999)",Action|Sci-Fi|Thriller
124649,6031,2571,5,"matrix, the (1999)",Action|Sci-Fi|Thriller
124650,6035,2571,5,"matrix, the (1999)",Action|Sci-Fi|Thriller
124651,6036,2571,3,"matrix, the (1999)",Action|Sci-Fi|Thriller


In [18]:
my_favorite = ['as good as it gets (1997)' , 'godfather, the (1972)' ,'once upon a time in america (1984)' ,'blade runner (1982)' ,'matrix, the (1999)']
favorite_id = ['1784' , '858' ,'1227' ,'608' ,'2571']
favorite_genre = ['Comedy|Drama', 'Action|Crime|Drama', 'Crime|Drama|Thriller', 'Film-Noir|Sci-Fi', 'Action|Sci-Fi|Thriller']
my_playlist = pd.DataFrame({'user_id': ['zimin']*5, 'movie_id': favorite_id,
                            'counts':[30]*5, 'title': my_favorite, 'genre': favorite_genre})

if not movie_ratings.isin({'user_id':['zimin']})['user_id'].any():  # user_id에 'zimin'이라는 데이터가 없다면
    movie_ratings = movie_ratings.append(my_playlist)                           # 위에 임의로 만든 my_favorite 데이터를 추가해 줍니다. 

movie_ratings.tail(10)       # 잘 추가되었는지 확인해 봅시다.

Unnamed: 0,user_id,movie_id,counts,title,genre
836473,5851,3607,5,one little indian (1973),Comedy|Drama|Western
836474,5854,3026,4,slaughterhouse (1987),Horror
836475,5854,690,3,"promise, the (versprechen, das) (1994)",Romance
836476,5938,2909,4,"five wives, three secretaries and me (1998)",Documentary
836477,5948,1360,5,identification of a woman (identificazione di ...,Drama
0,zimin,1784,30,as good as it gets (1997),Comedy|Drama
1,zimin,858,30,"godfather, the (1972)",Action|Crime|Drama
2,zimin,1227,30,once upon a time in america (1984),Crime|Drama|Thriller
3,zimin,608,30,blade runner (1982),Film-Noir|Sci-Fi
4,zimin,2571,30,"matrix, the (1999)",Action|Sci-Fi|Thriller


In [19]:
user_unique = movie_ratings['user_id'].unique()
movie_unique = movie_ratings['title'].unique()

user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

In [20]:
print(user_to_idx['zimin'])     
print(movie_to_idx['once upon a time in america (1984)'])

6039
1407


In [21]:
temp_user_data = movie_ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(movie_ratings):  
    print('user_id column indexing OK!!')
    movie_ratings['user_id'] = temp_user_data    
else:
    print('user_id column indexing Fail!!')

temp_movie_data = movie_ratings['title'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(movie_ratings):
    print('movie column indexing OK!!')
    movie_ratings['title'] = temp_movie_data
else:
    print('movie column indexing Fail!!')

movie_ratings

user_id column indexing OK!!
movie column indexing OK!!


Unnamed: 0,user_id,movie_id,counts,title,genre
0,0,1193,5,0,Drama
1,1,1193,5,0,Drama
2,2,1193,4,0,Drama
3,3,1193,4,0,Drama
4,4,1193,5,0,Drama
...,...,...,...,...,...
0,6039,1784,30,154,Comedy|Drama
1,6039,858,30,607,Action|Crime|Drama
2,6039,1227,30,1407,Crime|Drama|Thriller
3,6039,608,30,680,Film-Noir|Sci-Fi


In [22]:
movie_ratings.columns

Index(['user_id', 'movie_id', 'counts', 'title', 'genre'], dtype='object')

In [23]:
movie_ratings.counts

0     5
1     5
2     4
3     4
4     5
     ..
0    30
1    30
2    30
3    30
4    30
Name: counts, Length: 836483, dtype: int64

In [24]:
only_one = movie_ratings[movie_ratings['counts']<2]
one, all_data = len(only_one), len(movie_ratings)
print(f'{one},{all_data}')
print(f'Ratio of only_one over all data is {one/all_data:.2%}')  # f-format에 대한 설명은 https://bit.ly/2DTLqYU

0,836483
Ratio of only_one over all data is 0.00%


In [25]:
from scipy.sparse import csr_matrix

num_user = movie_ratings['user_id'].nunique()
num_movie = movie_ratings['title'].nunique()

csr_data = csr_matrix((movie_ratings.counts, (movie_ratings.user_id, movie_ratings.title)), shape= (num_user, num_movie))
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

In [26]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [27]:
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [28]:
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [29]:
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [30]:
zimin, america = user_to_idx['zimin'], movie_to_idx['once upon a time in america (1984)']
zimin_vector, america_vector = als_model.user_factors[zimin], als_model.item_factors[america]

In [31]:
zimin_vector

array([-2.2755518e+00, -1.2966377e+00, -1.0982990e+00, -1.6151271e+00,
        1.8500346e-01,  2.1182661e-01,  5.5419600e-01,  3.8299736e-02,
       -2.5107113e-01, -1.2913274e+00,  6.4778501e-01,  4.8783913e-01,
       -9.3414411e-02, -1.6358194e+00, -9.1477461e-02,  5.7659513e-01,
        6.9620854e-01, -3.5776433e-01,  6.3075316e-01,  9.2862672e-01,
        3.6962327e-01,  9.4797611e-01, -7.7773476e-01, -2.0011292e+00,
       -1.8518471e+00,  8.6478466e-01,  2.9083112e-01,  5.2862900e-01,
        1.6774682e+00,  1.3689740e+00, -2.8816396e-01,  6.3629419e-01,
       -7.4436665e-01,  3.9885315e-01, -1.5275006e+00,  1.1477264e+00,
        7.1909308e-01,  2.5659392e+00, -7.2915572e-01,  1.1058971e+00,
        1.7129760e+00, -1.0874630e+00, -1.2361755e+00,  7.8751111e-01,
        7.1193641e-01, -5.1178300e-01, -6.0271245e-01,  3.5402510e-01,
       -1.8875007e-01, -1.1123684e-01,  7.7714330e-01,  1.2907203e+00,
       -3.6486900e-01, -2.2754022e-03, -2.2968487e-01,  4.2790177e-01,
      

In [32]:
america_vector

array([-0.00132404, -0.01836432, -0.01231884, -0.00698211,  0.00622436,
       -0.00362379,  0.00409256,  0.00765725,  0.00251734, -0.00976704,
        0.02298606, -0.01254763, -0.00727373, -0.00215911, -0.01321287,
        0.01715865,  0.01531229,  0.01100184, -0.00158316,  0.00934806,
        0.00100686,  0.00577333,  0.00500001, -0.00922517, -0.00440146,
        0.00457089,  0.01513418,  0.01354502,  0.00694607,  0.00870995,
        0.0003255 ,  0.019883  ,  0.00994274,  0.01259371,  0.00133261,
        0.01494505,  0.00887176,  0.02202391,  0.00345467, -0.0095257 ,
        0.01217984, -0.00356498,  0.00759568,  0.01428647,  0.00816618,
        0.00205566,  0.00328688,  0.01489955,  0.00748761,  0.00809838,
        0.00538647,  0.00819517,  0.00742441, -0.01487918, -0.00864963,
       -0.00557363, -0.01231755, -0.00018344,  0.01549898,  0.00373526,
        0.00068775,  0.0059807 ,  0.00680821, -0.00127264,  0.00394966,
        0.00447227,  0.01279117, -0.00368339, -0.00906824,  0.01

In [33]:
np.dot(zimin_vector, america_vector)

0.61422

In [34]:
braveheart = movie_to_idx['braveheart (1995)']
braveheart_vector = als_model.item_factors[braveheart]
np.dot(zimin_vector, braveheart_vector)

0.22295947

In [35]:
favorite_movie = 'blade runner (1982)'
movie_id = movie_to_idx[favorite_movie]
similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie

[(680, 1.0),
 (651, 0.56585497),
 (1097, 0.54314625),
 (200, 0.53137815),
 (500, 0.4867825),
 (550, 0.48414105),
 (193, 0.48375216),
 (904, 0.47573587),
 (602, 0.46319008),
 (3156, 0.43949735),
 (91, 0.43355507),
 (1291, 0.42383102),
 (291, 0.42230898),
 (924, 0.42016488),
 (1096, 0.41781092)]

In [36]:
idx_to_movie = {v:k for k,v in movie_to_idx.items()}
[idx_to_movie[i[0]] for i in similar_movie]

['blade runner (1982)',
 'aliens (1986)',
 'brazil (1985)',
 'terminator, the (1984)',
 'chinatown (1974)',
 'dr. strangelove or: how i learned to stop worrying and love the bomb (1963)',
 'alien (1979)',
 'clockwork orange, a (1971)',
 '2001: a space odyssey (1968)',
 'running free (2000)',
 'close encounters of the third kind (1977)',
 'blood simple (1984)',
 'maltese falcon, the (1941)',
 'dark city (1998)',
 'manchurian candidate, the (1962)']

In [37]:
def get_similar_movie(movie_name: str):
    movie_id = movie_to_idx[movie_name]
    similar_movie = als_model.similar_items(movie_id)
    similar_movie = [idx_to_movie[i[0]] for i in similar_movie]
    return similar_movie

In [38]:
get_similar_movie('men in black (1997)')

['men in black (1997)',
 'jurassic park (1993)',
 'terminator 2: judgment day (1991)',
 'total recall (1990)',
 'independence day (id4) (1996)',
 'matrix, the (1999)',
 'fifth element, the (1997)',
 'lost world: jurassic park, the (1997)',
 'schlafes bruder (brother of sleep) (1995)',
 'face/off (1997)']

In [39]:
get_similar_movie('pulp fiction (1994)')

['pulp fiction (1994)',
 'goodfellas (1990)',
 'fargo (1996)',
 'usual suspects, the (1995)',
 'silence of the lambs, the (1991)',
 'shawshank redemption, the (1994)',
 'reservoir dogs (1992)',
 'good will hunting (1997)',
 'seven (se7en) (1995)',
 'l.a. confidential (1997)']

In [40]:
user = user_to_idx['zimin']
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(380, 0.82165474),
 (200, 0.7645983),
 (92, 0.68373126),
 (64, 0.61463004),
 (651, 0.6009196),
 (117, 0.5380462),
 (435, 0.4774961),
 (224, 0.47028345),
 (44, 0.46735996),
 (317, 0.46000227),
 (160, 0.4555472),
 (62, 0.45448187),
 (602, 0.4431426),
 (193, 0.43977505),
 (882, 0.42837092),
 (175, 0.4136569),
 (865, 0.41176844),
 (121, 0.3937),
 (85, 0.38234362),
 (500, 0.37810436)]

In [41]:
[idx_to_movie[i[0]] for i in movie_recommended]

['godfather: part ii, the (1974)',
 'terminator, the (1984)',
 'terminator 2: judgment day (1991)',
 'star wars: episode vi - return of the jedi (1983)',
 'aliens (1986)',
 'star wars: episode v - the empire strikes back (1980)',
 'godfather: part iii, the (1990)',
 'l.a. confidential (1997)',
 'star wars: episode iv - a new hope (1977)',
 'twelve monkeys (1995)',
 'forrest gump (1994)',
 'total recall (1990)',
 '2001: a space odyssey (1968)',
 'alien (1979)',
 'mad max 2 (a.k.a. the road warrior) (1981)',
 'men in black (1997)',
 'predator (1987)',
 'silence of the lambs, the (1991)',
 'league of their own, a (1992)',
 'chinatown (1974)']

In [42]:
rihanna = movie_to_idx['chinatown (1974)']
explain = als_model.explain(user, csr_data, itemid=rihanna)

In [43]:
[(idx_to_movie[i[0]], i[1]) for i in explain[1]]

[('blade runner (1982)', 0.33959450567365285),
 ('godfather, the (1972)', 0.12319358408879606),
 ('as good as it gets (1997)', 0.021295412792672885),
 ('once upon a time in america (1984)', -0.02698263485985065),
 ('matrix, the (1999)', -0.0768858514215374)]

In [44]:
#회고
# - 프로젝트를 진행하면서 알아낸 점 혹은 아직 모호한 점
# : np.dot의 수치를 획기적으로 높이고 싶은데, 잘 안된다. 지난 시도들을 통해 0.84까지 높여보았으나 되려 추천의 결과가 떨어져 현재로 고정했다.
# - 루브릭 평가 지표를 맞추기 위해 시도한 것들:
# : merge를 이용해 ratings와 movie를 하나로 합했고, timestep은 제거했다. 
# - 만약에 루브릭 평가 관련 지표를 달성 하지 못했을 때, 이유에 관한 추정:
# : 달성했다.
# - 자기 다짐:
# : 아자아자