In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

## Dataset 불러오기

In [3]:
path = '/content/drive/MyDrive/data/movielens/'

ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding='utf-8')
movies_df = pd.read_csv(os.path.join(path, 'movies.csv'), index_col='movieId', encoding='utf-8')
tags_df = pd.read_csv(os.path.join(path, 'tags.csv'), encoding='utf-8')

## Genres를 이용한 movie representation

In [4]:
total_count = len(movies_df.index)
total_genres = list(set([genres for sublist in list(map(lambda x: x.split('|'), movies_df['genres'])) for genres in sublist]))

In [5]:
print(f'전체 영화 수: {total_count}')
print(f'장르: {total_genres}')

전체 영화 수: 9742
장르: ['(no genres listed)', 'War', 'Animation', 'IMAX', 'Mystery', 'Western', 'Adventure', 'Crime', 'Action', 'Thriller', 'Sci-Fi', 'Romance', 'Musical', 'Film-Noir', 'Comedy', 'Children', 'Fantasy', 'Drama', 'Horror', 'Documentary']


In [6]:
print(len(total_genres))

20


In [7]:
genres_count  = dict.fromkeys(total_genres)

for each_genres_list in movies_df['genres']:
  for genres in each_genres_list.split('|'):
    if genres_count[genres] == None:
      genres_count[genres] = 1
    else:
      genres_count[genres] += 1

In [8]:
genres_count

{'(no genres listed)': 34,
 'War': 382,
 'Animation': 611,
 'IMAX': 158,
 'Mystery': 573,
 'Western': 167,
 'Adventure': 1263,
 'Crime': 1199,
 'Action': 1828,
 'Thriller': 1894,
 'Sci-Fi': 980,
 'Romance': 1596,
 'Musical': 334,
 'Film-Noir': 87,
 'Comedy': 3756,
 'Children': 664,
 'Fantasy': 779,
 'Drama': 4361,
 'Horror': 978,
 'Documentary': 440}

In [9]:
# 장르별 가중치 계산 IDF
for each_genres in genres_count:
  genres_count[each_genres] = np.log10(total_count/genres_count[each_genres])

In [10]:
genres_count

{'(no genres listed)': 2.457169208193496,
 'War': 1.4065847623240424,
 'Animation': 1.2026069149931968,
 'IMAX': 1.7899910382813284,
 'Mystery': 1.2304935032683613,
 'Western': 1.7659316540881678,
 'Adventure': 0.8872447746804204,
 'Crime': 0.9098289421369025,
 'Action': 0.7266719338379385,
 'Thriller': 0.7112681505684965,
 'Sci-Fi': 0.9974220495432562,
 'Romance': 0.7856152382210405,
 'Musical': 1.4649016584241867,
 'Film-Noir': 2.0491288726171324,
 'Comedy': 0.41392254164167785,
 'Children': 1.1664800458677336,
 'Fantasy': 1.0971106675631868,
 'Drama': 0.3490620385623247,
 'Horror': 0.9983092704481497,
 'Documentary': 1.3451954487495636}

In [11]:
dict_temp = {i: genres_count[i] for i in movies_df.iloc[0,:]['genres'].split('|')}
dict_temp

{'Adventure': 0.8872447746804204,
 'Animation': 1.2026069149931968,
 'Children': 1.1664800458677336,
 'Comedy': 0.41392254164167785,
 'Fantasy': 1.0971106675631868}

In [12]:
from operator import ge
# create genres representations
genres_representation = pd.DataFrame(columns=sorted(total_genres), index=movies_df.index)
for index, each_row in tqdm(movies_df.iterrows()):
  dict_temp = {i: genres_count[i] for i in each_row['genres'].split('|')}
  row_to_add = pd.DataFrame(dict_temp, index=[index])
  genres_representation.update(row_to_add)

genres_representation

9742it [00:56, 170.98it/s]


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,,,0.887245,1.202607,1.16648,0.413923,,,,1.097111,,,,,,,,,,
2,,,0.887245,,1.16648,,,,,1.097111,,,,,,,,,,
3,,,,,,0.413923,,,,,,,,,,0.785615,,,,
4,,,,,,0.413923,,,0.349062,,,,,,,0.785615,,,,
5,,,,,,0.413923,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,,0.726672,,1.202607,,0.413923,,,,1.097111,,,,,,,,,,
193583,,,,1.202607,,0.413923,,,,1.097111,,,,,,,,,,
193585,,,,,,,,,0.349062,,,,,,,,,,,
193587,,0.726672,,1.202607,,,,,,,,,,,,,,,,


## Tag를 이용한 Movie Representation

In [13]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [14]:
# get unique tag
tag_column = list(map(lambda x: x.split(','), tags_df['tag']))
unique_tags = list(set(list(map(lambda x: x.strip(), list([tag for sublist in tag_column for tag in sublist])))))

print(unique_tags)

['no dialogue', 'Hawkeye', 'hallucinatory', 'moody', 'police', 'dreamy', 'weird', 'Vulgar', 'gothic', 'Rogers and Hammerstein', 'Rogue', 'good cinematography', 'new composer', 'engrossing adventure', 'classic movie', 'Dogs', 'blood', 'shakespeare', 'morality', 'court', 'Michael Cera', 'bank', 'samurai', 'reflective', 'dreams', 'heartwarming', 'divorce', 'Psychological Thriller', 'gore', 'rabbi', 'updated classics', 'old', 'fighting', 'British', 'pool', 'Jason Segel', 'human rights', 'coulda been a contender', 'post-apocalyptic', 'Australia', 'drugs & music', 'Horror', 'depressing', 'villain nonexistent or not needed for good story', 'fantasy', 'class', 'virtual reality', 'passion', '1970s', 'foul language', 'music business', 'weddings', 'pop culture references', 'futuristic', 'art', 'alter ego', 'Afghanistan', 'space opera', 'Italy', 'awesome', 'domestic violence', 'amazing', 'cheesy', 'Klingons', 'Peter Pan', 'Viggo Mortensen', 'lawyer', 'ransom', 'happy ending', 'philosopical', 'clic

In [15]:
print(len(tag_column))
print(len(unique_tags))

3683
1589


In [16]:
# Compute IDF for tag
total_movie_count = len(set(tags_df['movieId']))
# key : tag, value: number of movies with such tag
tag_count_dict = dict.fromkeys(unique_tags)

for each_movie_tag_list in tags_df['tag']:
  for tag in each_movie_tag_list.split(','):
    if tag_count_dict[tag.strip()] == None:
      tag_count_dict[tag.strip()] = 1
    else:
      tag_count_dict[tag.strip()] += 1

tag_idf = dict()
for each_tag in tag_count_dict:
  tag_idf[each_tag] = np.log10(total_movie_count/tag_count_dict[each_tag])

tag_idf

{'no dialogue': 3.196452541703389,
 'Hawkeye': 3.196452541703389,
 'hallucinatory': 2.2422100322640643,
 'moody': 2.895422546039408,
 'police': 2.351354501689132,
 'dreamy': 3.196452541703389,
 'weird': 2.5943925503754266,
 'Vulgar': 3.196452541703389,
 'gothic': 2.4183012913197452,
 'Rogers and Hammerstein': 2.895422546039408,
 'Rogue': 3.196452541703389,
 'good cinematography': 3.196452541703389,
 'new composer': 3.196452541703389,
 'engrossing adventure': 3.196452541703389,
 'classic movie': 3.196452541703389,
 'Dogs': 3.196452541703389,
 'blood': 3.196452541703389,
 'shakespeare': 3.196452541703389,
 'morality': 2.895422546039408,
 'court': 2.1172712956557644,
 'Michael Cera': 2.7193312869837265,
 'bank': 3.196452541703389,
 'samurai': 2.4974825373673704,
 'reflective': 2.895422546039408,
 'dreams': 3.196452541703389,
 'heartwarming': 2.2933625547114453,
 'divorce': 2.2933625547114453,
 'Psychological Thriller': 3.196452541703389,
 'gore': 3.196452541703389,
 'rabbi': 3.19645254170

In [17]:
len(tag_idf.keys())

1589

In [18]:
tags_df.groupby(by='movieId').head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3677,606,6107,World War II,1178473747
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984


In [19]:
# Create movie representations
tag_representation = pd.DataFrame(columns=sorted(unique_tags), index=list(set(tags_df['movieId'])))

for name, group in tqdm(tags_df.groupby(by='movieId')):
  temp_list = list(map(lambda x: x.split(','), list(group['tag'])))
  temp_tag_list = list(set(list(map(lambda x: x.strip(), list([tag for sublist in temp_list for tag in sublist])))))

  dict_temp = {i: tag_idf[i.strip()] for i in temp_tag_list}
  row_to_add = pd.DataFrame(dict_temp, index=[group['movieId'].values[0]])
  tag_representation.update(row_to_add)
  
tag_representation = tag_representation.sort_index(0)
tag_representation


100%|██████████| 1572/1572 [04:32<00:00,  5.77it/s]
  tag_representation = tag_representation.sort_index(0)


Unnamed: 0,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001-like,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183611,,,,,,,,,,,...,,,,,,,,,,
184471,,,,,,,,,,,...,,,,,,,,,,
187593,,,,,,,,,,,...,,,,,,,,,,
187595,,,,,,,,,,,...,,,,,,,,,,


In [20]:
movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [21]:
tag_representation.loc[1].dropna()

fun      2.497483
pixar    2.895423
Name: 1, dtype: object

In [22]:
tag_representation.loc[2].dropna()

Robin Williams      2.719331
fantasy             2.418301
game                3.196453
magic board game    3.196453
Name: 2, dtype: object

In [23]:
print(genres_representation.shape)
print(tag_representation.shape)

(9742, 20)
(1572, 1589)


## Final Movie Representation
  * genres와 tag로 만들어진 representation을 합쳐서 각 movie의 vector로 만든다.

In [24]:
movie_representation = pd.concat([genres_representation, tag_representation], axis=1).fillna(0)
print(movie_representation.head())
print(movie_representation.describe())

   (no genres listed)  Action  Adventure  Animation  Children    Comedy  \
1                 0.0     0.0   0.887245   1.202607   1.16648  0.413923   
2                 0.0     0.0   0.887245   0.000000   1.16648  0.000000   
3                 0.0     0.0   0.000000   0.000000   0.00000  0.413923   
4                 0.0     0.0   0.000000   0.000000   0.00000  0.413923   
5                 0.0     0.0   0.000000   0.000000   0.00000  0.413923   

   Crime  Documentary     Drama   Fantasy  ...  women  wonderwoman  workplace  \
1    0.0          0.0  0.000000  1.097111  ...    0.0          0.0        0.0   
2    0.0          0.0  0.000000  1.097111  ...    0.0          0.0        0.0   
3    0.0          0.0  0.000000  0.000000  ...    0.0          0.0        0.0   
4    0.0          0.0  0.349062  0.000000  ...    0.0          0.0        0.0   
5    0.0          0.0  0.000000  0.000000  ...    0.0          0.0        0.0   

   writing  wrongful imprisonment  wry  younger men  zither  z

## Content 유사도 평가
  * Cosine similarity를 사용

In [25]:
from sklearn.metrics.pairwise import cosine_similarity

def cos_sim_matrix(a, b):
  cos_sim = cosine_similarity(a, b)
  result_df = pd.DataFrame(data=cos_sim, index=[a.index])
  return result_df

In [26]:
cs_df = cos_sim_matrix(movie_representation, movie_representation)
cs_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
1,1.0,0.124438,0.008403,0.040571,0.011755,0.0,0.016339,0.331122,0.0,0.131794,...,0.064466,0.260941,0.071492,0.27171,0.0,0.348295,0.379492,0.0,0.232553,0.093519
2,0.124438,1.0,0.0,0.0,0.0,0.0,0.0,0.240843,0.0,0.095861,...,0.0,0.0,0.0,0.0,0.0,0.108082,0.117763,0.0,0.0,0.0
3,0.008403,0.0,1.0,0.179391,0.011294,0.0,0.072246,0.0,0.0,0.0,...,0.00656,0.0,0.068686,0.0,0.0,0.020322,0.022142,0.0,0.0,0.089849
4,0.040571,0.0,0.179391,1.0,0.05453,0.0,0.348828,0.0,0.0,0.0,...,0.031674,0.101979,0.567487,0.0,0.0,0.098119,0.106908,0.365843,0.0,0.433821
5,0.011755,0.0,0.011294,0.05453,1.0,0.0,0.640342,0.0,0.0,0.0,...,0.009177,0.0,0.096091,0.0,0.0,0.028429,0.030976,0.0,0.0,0.125697


In [27]:
cs_df.shape

(9742, 9742)

In [28]:
print(cs_df.shape)
print(cs_df[1].sort_values(ascending=False))

(9742, 9742)
2         1.000000
46972     0.322201
158813    0.300850
119655    0.300850
80748     0.300850
            ...   
4921      0.000000
4920      0.000000
4919      0.000000
4917      0.000000
193609    0.000000
Name: 1, Length: 9742, dtype: float64


In [29]:
print(movies_df.loc[1])
print(movies_df.loc[46972])
print(movies_df.loc[158813])
print(movies_df.loc[119655])
print(movies_df.loc[80748])


title                                Toy Story (1995)
genres    Adventure|Animation|Children|Comedy|Fantasy
Name: 1, dtype: object
title     Night at the Museum (2006)
genres    Action|Comedy|Fantasy|IMAX
Name: 46972, dtype: object
title     Alice Through the Looking Glass (2016)
genres                Adventure|Children|Fantasy
Name: 158813, dtype: object
title             Seventh Son (2014)
genres    Adventure|Children|Fantasy
Name: 119655, dtype: object
title     Alice in Wonderland (1933)
genres    Adventure|Children|Fantasy
Name: 80748, dtype: object


## 추천시스템의 성능 평가
  * 학습셋과 테스트셋을 나눈다.
  * 테스트셋에서 예측한 평점과 실제 평점의 RMSE를 구한다.

In [30]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=1234)

In [31]:
print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


In [34]:
test_userids = list(set(test_df.userId.values))
test_userids

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185

In [35]:
result_df = pd.DataFrame()

for user_id in tqdm(test_userids):
  user_record_df = train_df.loc[train_df.userId == int(user_id), :]

  user_sim_df = cs_df.loc[user_record_df['movieId']] # (n, 9742)
  user_rating_df = user_record_df[['rating']] #(n, 1)
  sim_sum = np.sum(user_sim_df.T.to_numpy(), -1)

  prediction = np.matmul(user_sim_df.T.to_numpy(), user_rating_df.to_numpy()).flatten() / (sim_sum + 1)

  prediction_df = pd.DataFrame(prediction, index=cs_df.index).reset_index()
  prediction_df.columns = ['movieId', 'pred_rating']
  prediction_df = prediction_df[['movieId', 'pred_rating']][prediction_df.movieId.isin(test_df[test_df.userId == user_id]['movieId'].values)]
  
  temp_df = prediction_df.merge(test_df[test_df.userId == user_id], on='movieId')
  result_df = pd.concat([result_df, temp_df], axis=0)

100%|██████████| 610/610 [00:13<00:00, 45.47it/s]


In [36]:
result_df.head(10)

Unnamed: 0,movieId,pred_rating,userId,rating,timestamp
0,1,4.145652,1,4.0,964982703
1,50,3.650755,1,5.0,964982931
2,216,2.670124,1,5.0,964981208
3,223,2.612844,1,3.0,964980985
4,231,4.215284,1,5.0,964981179
5,235,3.61982,1,4.0,964980908
6,316,4.136756,1,3.0,964982310
7,457,3.218743,1,5.0,964981909
8,543,3.729524,1,4.0,964981179
9,592,4.024728,1,4.0,964982271


In [39]:
mse = mean_squared_error(y_true=result_df['rating'].values, y_pred=result_df['pred_rating'].values)
rmse = np.sqrt(mse)
print(mse, rmse)

1.40606646706041 1.1857767357561078
