In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error



In [None]:
path = '/content/drive/MyDrive/study/Recsys/data/movielens'

ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding = 'UTF-8')
movies_df = pd.read_csv(os.path.join(path, 'movies.csv'), index_col='movieId', encoding = 'UTF-8')
tags_df = pd.read_csv(os.path.join(path, 'tags.csv'), encoding = 'UTF-8')

In [None]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [None]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


## 1. 장르를 이용한 피쳐 생성

장르 총 개수?

In [None]:
splitted = list(map(lambda x: x.split('|'), movies_df['genres']))
total_genres = list(set( genre for sublist in splitted for genre in sublist))
total_count = len(movies_df.index)

print('전체 영화 수 : ', total_count)
print('전체 장르 수 : ', len(total_genres))

전체 영화 수 :  9742
전체 장르 수 :  20


장르별 등장 횟수

In [None]:
genre_count = dict.fromkeys(total_genres)

for each_genre_list in movies_df['genres']:
    for genre in each_genre_list.split('|'):
        if genre_count[genre] == None:
            genre_count[genre] = 1
        else:
            genre_count[genre   ] = genre_count[genre] +1

In [None]:
genre_count


{'War': 382,
 'Film-Noir': 87,
 'Action': 1828,
 'Animation': 611,
 'IMAX': 158,
 'Horror': 978,
 'Children': 664,
 'Crime': 1199,
 'Comedy': 3756,
 'Western': 167,
 'Adventure': 1263,
 'Sci-Fi': 980,
 'Documentary': 440,
 '(no genres listed)': 34,
 'Drama': 4361,
 'Thriller': 1894,
 'Musical': 334,
 'Mystery': 573,
 'Fantasy': 779,
 'Romance': 1596}

TF-IDF식으로 장르에 가중치 부여   
(많이 등장하는 것에 페널티, 유의미한 정보가 아니라는 뜻)

$$ y = log(전체 등장 수 / 본인 등장 수)$$

In [None]:
for each_genre in genre_count:
    genre_count[each_genre] = np.log10(total_count / genre_count[each_genre])

genre_count
                                       


{'War': 1.4065847623240424,
 'Film-Noir': 2.0491288726171324,
 'Action': 0.7266719338379385,
 'Animation': 1.2026069149931968,
 'IMAX': 1.7899910382813284,
 'Horror': 0.9983092704481497,
 'Children': 1.1664800458677336,
 'Crime': 0.9098289421369025,
 'Comedy': 0.41392254164167785,
 'Western': 1.7659316540881678,
 'Adventure': 0.8872447746804204,
 'Sci-Fi': 0.9974220495432562,
 'Documentary': 1.3451954487495636,
 '(no genres listed)': 2.457169208193496,
 'Drama': 0.3490620385623247,
 'Thriller': 0.7112681505684965,
 'Musical': 1.4649016584241867,
 'Mystery': 1.2304935032683613,
 'Fantasy': 1.0971106675631868,
 'Romance': 0.7856152382210405}

장르 값 갖는 데이터 프레임 만들기

In [None]:
genre_representation = pd.DataFrame(columns = sorted(total_genres), index = movies_df.index)
for index, each_row in tqdm(movies_df.iterrows()):
    # {Action : 1.44422, Comedy : 2.154}, 본인 포함 장르만
    dict_temp = {i: genre_count[i] for i in each_row['genres'].split('|')}
    # movie_id Action Comedy ...
    #   215      1.44   2.1  ..
    row_to_add = pd.DataFrame(dict_temp, index = [index])
    # 215인덱스 행만 업데이트
    genre_representation.update(row_to_add)

genre_representation.head()

9742it [01:23, 117.36it/s]


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,,,0.887245,1.202607,1.16648,0.413923,,,,1.097111,,,,,,,,,,
2,,,0.887245,,1.16648,,,,,1.097111,,,,,,,,,,
3,,,,,,0.413923,,,,,,,,,,0.785615,,,,
4,,,,,,0.413923,,,0.349062,,,,,,,0.785615,,,,
5,,,,,,0.413923,,,,,,,,,,,,,,


## 2. Tag를 이용한 피쳐 생성

In [None]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [None]:
# get unique tag
tag_column = list(map(lambda x: x.split(','), tags_df['tag']))
unique_tags = list(set(list(map(lambda x: x.strip(), list([tag for sublist in tag_column for tag in sublist])))))

print(unique_tags)

['embarassing scenes', 'entertaining', 'ensemble cast', 'American propaganda', 'psychedelic', 'Jude Law', 'Jesse Eisenberg', 'camp', 'audience intelligence underestimated', 'funny', 'cult', 'Witty', 'Leonardo DiCaprio', 'Matrix', 'psychological', 'sexy', 'plot holes', 'British gangster', 'figure skating', 'assassination', 'Ben Affleck', 'tolkien', 'bad', '1970s', 'ocean', 'dance marathon', 'McDonalds', 'philosophy', 'organized crime', 'coen brothers', 'killer', 'great screenplay', 'bizarre', 'Kurt Russell', 'kids', 'Mindfuck', 'Tarantino', 'lovely', 'Epic', 'Suspenseful', 'moving', 'macho', 'Lolita theme', 'franchise', 'organised crime', 'off-beat comedy', 'rug', 'Disney', 'unnecessary sequel', 'Mrs. DeWinter', 'Great movie', 'longing', 'Not Seen', 'Nudity (Full Frontal)', 'Morgan Freeman', 'Ichabod Crane', 'police', 'small towns', 'biography', 'Tennessee Williams', 'missing children', 'transplants', 'pulp', 'Alfred Hitchcock', 'bad acting', 'Dumas', 'procedural', 'Mystery', 'Favelas',

In [None]:
print(len(tag_column))
print(len(unique_tags))

3683
1589


In [None]:
total_movie_count = len(set(tags_df['movieId']))

similarly, idf for tag

In [None]:
total_movie_count = len(set(tags_df['movieId']))

# {tag : # movies with key tag }
tag_count_dict = dict.fromkeys(unique_tags)

for each_movie_tag_list in tags_df['tag']:
    for tag in each_movie_tag_list.split(","):
        if tag_count_dict[tag.strip()] == None:
            tag_count_dict[tag.strip()] = 1
        else:
            tag_count_dict[tag.strip()] += 1

tag_idf = dict()
for each_tag in tag_count_dict:
    tag_idf[each_tag] = np.log10(total_movie_count / tag_count_dict[each_tag])

tag_idf


{'embarassing scenes': 3.196452541703389,
 'entertaining': 3.196452541703389,
 'ensemble cast': 2.7193312869837265,
 'American propaganda': 3.196452541703389,
 'psychedelic': 2.7193312869837265,
 'Jude Law': 2.895422546039408,
 'Jesse Eisenberg': 3.196452541703389,
 'camp': 3.196452541703389,
 'audience intelligence underestimated': 2.895422546039408,
 'funny': 1.8347247056857963,
 'cult': 2.895422546039408,
 'Witty': 3.196452541703389,
 'Leonardo DiCaprio': 2.196452541703389,
 'Matrix': 3.196452541703389,
 'psychological': 2.155059856545164,
 'sexy': 3.196452541703389,
 'plot holes': 2.5943925503754266,
 'British gangster': 3.196452541703389,
 'figure skating': 2.895422546039408,
 'assassination': 2.351354501689132,
 'Ben Affleck': 2.895422546039408,
 'tolkien': 3.196452541703389,
 'bad': 2.2933625547114453,
 '1970s': 2.7193312869837265,
 'ocean': 3.196452541703389,
 'dance marathon': 3.196452541703389,
 'McDonalds': 3.196452541703389,
 'philosophy': 2.4183012913197452,
 'organized cr

In [None]:
len(tag_idf.keys())

1589

In [None]:
tag_representation = pd.DataFrame(columns=sorted(unique_tags), index=list(set(tags_df['movieId'])))

for name, group in tqdm(tags_df.groupby(by = "movieId")):
    # [ [tags..], [tags,,] ]
    temp_list = list(map(lambda x: x.split(','), list(group['tag'])))
    # [ tags.. ]
    temp_tag_list = list(set(list(map(lambda x : x.strip(), list([tag for sublist in temp_list for tag in sublist])))))

    dict_temp = {i : tag_idf[i.strip()] for i in temp_tag_list}
    row_to_add = pd.DataFrame(dict_temp, index = [group['movieId'].values[0]])
    tag_representation.update(row_to_add)

tag_representation = tag_representation.sort_index(0)
tag_representation






100%|██████████| 1572/1572 [06:36<00:00,  3.97it/s]
  del sys.path[0]


Unnamed: 0,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001-like,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183611,,,,,,,,,,,...,,,,,,,,,,
184471,,,,,,,,,,,...,,,,,,,,,,
187593,,,,,,,,,,,...,,,,,,,,,,
187595,,,,,,,,,,,...,,,,,,,,,,


In [None]:
## gerne + tag

movie_representation = pd.concat([genre_representation, tag_representation], axis = 1).fillna(0)

print(movie_representation.shape)
print(movie_representation.describe())

(9742, 1609)
       (no genres listed)       Action    Adventure    Animation     Children  \
count         9742.000000  9742.000000  9742.000000  9742.000000  9742.000000   
mean             0.008576     0.136354     0.115027     0.075425     0.079506   
std              0.144915     0.283726     0.298052     0.291593     0.293989   
min              0.000000     0.000000     0.000000     0.000000     0.000000   
25%              0.000000     0.000000     0.000000     0.000000     0.000000   
50%              0.000000     0.000000     0.000000     0.000000     0.000000   
75%              0.000000     0.000000     0.000000     0.000000     0.000000   
max              2.457169     0.726672     0.887245     1.202607     1.166480   

            Comedy        Crime  Documentary        Drama      Fantasy  ...  \
count  9742.000000  9742.000000  9742.000000  9742.000000  9742.000000  ...   
mean      0.159587     0.111978     0.060756     0.156257     0.087728  ...   
std       0.201476  

## Cosine 유사도

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def cos_sim_matrix(a, b):
    cos_sim = cosine_similarity(a, b)
    result_df = pd.DataFrame(data = cos_sim, index = [a.index])

    return result_df

In [None]:
cs_df = cos_sim_matrix(movie_representation, movie_representation)
cs_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
1,1.0,0.124438,0.008403,0.040571,0.011755,0.0,0.016339,0.331122,0.0,0.131794,...,0.064466,0.260941,0.071492,0.27171,0.0,0.348295,0.379492,0.0,0.232553,0.093519
2,0.124438,1.0,0.0,0.0,0.0,0.0,0.0,0.240843,0.0,0.095861,...,0.0,0.0,0.0,0.0,0.0,0.108082,0.117763,0.0,0.0,0.0
3,0.008403,0.0,1.0,0.179391,0.011294,0.0,0.072246,0.0,0.0,0.0,...,0.00656,0.0,0.068686,0.0,0.0,0.020322,0.022142,0.0,0.0,0.089849
4,0.040571,0.0,0.179391,1.0,0.05453,0.0,0.348828,0.0,0.0,0.0,...,0.031674,0.101979,0.567487,0.0,0.0,0.098119,0.106908,0.365843,0.0,0.433821
5,0.011755,0.0,0.011294,0.05453,1.0,0.0,0.640342,0.0,0.0,0.0,...,0.009177,0.0,0.096091,0.0,0.0,0.028429,0.030976,0.0,0.0,0.125697


In [None]:
print(cs_df[1].sort_values(ascending = False))

2         1.000000
46972     0.322201
158813    0.300850
119655    0.300850
80748     0.300850
            ...   
4921      0.000000
4920      0.000000
4919      0.000000
4917      0.000000
193609    0.000000
Name: 1, Length: 9742, dtype: float64


In [None]:
for id in [2,46972, 158813, 119655, 80748]:
    print(movies_df.loc[id])

title                 Jumanji (1995)
genres    Adventure|Children|Fantasy
Name: 2, dtype: object
title     Night at the Museum (2006)
genres    Action|Comedy|Fantasy|IMAX
Name: 46972, dtype: object
title     Alice Through the Looking Glass (2016)
genres                Adventure|Children|Fantasy
Name: 158813, dtype: object
title             Seventh Son (2014)
genres    Adventure|Children|Fantasy
Name: 119655, dtype: object
title     Alice in Wonderland (1933)
genres    Adventure|Children|Fantasy
Name: 80748, dtype: object


Test

In [None]:
train_df, test_df = train_test_split(ratings_df, test_size = 0.2, random_state = 918)


In [None]:
print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


In [None]:
test_userids = list(set(test_df.userId.values))
test_userids

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185

In [None]:
result_df = pd.DataFrame()

for user_id in tqdm(test_userids):
    # user의 정보 추출
    user_record_df = train_df.loc[train_df.userId == int(user_id), :]

    # (n, 9742)
    user_sim_df = cs_df.loc[user_record_df['movieId']]
    
    user_rating_df = user_record_df[['rating']] # (n,1)
    sim_sum = np.sum(user_sim_df.T.to_numpy(), -1, keepdims=True) # (9742, 1)
    prediction = np.matmul(user_sim_df.T.to_numpy(), user_rating_df.to_numpy()) / (sim_sum+1)

    prediction_df = pd.DataFrame(prediction, index = cs_df.index).reset_index()
    prediction_df.columns = ['movieId', 'pred_rating']
    prediction_df = prediction_df[['movieId', 'pred_rating']][prediction_df.movieId.isin(test_df[test_df.userId == user_id]['movieId'].values)]

    temp_df = prediction_df.merge(test_df[test_df.userId == user_id], on='movieId')
    result_df = pd.concat([result_df, temp_df], axis=0)

100%|██████████| 610/610 [00:20<00:00, 29.39it/s]


In [None]:
result_df.head(10)

In [None]:
mse = mean_squared_error(result_df['rating'].values, result_df['pred_rating'].values)
print(mse)

1.3935636206505984
