In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

## Dataset 불러오기

In [3]:
path = '/content/drive/MyDrive/data/movielens/'

ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding='utf-8')
movies_df = pd.read_csv(os.path.join(path, 'movies.csv'), index_col='movieId', encoding='utf-8')
tags_df = pd.read_csv(os.path.join(path, 'tags.csv'), encoding='utf-8')

## Genres를 이용한 movie representation

In [4]:
total_count = len(movies_df.index)
total_genres = list(set([genres for sublist in list(map(lambda x: x.split('|'), movies_df['genres'])) for genres in sublist]))

In [5]:
print(f'전체 영화 수: {total_count}')
print(f'장르: {total_genres}')

전체 영화 수: 9742
장르: ['Horror', 'Fantasy', 'Children', 'Documentary', 'War', 'Film-Noir', 'Mystery', 'Thriller', 'IMAX', 'Action', 'Sci-Fi', '(no genres listed)', 'Comedy', 'Romance', 'Western', 'Crime', 'Animation', 'Drama', 'Adventure', 'Musical']


In [6]:
print(len(total_genres))

20


In [7]:
genres_count  = dict.fromkeys(total_genres)

for each_genres_list in movies_df['genres']:
  for genres in each_genres_list.split('|'):
    if genres_count[genres] == None:
      genres_count[genres] = 1
    else:
      genres_count[genres] += 1

In [8]:
genres_count

{'Horror': 978,
 'Fantasy': 779,
 'Children': 664,
 'Documentary': 440,
 'War': 382,
 'Film-Noir': 87,
 'Mystery': 573,
 'Thriller': 1894,
 'IMAX': 158,
 'Action': 1828,
 'Sci-Fi': 980,
 '(no genres listed)': 34,
 'Comedy': 3756,
 'Romance': 1596,
 'Western': 167,
 'Crime': 1199,
 'Animation': 611,
 'Drama': 4361,
 'Adventure': 1263,
 'Musical': 334}

In [9]:
# 장르별 가중치 계산 IDF
for each_genres in genres_count:
  genres_count[each_genres] = np.log10(total_count/genres_count[each_genres])

In [10]:
genres_count

{'Horror': 0.9983092704481497,
 'Fantasy': 1.0971106675631868,
 'Children': 1.1664800458677336,
 'Documentary': 1.3451954487495636,
 'War': 1.4065847623240424,
 'Film-Noir': 2.0491288726171324,
 'Mystery': 1.2304935032683613,
 'Thriller': 0.7112681505684965,
 'IMAX': 1.7899910382813284,
 'Action': 0.7266719338379385,
 'Sci-Fi': 0.9974220495432562,
 '(no genres listed)': 2.457169208193496,
 'Comedy': 0.41392254164167785,
 'Romance': 0.7856152382210405,
 'Western': 1.7659316540881678,
 'Crime': 0.9098289421369025,
 'Animation': 1.2026069149931968,
 'Drama': 0.3490620385623247,
 'Adventure': 0.8872447746804204,
 'Musical': 1.4649016584241867}

In [16]:
dict_temp = {i: genres_count[i] for i in movies_df.iloc[0,:]['genres'].split('|')}
dict_temp

{'Adventure': 0.8872447746804204,
 'Animation': 1.2026069149931968,
 'Children': 1.1664800458677336,
 'Comedy': 0.41392254164167785,
 'Fantasy': 1.0971106675631868}

In [11]:
from operator import ge
# create genres representations
genres_representation = pd.DataFrame(columns=sorted(total_genres), index=movies_df.index)
for index, each_row in tqdm(movies_df.iterrows()):
  dict_temp = {i: genres_count[i] for i in each_row['genres'].split('|')}
  row_to_add = pd.DataFrame(dict_temp, index=[index])
  genres_representation.update(row_to_add)

genres_representation

9742it [00:58, 166.23it/s]


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,,,0.887245,1.202607,1.16648,0.413923,,,,1.097111,,,,,,,,,,
2,,,0.887245,,1.16648,,,,,1.097111,,,,,,,,,,
3,,,,,,0.413923,,,,,,,,,,0.785615,,,,
4,,,,,,0.413923,,,0.349062,,,,,,,0.785615,,,,
5,,,,,,0.413923,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,,0.726672,,1.202607,,0.413923,,,,1.097111,,,,,,,,,,
193583,,,,1.202607,,0.413923,,,,1.097111,,,,,,,,,,
193585,,,,,,,,,0.349062,,,,,,,,,,,
193587,,0.726672,,1.202607,,,,,,,,,,,,,,,,


## Tag를 이용한 Movie Representation

In [17]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [18]:
# get unique tag
tag_column = list(map(lambda x: x.split(','), tags_df['tag']))
unique_tags = list(set(list(map(lambda x: x.strip(), list([tag for sublist in tag_column for tag in sublist])))))

print(unique_tags)

['entirely dialogue', 'end of the world', 'Girl Power', 'best comedy', 'invisibility', 'Wall Street', 'Horrible directing', 'drama', 'Hammett', 'love story', 'Emilia Clarke', 'awesome', 'family', 'lawyer', 'androids', 'women', 'organised crime', 'six-fingered man', 'political right versus left', 'violence', 'alcoholism', 'rug', 'fatherhood', 'funny', 'melancholy', 'Adrien Brody', 'rabbi', 'Rita Hayworth can dance!', 'Vietnam', 'oldie but goodie', 'road trip', 'wizards', 'gritty', 'Olympics', 'unpredictable', 'silly', 'blind', 'Jaime Pressly', 'David Thewlis', 'Julianne Moore', 'Homeless', 'unusual', 'James Cameron', 'vampires', 'space craft', 'Johnny Depp', 'contemplative', 'not seen', 'schizophrenia', 'whales', 'Lolita theme', 'heartbreaking', 'beat poetry', 'Deep Throat', 'artistic', 'Andrew Lloyd Weber', 'building a family', 'mice', 'sentimental', 'teenagers', 'biopic', 'alternate endings', 'virginity', 'future', '1970s', 'George Bernard Shaw', 'the catholic church is the most corru

In [19]:
print(len(tag_column))
print(len(unique_tags))

3683
1589


In [22]:
# Compute IDF for tag
total_movie_count = len(set(tags_df['movieId']))
# key : tag, value: number of movies with such tag
tag_count_dict = dict.fromkeys(unique_tags)

for each_movie_tag_list in tags_df['tag']:
  for tag in each_movie_tag_list.split(','):
    if tag_count_dict[tag.strip()] == None:
      tag_count_dict[tag.strip()] = 1
    else:
      tag_count_dict[tag.strip()] += 1

tag_idf = dict()
for each_tag in tag_count_dict:
  tag_idf[each_tag] = np.log10(total_movie_count/tag_count_dict[each_tag])

tag_idf

{'entirely dialogue': 3.196452541703389,
 'end of the world': 3.196452541703389,
 'Girl Power': 2.7193312869837265,
 'best comedy': 3.196452541703389,
 'invisibility': 3.196452541703389,
 'Wall Street': 2.895422546039408,
 'Horrible directing': 3.196452541703389,
 'drama': 2.7193312869837265,
 'Hammett': 3.196452541703389,
 'love story': 2.7193312869837265,
 'Emilia Clarke': 3.196452541703389,
 'awesome': 2.895422546039408,
 'family': 2.351354501689132,
 'lawyer': 2.895422546039408,
 'androids': 3.196452541703389,
 'women': 3.196452541703389,
 'organised crime': 3.196452541703389,
 'six-fingered man': 3.196452541703389,
 'political right versus left': 3.196452541703389,
 'violence': 2.2933625547114453,
 'alcoholism': 2.4974825373673704,
 'rug': 3.196452541703389,
 'fatherhood': 2.5943925503754266,
 'funny': 1.8347247056857963,
 'melancholy': 2.5943925503754266,
 'Adrien Brody': 3.196452541703389,
 'rabbi': 3.196452541703389,
 'Rita Hayworth can dance!': 3.196452541703389,
 'Vietnam': 2

In [24]:
len(tag_idf.keys())

1589

In [None]:
# Create movie representations
tag_representation = pd.DataFrame(columns=sorted(unique_tags), index=list(set(tags_df['movieId'])))

for name, group in tqdm(tags_df.groupby(by='movieId')):
  temp_list = list(map(lambda x: x.split(','), list(group['tag'])))
  