<a href="https://colab.research.google.com/github/moch1996/Vaycold/blob/main/Recommend_Algorithm/TF-IDF%EB%A5%BC_%ED%99%9C%EC%9A%A9%ED%95%9C_%EC%B6%94%EC%B2%9C_%EC%95%8C%EA%B3%A0%EB%A6%AC%EC%A6%98.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TF-IDF를 활용한 추천 알고리즘
 - 컨텐츠 기반 추천시스템 
 - 추천 성능은 RMSE로 판단

In [44]:
import os
import numpy as np
import pandas as pd
import math
from tqdm import tqdm
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

### 데이터 불러오기

In [4]:
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
path = '/content/drive/MyDrive/fastcampus/recommend/movielens'
ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding='utf-8')
movies_df = pd.read_csv(os.path.join(path, 'movies.csv'), index_col='movieId', encoding='utf-8')
tags_df = pd.read_csv(os.path.join(path, 'tags.csv'), encoding='utf-8')

In [7]:
# 장르와 태그를 이용할 예정임

In [6]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [9]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [16]:
# 무비데이터와 태그데이터를 적절히 활용하여 각 무비마다 피쳐를 만들어 사용

In [18]:
# genre를 이용한 movie representation
total_count = len(movies_df.index) # 영화는 총 몇개 있는 지 
total_genres = list(set([genre for sublist in list(map(lambda x: x.split('|'), movies_df['genres'])) for genre in sublist])) # 장르는 총 몇개 있는 지

In [19]:
print(f'전체 영화 수: {total_count}')
print(f'전체 장르: {total_genres}')


전체 영화 수: 9742
전체 장르: ['Mystery', 'Drama', 'Musical', 'Crime', 'Western', 'Action', 'Film-Noir', 'Children', '(no genres listed)', 'Animation', 'Horror', 'Adventure', 'Comedy', 'Romance', 'Fantasy', 'Thriller', 'War', 'Sci-Fi', 'IMAX', 'Documentary']


In [21]:
print(len(total_genres))

20


In [31]:
# movie df에서 각각의 장르를 가져오는데 movie df에서 각각의 장르를 가져오는데 총 몇번이 등장하는 지 보여줌 

genre_count = dict.fromkeys(total_genres)

for each_genre_list in movies_df['genres'] :
    for genre in each_genre_list.split('|') :
        if genre_count[genre] == None :
            genre_count[genre] = 1
        else :
            genre_count[genre] = genre_count[genre] + 1

In [33]:
genre_count

{'(no genres listed)': 34,
 'Action': 1828,
 'Adventure': 1263,
 'Animation': 611,
 'Children': 664,
 'Comedy': 3756,
 'Crime': 1199,
 'Documentary': 440,
 'Drama': 4361,
 'Fantasy': 779,
 'Film-Noir': 87,
 'Horror': 978,
 'IMAX': 158,
 'Musical': 334,
 'Mystery': 573,
 'Romance': 1596,
 'Sci-Fi': 980,
 'Thriller': 1894,
 'War': 382,
 'Western': 167}

In [36]:
# 장르별 가중치 계산
for each_genre in genre_count :
  genre_count[each_genre] = np.log10(total_count / genre_count[each_genre])

In [37]:
genre_count

{'(no genres listed)': 2.457169208193496,
 'Action': 0.7266719338379385,
 'Adventure': 0.8872447746804204,
 'Animation': 1.2026069149931968,
 'Children': 1.1664800458677336,
 'Comedy': 0.41392254164167785,
 'Crime': 0.9098289421369025,
 'Documentary': 1.3451954487495636,
 'Drama': 0.3490620385623247,
 'Fantasy': 1.0971106675631868,
 'Film-Noir': 2.0491288726171324,
 'Horror': 0.9983092704481497,
 'IMAX': 1.7899910382813284,
 'Musical': 1.4649016584241867,
 'Mystery': 1.2304935032683613,
 'Romance': 0.7856152382210405,
 'Sci-Fi': 0.9974220495432562,
 'Thriller': 0.7112681505684965,
 'War': 1.4065847623240424,
 'Western': 1.7659316540881678}

In [40]:
# create genre representations

genre_representation = pd.DataFrame(
    columns = sorted(total_genres),
    index = movies_df.index
    )

In [42]:
genre_representation.head()

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,
5,,,,,,,,,,,,,,,,,,,,


In [45]:
for index, each_row in tqdm(movies_df.iterrows()) :
    dict_temp = {i : genre_count[i] for i in each_row['genres'].split('|')}
    row_to_add = pd.DataFrame(dict_temp, index =[index]) # 해당되는 가중치를 가져옴
    genre_representation.update(row_to_add)

genre_representation


9742it [00:51, 188.45it/s]


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,,,0.887245,1.20261,1.16648,0.413923,,,,1.09711,,,,,,,,,,
2,,,0.887245,,1.16648,,,,,1.09711,,,,,,,,,,
3,,,,,,0.413923,,,,,,,,,,0.785615,,,,
4,,,,,,0.413923,,,0.349062,,,,,,,0.785615,,,,
5,,,,,,0.413923,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,,0.726672,,1.20261,,0.413923,,,,1.09711,,,,,,,,,,
193583,,,,1.20261,,0.413923,,,,1.09711,,,,,,,,,,
193585,,,,,,,,,0.349062,,,,,,,,,,,
193587,,0.726672,,1.20261,,,,,,,,,,,,,,,,


# Tag를 이용한 movie representation

In [46]:
tags_df.head(5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [47]:
movies_df.loc[60756]

title     Step Brothers (2008)
genres                  Comedy
Name: 60756, dtype: object

In [48]:
movies_df.loc[89774]

title     Warrior (2011)
genres             Drama
Name: 89774, dtype: object

In [49]:
tags_df['tag']

0                  funny
1        Highly quotable
2           will ferrell
3           Boxing story
4                    MMA
              ...       
3678           for katie
3679             austere
3680              gun fu
3681    heroic bloodshed
3682    Heroic Bloodshed
Name: tag, Length: 3683, dtype: object

In [None]:
# get unique tag
tag_column = list(map(lambda x : x.split(','), tags_df['tag'])) #map함수의 활용
unique_tags = list(set(list(map(lambda x: x.strip(), list([tag for sublist in tag_column for tag in sublist])))))

In [None]:
# get unique tag
tag_column = list(map(
    lambda x: x.split(','), 
    tags_df['tag']))

unique_tags = list(set(list(
    map(
    lambda x: x.strip(), 
    list([tag for sublist in tag_column 
              for tag in sublist]))))) #list comprehension 내 이중 for문이라 해석하기 난해하다. 
                                      # 해석을해보자면 tag를 리스트로 바꾼 후 그것을 x.strip()함수로 먹인 것을 리스트로 하고 다시 셋을 만들어서 리스트를 먹인다는 말인데...
                                      # 두번이나 먹여야 되나..?

print(unique_tags)