**아이템 기반 협업 필터링 추천 시스템**

평점에 대한 full matrix 생성  
피어슨 상관계수를 통해 각 영화별 유사도를 파악  

In [None]:
# Mount Google Drive at /content/drive inside the Colab runtime.
# NOTE(review): the data paths used by this notebook are relative
# ('drive/MyDrive/...'), so they presumably rely on the working directory
# being /content -- confirm before running outside the default Colab setup.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Folder holding the ml-100k data files; every table path is built from it.
data_dir = 'drive/MyDrive/Notebooks/ml-100k'

path_rating = data_dir + '/u.data'  # rating records
path_movie = data_dir + '/u.item'   # movie records
path_user = data_dir + '/u.user'    # user records

In [None]:
def parse(s):
    """Turn a pipe-separated header description into column names.

    Each field is separated by ``|``. Whitespace around a field is dropped
    and whitespace inside a field is replaced by a single underscore, so
    ``'user id | zip code'`` becomes ``['user_id', 'zip_code']``.

    Unlike a split on the exact string ``' | '``, this tolerates missing or
    extra spaces around the separator (``'a|b'``, ``'a  |  b'``).

    Args:
        s: Header text with fields separated by ``|``.

    Returns:
        List of column names, one per field, in the original order.
    """
    # str.split() with no argument both strips the field and collapses
    # internal runs of whitespace, so the join yields a clean identifier.
    return ['_'.join(field.split()) for field in s.split('|')]


parse('user id | age | gender | occupation | zip code')

['user_id', 'age', 'gender', 'occupation', 'zip_code']

In [None]:
# Column names for the user table, derived from its pipe-separated description.
user_col = parse('user id | age | gender | occupation | zip code')

# The names are supplied explicitly; header=None makes the first line of the
# file be read as data rather than as a header.
users = pd.read_csv(
    path_user,
    delimiter='|',
    header=None,
    names=user_col,
)
users.head(n=5)

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [None]:
# Full column list of the movie file: id, title, dates, URL and the genre flags.
movie_col = parse(
    "movie id | movie title | release date | video release date | IMDb URL | "
    "unknown | Action | Adventure | Animation | Children's | Comedy | Crime | "
    "Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | "
    "Romance | Sci-Fi | Thriller | War | Western"
)

# The file is decoded as ISO-8859-1; names are supplied explicitly and
# header=None makes the first line be read as data.
all_movie_columns = pd.read_csv(
    path_movie,
    delimiter='|',
    header=None,
    names=movie_col,
    encoding='ISO-8859-1',
)

# Only the id and the title are used from here on.
kept_columns = ['movie_id', 'movie_title']
movies = all_movie_columns[kept_columns]
movies.head(n=5)

Unnamed: 0,movie_id,movie_title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [None]:
# Column names for the rating table, derived from its description.
rating_col = parse('user id | movie id | rating | timestamp')

# The rating file is tab-separated; names are supplied explicitly and
# header=None makes the first line be read as data.
ratings = pd.read_csv(
    path_rating,
    delimiter='\t',
    header=None,
    names=rating_col,
    encoding='ISO-8859-1',
)
ratings.head(n=5)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [None]:
# Show the first rows of each loaded table, in load order.
for table in (users, movies, ratings):
    display(table.head())

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


Unnamed: 0,movie_id,movie_title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [None]:
# Add a "number of ratings" column to the movies table.

# Index movies by movie_id so the per-movie counts align on it. The guard
# keeps the cell re-runnable: once movie_id is the index, set_index would
# raise KeyError because the column no longer exists.
if movies.index.name != "movie_id":
    movies = movies.set_index("movie_id")

# Count ratings per movie as a plain Series (no DataFrame wrapper needed for
# a column assignment). Reindexing onto the movies index with fill_value=0
# gives movies without any rating a count of 0 instead of NaN, which also
# keeps the column integer-typed.
rating_counts = ratings.groupby("movie_id")["rating"].count()
movies["#_of_ratings"] = rating_counts.reindex(movies.index, fill_value=0)
display(movies)

Unnamed: 0_level_0,movie_title,#_of_ratings
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),452
2,GoldenEye (1995),131
3,Four Rooms (1995),90
4,Get Shorty (1995),209
5,Copycat (1995),86
...,...,...
1678,Mat' i syn (1997),1
1679,B. Monkey (1998),1
1680,Sliding Doors (1998),1
1681,You So Crazy (1994),1


In [None]:
# Attach the columns of the movies table to every rating row, matching on
# movie_id. An inner join (the default) keeps only ratings whose movie_id is
# present in movies.
ratings = ratings.merge(movies, how="inner", on="movie_id")
ratings

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title,#_of_ratings
0,196,242,3,881250949,Kolya (1996),117
1,63,242,3,875747190,Kolya (1996),117
2,226,242,5,883888671,Kolya (1996),117
3,154,242,3,879138235,Kolya (1996),117
4,306,242,5,876503793,Kolya (1996),117
...,...,...,...,...,...,...
99995,840,1674,4,891211682,Mamma Roma (1962),1
99996,655,1640,3,888474646,"Eighth Day, The (1996)",1
99997,655,1637,3,888984255,Girls Town (1996),1
99998,655,1630,3,887428735,"Silence of the Palace, The (Saimt el Qusur) (1...",1


In [None]:
# Build the full rating matrix: one row per user, one column per movie title,
# cells holding the rating (NaN where the user did not rate the movie).
# If several rows fall into the same (user, title) cell they are averaged,
# which is pivot_table's default aggregation.
rating_matrix = pd.pivot_table(
    ratings,
    index="user_id",
    columns="movie_title",
    values="rating",
    aggfunc="mean",
)
rating_matrix.head()

movie_title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",8 1/2 (1963),8 Heads in a Duffel Bag (1997),8 Seconds (1994),A Chef in Love (1996),Above the Rim (1994),Absolute Power (1997),"Abyss, The (1989)",Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),Across the Sea of Time (1995),Addams Family Values (1993),Addicted to Love (1997),"Addiction, The (1995)","Adventures of Pinocchio, The (1996)","Adventures of Priscilla, Queen of the Desert, The (1994)","Adventures of Robin Hood, The (1938)","Affair to Remember, An (1957)","African Queen, The (1951)",Afterglow (1997),"Age of Innocence, The (1993)",Aiqing wansui (1994),Air Bud (1997),Air Force One (1997),"Air Up There, The (1994)",Airheads (1994),Akira (1988),Aladdin (1992),Aladdin and the King of Thieves (1996),Alaska (1996),Albino Alligator (1996),...,"Whole Wide World, The (1996)",Widows' Peak (1994),"Wife, The (1995)",Wild America (1997),Wild Bill (1995),"Wild Bunch, The (1969)",Wild Reeds (1994),Wild Things (1998),William Shakespeare's Romeo and Juliet (1996),Willy Wonka and the Chocolate Factory (1971),Window to Paris (1994),Wings of Courage (1995),Wings of Desire (1987),"Wings of the Dove, The (1997)",Winnie the Pooh and the Blustery Day (1968),"Winter Guest, The (1997)",Wishmaster (1997),With Honors (1994),Withnail and I (1987),Witness (1985),"Wizard of Oz, The (1939)",Wolf (1994),"Woman in Question, The (1950)","Women, The (1939)","Wonderful, Horrible Life of Leni Riefenstahl, The (1993)",Wonderland (1997),"Wooden Man's Bride, The (Wu Kui) (1994)","World of Apu, The (Apur Sansar) (1959)","Wrong Trousers, The (1993)",Wyatt Earp (1994),Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's 
Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,,,2.0,5.0,,,3.0,4.0,,,,,,,,,3.0,3.0,,,,,,,,,,,,,,1.0,,,,4.0,4.0,,,,...,,,,,,,,,,4.0,,,,,,,,,,,4.0,,,,,,,,5.0,,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,,,,,,3.0,,,,,,,,,,,,,,,,,4.0,,,,,,,,...,,,,,,,,,,,,,,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,,,,,,,,,,,,,,
5,,,2.0,,,,,4.0,,,,,,,,,,,1.0,,2.0,,,,5.0,,,,,3.0,,,,,,,4.0,4.0,,,...,,,,,,,,,1.0,3.0,,,,,,,,,,,,,,,,,,,5.0,,,,,4.0,,,,,4.0,


In [None]:
# Per-user ratings of the reference movie (NaN for users who did not rate it).
toystory_ratings = rating_matrix.loc[:, "Toy Story (1995)"]
print(toystory_ratings)

user_id
1      5.0
2      4.0
3      NaN
4      NaN
5      4.0
      ... 
939    NaN
940    NaN
941    5.0
942    NaN
943    NaN
Name: Toy Story (1995), Length: 943, dtype: float64


In [None]:
# Correlate every movie's rating column with Toy Story's ratings (similarity
# of rating patterns). corrwith correlates each column of the matrix with the
# given Series, whereas corr would compute all column-vs-column pairs.
similarity = rating_matrix.corrwith(toystory_ratings)

# A coefficient near 1 means the ratings move in the same direction, near -1
# in the opposite direction, and 0 means no linear relationship.
corr_toystory = similarity.to_frame("Correlation")
display(corr_toystory)

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


Unnamed: 0_level_0,Correlation
movie_title,Unnamed: 1_level_1
'Til There Was You (1997),0.534522
1-900 (1994),
101 Dalmatians (1996),0.232118
12 Angry Men (1957),0.334943
187 (1997),0.651857
...,...
Young Guns II (1990),0.146312
"Young Poisoner's Handbook, The (1995)",-0.026402
Zeus and Roxanne (1997),0.447914
unknown,0.440959


In [None]:
# Add the number of ratings behind each correlation.
#
# The correlations are keyed by movie title, so the counts must be keyed by
# title as well. Merging with the movie_id-keyed movies table duplicates rows
# for titles that appear under several movie ids, and each duplicate carries
# only that id's count. Counting the non-missing cells of each rating_matrix
# column gives exactly one count per title, consistent with the data the
# correlation was computed from.
rating_counts = rating_matrix.count().rename("#_of_ratings")

# Both sides are indexed by movie_title; join on that index, then turn the
# title back into a regular column so the result has the columns
# movie_title, Correlation, #_of_ratings.
corr_toystory = corr_toystory.join(rating_counts).reset_index()
display(corr_toystory)

Unnamed: 0,movie_title,Correlation,#_of_ratings
0,'Til There Was You (1997),0.534522,9
1,1-900 (1994),,5
2,101 Dalmatians (1996),0.232118,109
3,12 Angry Men (1957),0.334943,125
4,187 (1997),0.651857,41
...,...,...,...
1677,Young Guns II (1990),0.146312,44
1678,"Young Poisoner's Handbook, The (1995)",-0.026402,41
1679,Zeus and Roxanne (1997),0.447914,6
1680,unknown,0.440959,9


In [None]:
# Keep only movies with strictly more than 50 ratings.
min_ratings = 50
has_enough_ratings = corr_toystory["#_of_ratings"] > min_ratings
corr_toystory = corr_toystory.loc[has_enough_ratings]
display(corr_toystory)

Unnamed: 0,movie_title,Correlation,#_of_ratings
2,101 Dalmatians (1996),0.232118,109
3,12 Angry Men (1957),0.334943,125
5,2 Days in the Valley (1996),0.162728,93
6,"20,000 Leagues Under the Sea (1954)",0.328472,72
7,2001: A Space Odyssey (1968),-0.069060,259
...,...,...,...
1662,"Wizard of Oz, The (1939)",0.352698,246
1663,Wolf (1994),0.303789,67
1670,"Wrong Trousers, The (1993)",0.188517,118
1675,Young Frankenstein (1974),0.239244,200


In [None]:
# Recommend the 5 movies most correlated with the reference movie.
#
# The reference movie correlates perfectly (1.0) with itself and would
# otherwise take the first recommendation slot, so it is removed before
# ranking. Its title is taken from the name of the ratings Series used for
# the correlation, so the two cannot drift apart.
reference_title = toystory_ratings.name
is_other_movie = corr_toystory["movie_title"] != reference_title

# sort_values places NaN correlations last, so they never reach the top 5.
corr_toystory = (
    corr_toystory[is_other_movie]
    .sort_values(by="Correlation", ascending=False)
    .head(5)
)
display(corr_toystory)

Unnamed: 0,movie_title,Correlation,#_of_ratings
1540,Toy Story (1995),1.0,452
1221,Raise the Red Lantern (1991),0.641535,58
543,Flubber (1997),0.558389,53
777,"Jackal, The (1997)",0.557876,87
351,"Craft, The (1996)",0.5491,104
