### Import thư viện

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Bước 1: Đọc dữ liệu từ hai file CSV
ratings_df = pd.read_csv('ratings.csv')  # Dữ liệu đánh giá phim của người dùng
movies_df = pd.read_csv('movies.csv')    # Dữ liệu thông tin phim

In [6]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
movies_df.head(20)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Bước 2: Xử lý cột 'genres' (thể loại phim) thành văn bản có thể vector hóa
# Thay ký tự '|' bằng khoảng trắng để mỗi thể loại là một từ
movies_df['processed_genres'] = movies_df['genres'].str.replace('|', ' ', regex=False)


In [None]:
movies_df.head()

Unnamed: 0,movieId,title,genres,processed_genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy,Comedy


In [13]:
# Bước 3: Vector hóa thể loại bằng TF-IDF
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(movies_df['processed_genres'])
print(tfidf_matrix)

  (0, 1)	0.41684567364693936
  (0, 2)	0.5162254711770092
  (0, 3)	0.5048454681396087
  (0, 4)	0.26758647689140014
  (0, 8)	0.482990142708577
  (1, 1)	0.5123612074824269
  (1, 3)	0.6205251727456431
  (1, 8)	0.5936619434123594
  (2, 4)	0.5709154064399099
  (2, 19)	0.8210088907493954
  (3, 4)	0.5050154397005037
  (3, 19)	0.726240982959826
  (3, 7)	0.46640480307738325
  (4, 4)	1.0
  (5, 0)	0.5493281743985543
  (5, 5)	0.6359470441562757
  (5, 21)	0.5420423542868653
  (6, 4)	0.5709154064399099
  (6, 19)	0.8210088907493954
  (7, 1)	0.6366993258087036
  (7, 3)	0.7711121633813997
  (8, 0)	1.0
  (9, 1)	0.6295217016667962
  (9, 0)	0.5530653284926609
  (9, 21)	0.5457299419583338
  :	:
  (9731, 0)	0.41272965170024634
  (9731, 20)	0.508925697730817
  (9731, 9)	0.508925697730817
  (9732, 2)	0.5502833875552382
  (9732, 4)	0.28524046407869114
  (9732, 0)	0.39038039438445316
  (9732, 20)	0.48136743695078876
  (9732, 9)	0.48136743695078876
  (9733, 2)	0.9019723811117537
  (9733, 7)	0.43179372819853856
  

In [14]:
# Bước 4: Kết hợp bảng đánh giá và bảng phim theo movieId
ratings_movies_df = pd.merge(ratings_df, movies_df, on='movieId')
ratings_movies_df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,processed_genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,Comedy Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,Action Crime Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,Mystery Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,Crime Mystery Thriller
...,...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split (2017),Drama|Horror|Thriller,Drama Horror Thriller
100832,610,168248,5.0,1493850091,John Wick: Chapter Two (2017),Action|Crime|Thriller,Action Crime Thriller
100833,610,168250,5.0,1494273047,Get Out (2017),Horror,Horror
100834,610,168252,5.0,1493846352,Logan (2017),Action|Sci-Fi,Action Sci-Fi


In [16]:
# Bước 5: Chọn một người dùng cụ thể để gợi ý (ví dụ: người dùng có userId = 1)
user_id = 1
user_ratings = ratings_movies_df[ratings_movies_df['userId'] == user_id]
user_ratings

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,processed_genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,Comedy Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,Action Crime Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,Mystery Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,Crime Mystery Thriller
...,...,...,...,...,...,...,...
227,1,3744,4.0,964980694,Shaft (2000),Action|Crime|Thriller,Action Crime Thriller
228,1,3793,5.0,964981855,X-Men (2000),Action|Adventure|Sci-Fi,Action Adventure Sci-Fi
229,1,3809,4.0,964981220,What About Bob? (1991),Comedy,Comedy
230,1,4006,4.0,964982903,Transformers: The Movie (1986),Adventure|Animation|Children|Sci-Fi,Adventure Animation Children Sci-Fi


In [18]:
# Bước 6: Lọc các phim mà người dùng đánh giá từ 4.0 trở lên (xem như người dùng thích)
liked_movies = user_ratings[user_ratings['rating'] >= 4.0]

# Bước 7: Lấy chỉ số các phim mà người dùng này thích
liked_movie_indices = movies_df[movies_df['movieId'].isin(liked_movies['movieId'])].index
liked_movie_indices

Index([   0,    2,    5,   43,   46,   89,   97,  124,  130,  136,
       ...
       2733, 2764, 2765, 2788, 2798, 2802, 2836, 2847, 2991, 3673],
      dtype='int64', length=200)

In [25]:
# Bước 8: Tính hồ sơ người dùng bằng trung bình TF-IDF của các phim đã thích
user_profile = np.asarray(tfidf_matrix[liked_movie_indices].mean(axis=0))
user_profile

array([[0.18861407, 0.19786614, 0.07235146, 0.09884635, 0.17273064,
        0.12285831, 0.        , 0.12941243, 0.12326354, 0.08396007,
        0.00290702, 0.        , 0.03234403, 0.        , 0.        ,
        0.06375878, 0.04341391, 0.        , 0.00290702, 0.07236682,
        0.08396007, 0.10899999, 0.08006397, 0.02486895]])

In [26]:
# Bước 9: Tính độ tương đồng cosine giữa hồ sơ người dùng và tất cả các phim
similarities = cosine_similarity(user_profile, tfidf_matrix).flatten()
similarities

array([0.59851505, 0.51249327, 0.343328  , ..., 0.2811578 , 0.36530396,
       0.3752697 ])

In [27]:
# Bước 10: Gán điểm tương đồng vào bảng phim
movies_df['similarity'] = similarities
movies_df

Unnamed: 0,movieId,title,genres,processed_genres,similarity
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy,0.598515
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure Children Fantasy,0.512493
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy Romance,0.343328
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy Drama Romance,0.434831
4,5,Father of the Bride Part II (1995),Comedy,Comedy,0.375270
...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,Action Animation Comedy Fantasy,0.548823
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,Animation Comedy Fantasy,0.411311
9739,193585,Flint (2017),Drama,Drama,0.281158
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,Action Animation,0.365304


In [29]:
# Bước 11: Loại bỏ các phim mà người dùng đã xem để không gợi ý lại
recommended_df = movies_df[~movies_df['movieId'].isin(liked_movies['movieId'])]
recommended_df

Unnamed: 0,movieId,title,genres,processed_genres,similarity
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure Children Fantasy,0.512493
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy Drama Romance,0.434831
4,5,Father of the Bride Part II (1995),Comedy,Comedy,0.375270
6,7,Sabrina (1995),Comedy|Romance,Comedy Romance,0.343328
7,8,Tom and Huck (1995),Adventure|Children,Adventure Children,0.439300
...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,Action Animation Comedy Fantasy,0.548823
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,Animation Comedy Fantasy,0.411311
9739,193585,Flint (2017),Drama,Drama,0.281158
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,Action Animation,0.365304


In [31]:
# Bước 12: Sắp xếp và chọn ra 10 phim có độ tương đồng cao nhất
top_recommendations = recommended_df.sort_values(by='similarity', ascending=False).head(10)
top_recommendations

Unnamed: 0,movieId,title,genres,processed_genres,similarity
8597,117646,Dragonheart 2: A New Beginning (2000),Action|Adventure|Comedy|Drama|Fantasy|Thriller,Action Adventure Comedy Drama Fantasy Thriller,0.796171
4681,6990,The Great Train Robbery (1978),Action|Adventure|Comedy|Crime|Drama,Action Adventure Comedy Crime Drama,0.777466
4005,5657,Flashback (1990),Action|Adventure|Comedy|Crime|Drama,Action Adventure Comedy Crime Drama,0.777466
6570,55116,"Hunting Party, The (2007)",Action|Adventure|Comedy|Drama|Thriller,Action Adventure Comedy Drama Thriller,0.773731
3608,4956,"Stunt Man, The (1980)",Action|Adventure|Comedy|Drama|Romance|Thriller,Action Adventure Comedy Drama Romance Thriller,0.760834
5471,26184,"Diamond Arm, The (Brilliantovaya ruka) (1968)",Action|Adventure|Comedy|Crime|Thriller,Action Adventure Comedy Crime Thriller,0.75663
7409,80219,Machete (2010),Action|Adventure|Comedy|Crime|Thriller,Action Adventure Comedy Crime Thriller,0.75663
4409,6503,Charlie's Angels: Full Throttle (2003),Action|Adventure|Comedy|Crime|Thriller,Action Adventure Comedy Crime Thriller,0.75663
5379,8968,After the Sunset (2004),Action|Adventure|Comedy|Crime|Thriller,Action Adventure Comedy Crime Thriller,0.75663
6774,60074,Hancock (2008),Action|Adventure|Comedy|Crime|Fantasy,Action Adventure Comedy Crime Fantasy,0.755073


In [32]:
# Bước 13: In ra danh sách 10 phim được gợi ý
print(top_recommendations[['title', 'genres', 'similarity']])

                                              title  \
8597          Dragonheart 2: A New Beginning (2000)   
4681                 The Great Train Robbery (1978)   
4005                               Flashback (1990)   
6570                      Hunting Party, The (2007)   
3608                          Stunt Man, The (1980)   
5471  Diamond Arm, The (Brilliantovaya ruka) (1968)   
7409                                 Machete (2010)   
4409         Charlie's Angels: Full Throttle (2003)   
5379                        After the Sunset (2004)   
6774                                 Hancock (2008)   

                                              genres  similarity  
8597  Action|Adventure|Comedy|Drama|Fantasy|Thriller    0.796171  
4681             Action|Adventure|Comedy|Crime|Drama    0.777466  
4005             Action|Adventure|Comedy|Crime|Drama    0.777466  
6570          Action|Adventure|Comedy|Drama|Thriller    0.773731  
3608  Action|Adventure|Comedy|Drama|Romance|Thriller    0.7