In [None]:
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Load the movie catalogue and the user ratings; both paths are relative to the
# notebook's working directory.
movies, ratings = (
    pd.read_csv(csv_path) for csv_path in ("movies_small.csv", "ratings_small.csv")
)

In [None]:
# Join every rating with its movie's metadata through the shared movieId key.
data = pd.merge(ratings, movies, on='movieId')

# Build the user x title rating matrix. pivot_table averages the ratings when a
# user has several rows for the same title; unrated cells become 0.
# Rebinding (instead of fillna(inplace=True)) keeps the cell idempotent on re-run.
user_item_matrix = data.pivot_table(index='userId', columns='title', values='rating').fillna(0)

# Project users onto 50 latent components and compare them with cosine
# similarity. TruncatedSVD uses a randomized solver by default, so the seed is
# pinned to make the similarity matrix (and the recommendations) reproducible.
svd = TruncatedSVD(n_components=50, random_state=42)
matrix_svd = svd.fit_transform(user_item_matrix)
user_similarity = cosine_similarity(matrix_svd)

def find_similar_users(user_id, user_similarity_matrix, k=10):
    """Return the row positions of the ``k`` users most similar to ``user_id``.

    Args:
        user_id: 1-based user id. It is mapped to row ``user_id - 1`` of the
            similarity matrix, so ids are assumed to be contiguous and to start
            at 1 (NOTE(review): confirm this holds for the ratings file).
        user_similarity_matrix: square, indexable matrix where entry ``[i][j]``
            is the similarity between the users at row positions ``i`` and ``j``.
        k: maximum number of neighbours to return.

    Returns:
        A list of 0-based row positions (not user ids), most similar first.
        The queried user is never part of the result.

    Raises:
        IndexError: if ``user_id`` does not map to a row of the matrix.
    """
    user_idx = user_id - 1
    n_users = len(user_similarity_matrix)
    # Reject ids <= 0 explicitly: a negative index would otherwise silently
    # wrap around and return neighbours of an unrelated user.
    if not 0 <= user_idx < n_users:
        raise IndexError(
            f"user_id {user_id} is outside the similarity matrix (1..{n_users})"
        )

    similarity_row = user_similarity_matrix[user_idx]
    # Exclude the user by position instead of dropping the first sorted entry:
    # with tied scores (e.g. a duplicate user) the queried user is not
    # guaranteed to sort first, and would be returned as their own neighbour.
    candidates = (idx for idx in range(len(similarity_row)) if idx != user_idx)
    ranked = sorted(candidates, key=lambda idx: similarity_row[idx], reverse=True)
    return ranked[:max(k, 0)]

def recommend_movies(user_id, user_item_matrix, movies, user_similarity_matrix, k=10,
                     n_recommendations=10):
    """Recommend titles the user has not rated, based on their nearest neighbours.

    Each title is scored by the mean rating given by the ``k`` most similar
    users (unrated cells count as 0, as stored in ``user_item_matrix``), and the
    best-scoring titles the user has not rated are returned.

    Args:
        user_id: 1-based user id. Like ``find_similar_users``, this function
            maps it to row ``user_id - 1`` (NOTE(review): assumes contiguous
            ids starting at 1 - confirm against the ratings file).
        user_item_matrix: user x title DataFrame with 0 for unrated titles.
        movies: DataFrame with ``movieId``, ``title`` and ``genres`` columns.
        user_similarity_matrix: user x user similarity matrix whose rows are in
            the same order as ``user_item_matrix``.
        k: number of neighbours to average over.
        n_recommendations: maximum number of distinct titles to recommend.

    Returns:
        DataFrame with columns ``movieId``, ``title``, ``genres``, ordered from
        the strongest recommendation to the weakest. If several catalogue rows
        share a recommended title, all of them are returned.
    """
    similar_users = find_similar_users(user_id, user_similarity_matrix, k)
    similar_users_ratings = user_item_matrix.iloc[similar_users].mean(axis=0)
    user_ratings = user_item_matrix.iloc[user_id - 1]
    unrated_titles = user_ratings[user_ratings == 0].index
    recommendations = (
        similar_users_ratings[unrated_titles]
        .sort_values(ascending=False)
        .head(n_recommendations)
    )

    # Look up the movie details for the recommended titles. A plain isin()
    # filter returns rows in catalogue order, which throws away the ranking, so
    # the matches are re-sorted by their position in the ranked list.
    rank_by_title = {title: pos for pos, title in enumerate(recommendations.index)}
    recommended_movies = movies[movies['title'].isin(list(rank_by_title))]
    order = recommended_movies['title'].map(rank_by_title).to_numpy().argsort(kind='stable')

    return recommended_movies.iloc[order][['movieId', 'title', 'genres']]

# Demo: recommend titles for user 1 from their 5 most similar users.
user_id = 1
recommended_movies = recommend_movies(
    user_id=user_id,
    user_item_matrix=user_item_matrix,
    movies=movies,
    user_similarity_matrix=user_similarity,
    k=5,
)
recommended_movies  # last expression of the cell -> rendered as its output

Unnamed: 0,movieId,title,genres
31,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
507,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi
793,1036,Die Hard (1988),Action|Crime|Thriller
902,1200,Aliens (1986),Action|Adventure|Horror|Sci-Fi
916,1215,Army of Darkness (1993),Action|Adventure|Comedy|Fantasy|Horror
1067,1387,Jaws (1975),Action|Horror
1211,1610,"Hunt for Red October, The (1990)",Action|Adventure|Thriller
1404,1923,There's Something About Mary (1998),Comedy|Romance
2078,2762,"Sixth Sense, The (1999)",Drama|Horror|Mystery
2393,3175,Galaxy Quest (1999),Adventure|Comedy|Sci-Fi


In [None]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357247 sha256=abaf23cb7b5c77fcbb4a39f5f1aff7c130a11dea551471f0a27b6cc3ea174147
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully inst

In [None]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Take the rating scale from the data instead of hard-coding (1, 5): Surprise
# clips its estimates to this range, so the scale has to cover every rating
# that actually occurs (the outputs show half-star ratings such as 1.5).
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Hold out 20% of the ratings for evaluation. The split is seeded so that the
# reported metrics are reproducible across kernel restarts.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Use SVD with regularization; the seed fixes the random factor initialisation.
algo = SVD(n_factors=100, reg_all=0.1, random_state=42)
algo.fit(trainset)

predictions = algo.test(testset)

# accuracy.rmse / accuracy.mae print the metric and also return it.
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

def recommend_movies(algo, user_id, movie_df, ratings_df, n=10):
    """Return the ``n`` movies with the highest predicted rating for ``user_id``.

    Args:
        algo: fitted model exposing ``predict(user_id, movie_id)`` whose result
            has ``iid`` (the movie id) and ``est`` (the estimated rating).
        user_id: id of the user to recommend for.
        movie_df: DataFrame with a ``movieId`` column (the candidate catalogue).
        ratings_df: DataFrame with ``userId`` and ``movieId`` columns; movies the
            user already rated here are excluded from the candidates.
        n: maximum number of movies to return.

    Returns:
        The matching rows of ``movie_df`` (all columns, original index labels),
        ordered from the highest predicted rating to the lowest.
    """
    # A set makes the "already rated?" check O(1) per movie instead of a linear
    # scan over an array for every candidate.
    rated_movie_ids = set(ratings_df.loc[ratings_df['userId'] == user_id, 'movieId'])
    unrated_movie_ids = [
        movie_id for movie_id in movie_df['movieId'].unique()
        if movie_id not in rated_movie_ids
    ]

    predictions = [algo.predict(user_id, movie_id) for movie_id in unrated_movie_ids]
    top_predictions = sorted(predictions, key=lambda pred: pred.est, reverse=True)[:n]

    # isin() alone would return the rows in catalogue order and lose the
    # ranking, so the matches are re-sorted by their rank among the predictions.
    rank_by_movie_id = {pred.iid: pos for pos, pred in enumerate(top_predictions)}
    recommended_movies = movie_df[movie_df['movieId'].isin(list(rank_by_movie_id))]
    order = recommended_movies['movieId'].map(rank_by_movie_id).to_numpy().argsort(kind='stable')

    return recommended_movies.iloc[order]


# Demo: top-10 predicted movies for user 1 from the fitted Surprise model.
user_id = 1
recommended_movies = recommend_movies(
    algo=algo,
    user_id=user_id,
    movie_df=movies,
    ratings_df=ratings,
    n=10,
)
recommended_movies  # last expression of the cell -> rendered as its output

RMSE: 0.8748
MAE:  0.6736


Unnamed: 0,movieId,title,genres
277,318,"Shawshank Redemption, The (1994)",Crime|Drama
602,750,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War
686,904,Rear Window (1954),Mystery|Thriller
694,912,Casablanca (1942),Drama|Romance
841,1104,"Streetcar Named Desire, A (1951)",Drama
906,1204,Lawrence of Arabia (1962),Adventure|Drama|War
949,1250,"Bridge on the River Kwai, The (1957)",Adventure|Drama|War
2462,3275,"Boondock Saints, The (2000)",Action|Crime|Drama|Thriller
4909,7361,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi
6648,56782,There Will Be Blood (2007),Drama|Western


In [None]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

# Read the movie catalogue and the ratings from their CSV files.
movies, ratings = (
    pd.read_csv(csv_name) for csv_name in ("movies_small.csv", "ratings_small.csv")
)

# Build the user x movie matrix (rows = userId, columns = movieId); movies a
# user has not rated are stored as 0.
ratings_matrix = pd.pivot(
    ratings, index='userId', columns='movieId', values='rating'
).fillna(0)

# Convert the dense matrix to a sparse (CSR) matrix.
ratings_matrix_sparse = csr_matrix(ratings_matrix.to_numpy())

# Function that trains the KNN model
def train_knn_model(ratings_matrix_sparse):
    """Fit a brute-force, cosine-distance nearest-neighbour model.

    Args:
        ratings_matrix_sparse: matrix handed to ``NearestNeighbors.fit``; every
            row is treated as one sample.

    Returns:
        The fitted ``NearestNeighbors`` instance, configured with 20 neighbours
        and ``n_jobs=-1``.
    """
    knn_settings = dict(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
    # fit() returns the estimator itself, so construction and fitting can be chained.
    return NearestNeighbors(**knn_settings).fit(ratings_matrix_sparse)

# Train the model on the sparse user x movie matrix. Its rows are users, so the
# neighbours the fitted model returns are positions of users, not of movies.
knn_model = train_knn_model(ratings_matrix_sparse)

# Movie recommendation function (user-based KNN)
def recommend_movies(user_id, ratings_matrix, knn_model, movies, ratings, n_recommendations=10):
    """Recommend unseen movies to ``user_id`` from the ratings of similar users.

    ``knn_model`` is fitted on the rows of the user x movie matrix, so the
    indices returned by ``kneighbors`` are row positions of *users* and must not
    be used as column (movie) positions. Each candidate movie is scored by the
    similarity-weighted sum of the neighbours' ratings, where similarity is
    ``1 - distance``; this favours movies that several close neighbours rated
    highly.

    Args:
        user_id: label of the target user in ``ratings_matrix.index``.
        ratings_matrix: user x movie DataFrame (index = userId, columns =
            movieId) with 0 for unrated cells. It must be the matrix whose
            values ``knn_model`` was fitted on.
        knn_model: fitted nearest-neighbour model exposing ``n_neighbors`` and
            ``kneighbors(X, n_neighbors=...)``. The scoring assumes a cosine
            distance, which is what ``train_knn_model`` configures.
        movies: DataFrame with at least a ``movieId`` column.
        ratings: long-format ratings DataFrame with ``userId`` and ``movieId``
            columns; movies the user rated here are never recommended.
        n_recommendations: maximum number of movies to return.

    Returns:
        The rows of ``movies`` for the recommended movies, best score first.
        Fewer than ``n_recommendations`` rows are returned when the neighbours
        have not rated enough movies the user has not seen.

    Raises:
        KeyError: if ``user_id`` is not in ``ratings_matrix.index``.
    """
    # Rating vector of the target user, shaped as a single query row.
    user_vector = ratings_matrix.loc[user_id].values.reshape(1, -1)
    user_pos = ratings_matrix.index.get_loc(user_id)

    # Request one extra neighbour because the query user is part of the fitted
    # data, and never more neighbours than there are fitted users (kneighbors
    # raises when asked for more).
    n_neighbors = min(knn_model.n_neighbors + 1, ratings_matrix.shape[0])
    distances, indices = knn_model.kneighbors(user_vector, n_neighbors=n_neighbors)

    # Drop the user themself by position, then keep the configured neighbour count.
    neighbor_pos = indices.flatten()
    is_other_user = neighbor_pos != user_pos
    neighbor_sims = (1.0 - distances.flatten()[is_other_user])[:knn_model.n_neighbors]
    neighbor_pos = neighbor_pos[is_other_user][:knn_model.n_neighbors]

    # Similarity-weighted sum of neighbour ratings per movie (unrated = 0).
    neighbor_ratings = ratings_matrix.iloc[neighbor_pos].to_numpy()
    scores = pd.Series(neighbor_sims @ neighbor_ratings, index=ratings_matrix.columns)

    # Remove movies the user has already watched, and movies with no support
    # from any neighbour (score 0).
    watched_movie_ids = ratings.loc[ratings['userId'] == user_id, 'movieId']
    scores = scores[~scores.index.isin(watched_movie_ids)]
    top_scores = (
        scores[scores > 0]
        .sort_values(ascending=False, kind='mergesort')
        .head(n_recommendations)
    )

    # Fetch the movie details and keep them in ranking order; isin() alone
    # would return them in catalogue order.
    rank_by_movie_id = {movie_id: pos for pos, movie_id in enumerate(top_scores.index)}
    recommended_movies = movies[movies['movieId'].isin(list(rank_by_movie_id))]
    order = recommended_movies['movieId'].map(rank_by_movie_id).to_numpy().argsort(kind='stable')
    return recommended_movies.iloc[order]

# Usage example
user_id = 1  # Replace with the user_id you want recommendations for
recommended_movies = recommend_movies(
    user_id=user_id,
    ratings_matrix=ratings_matrix,
    knn_model=knn_model,
    movies=movies,
    ratings=ratings,
)
recommended_movies  # last expression of the cell -> rendered as its output


Unnamed: 0,movieId,title,genres
18,19,Ace Ventura: When Nature Calls (1995),Comedy
38,42,Dead Presidents (1995),Action|Crime|Drama
44,48,Pocahontas (1995),Animation|Children|Drama|Musical|Romance
56,63,Don't Be a Menace to South Central While Drink...,Comedy|Crime
90,102,Mr. Wrong (1996),Comedy
265,305,Ready to Wear (Pret-A-Porter) (1994),Comedy
287,329,Star Trek: Generations (1994),Adventure|Drama|Sci-Fi
312,354,Cobb (1994),Drama
451,516,Renaissance Man (1994),Comedy|Drama
468,535,Short Cuts (1993),Drama


In [None]:
# Reload the movie metadata used to label each prediction.
movies = pd.read_csv('movies_small.csv')

# One row per test-set prediction (user, movie, true rating, estimate), joined
# with the movie's title and genres through movieId.
pred_df = pd.DataFrame(
    {
        'userId': [p.uid for p in predictions],
        'movieId': [p.iid for p in predictions],
        'actual_rating': [p.r_ui for p in predictions],
        'predicted_rating': [p.est for p in predictions],
    }
).merge(movies, on='movieId')

# Show the first ten predicted-vs-actual rows.
pred_df.head(10)


Unnamed: 0,userId,movieId,actual_rating,predicted_rating,title,genres
0,474,1513,3.0,2.799018,Romy and Michele's High School Reunion (1997),Comedy
1,313,1513,4.0,2.705302,Romy and Michele's High School Reunion (1997),Comedy
2,95,1513,3.0,3.415920,Romy and Michele's High School Reunion (1997),Comedy
3,596,1513,3.5,2.873666,Romy and Michele's High School Reunion (1997),Comedy
4,409,1513,4.0,3.140233,Romy and Michele's High School Reunion (1997),Comedy
...,...,...,...,...,...,...
20163,414,5880,1.5,3.338862,Extreme Ops (2002),Action|Adventure|Crime|Thriller
20164,318,90630,3.5,3.600124,Miss Representation (2011),Documentary
20165,380,55232,3.0,3.364311,Resident Evil: Extinction (2007),Action|Horror|Sci-Fi|Thriller
20166,67,27193,5.0,3.579361,Taxi 2 (2000),Action|Comedy


In [None]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Load data into Surprise's format. The rating scale is taken from the data
# instead of being hard-coded to (1, 5): Surprise clips its estimates to this
# range, so the scale has to cover every rating that actually occurs.
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split the data into training and testing (seeded so the split is repeatable)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Use SVD with regularization (seeded so the factor initialisation is repeatable)
algo = SVD(n_factors=100, reg_all=0.1, random_state=42)
algo.fit(trainset)

# Make predictions
predictions = algo.test(testset)

# Calculate RMSE and MAE (both calls print the metric and return it)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

# Extract predictions
pred_df = pd.DataFrame([(pred.uid, pred.iid, pred.r_ui, pred.est) for pred in predictions],
                       columns=['userId', 'movieId', 'actual_rating', 'predicted_rating'])

# Keep only the predictions for one movie, then attach its title and genres
movies = pd.read_csv('movies_small.csv')
movie_id_to_filter = 1196
filtered_predictions = pred_df[pred_df['movieId'] == movie_id_to_filter]
# Currently unused: pred_df has no 'title' column before the merge below, so a
# filter by title would have to be applied to the merged frame instead.
movie_title_to_filter = "Toy Story"
pred_df = pd.merge(filtered_predictions, movies, on='movieId')

# Display the predictions vs actual values
pred_df


RMSE: 0.8857
MAE:  0.6825


Unnamed: 0,userId,movieId,actual_rating,predicted_rating,title,genres
0,39,1196,5.0,4.305388,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
1,215,1196,4.5,4.18801,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
2,186,1196,5.0,4.589631,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
3,28,1196,4.0,3.60927,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
4,82,1196,4.0,3.986403,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
5,256,1196,4.0,4.44717,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
6,32,1196,4.0,4.21346,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
7,112,1196,5.0,3.979018,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
8,64,1196,3.5,4.228626,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
9,166,1196,4.5,4.397221,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi


In [None]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy
import pandas as pd

# Load data into Surprise's format. The ratings are read first so the rating
# scale can be taken from the data instead of being hard-coded to (1, 5):
# Surprise clips its estimates to this range, so the scale has to cover every
# rating that actually occurs.
ratings = pd.read_csv('ratings_small.csv')
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split the data into training and testing (seeded so the split, and therefore
# the reported metrics, are repeatable)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Use KNNBasic
algo = KNNBasic()
algo.fit(trainset)

# Make predictions
predictions = algo.test(testset)

# Calculate RMSE and MAE (both calls print the metric and return it)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

# Extract predictions
pred_df = pd.DataFrame([(pred.uid, pred.iid, pred.r_ui, pred.est) for pred in predictions],
                       columns=['userId', 'movieId', 'actual_rating', 'predicted_rating'])

# Keep only the predictions for one movie, then attach its title and genres.
# To filter by title instead, apply the filter to the merged frame: pred_df has
# no 'title' column before the merge.
movies = pd.read_csv('movies_small.csv')
movie_id_to_filter = 1196
filtered_predictions = pred_df[pred_df['movieId'] == movie_id_to_filter]
pred_df = pd.merge(filtered_predictions, movies, on='movieId')

# Display the predictions vs actual values
pred_df


Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9337
MAE:  0.7130


Unnamed: 0,userId,movieId,actual_rating,predicted_rating,title,genres
0,368,1196,3.0,4.279831,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
1,593,1196,5.0,4.034725,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
2,312,1196,5.0,4.316298,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
3,122,1196,5.0,4.461797,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
4,334,1196,4.0,4.07453,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
5,305,1196,5.0,4.428147,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
6,290,1196,5.0,4.378999,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
7,370,1196,2.5,3.718273,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
8,140,1196,3.0,4.1376,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
9,580,1196,4.0,4.249955,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
