In [1]:
import pandas as pd
import numpy as np


In [2]:
url = "https://files.grouplens.org/datasets/movielens/ml-100k/u.data"

ratings = pd.read_csv(
    url,
    sep='\t',
    names=['user_id', 'movie_id', 'rating', 'timestamp']
)

ratings.head()


Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
user_item_matrix = ratings.pivot_table(
    index='user_id',
    columns='movie_id',
    values='rating'
)

user_item_matrix.head()


movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [4]:
user_item_matrix.fillna(0, inplace=True)


In [5]:
from sklearn.metrics.pairwise import cosine_similarity

user_similarity = cosine_similarity(user_item_matrix)


In [6]:
def recommend_movies(user_id, n=5):
    user_index = user_id - 1
    similarity_scores = user_similarity[user_index]

    similar_users = np.argsort(similarity_scores)[::-1][1:]

    recommendations = user_item_matrix.iloc[similar_users].mean()
    recommendations = recommendations.sort_values(ascending=False)

    return recommendations.head(n)


In [7]:
recommend_movies(user_id=5)


Unnamed: 0_level_0,0
movie_id,Unnamed: 1_level_1
50,2.693206
100,2.235669
181,2.151805
258,2.055202
174,1.890658


In [8]:
from sklearn.metrics import mean_squared_error

actual = ratings['rating']
predicted = np.full_like(actual, actual.mean())

rmse = np.sqrt(mean_squared_error(actual, predicted))
print("RMSE:", rmse)


RMSE: 1.244138255982831
