In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy


In [3]:
movies = pd.read_csv("ml-100k/u.item", sep='|', encoding='latin-1', header=None)
movies.columns = ['movie_id', 'movie_title', 'release_date', 'video_release', 'IMDb_URL'] + [f'genre_{i}' for i in range(19)]
movies = movies[['movie_id', 'movie_title']]

ratings = pd.read_csv("ml-100k/u.data", sep='\t', header=None)
ratings.columns = ['user_id', 'movie_id', 'rating', 'timestamp']

movie_ratings = pd.merge(ratings, movies, on='movie_id')
movie_ratings.head()


Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)


In [5]:
user_movie_matrix = movie_ratings.pivot_table(index='user_id', columns='movie_title', values='rating')

target_movie = 'Star Wars (1977)'
if target_movie in user_movie_matrix.columns:
    movie_ratings_series = user_movie_matrix[target_movie]
    ...
else:
    print(f"{target_movie} not found in data.")

movie_counts = user_movie_matrix.count()
user_movie_matrix = user_movie_matrix.loc[:, movie_counts[movie_counts >= 2].index]

movie_ratings_series = user_movie_matrix['Star Wars (1977)']
similar_movies = user_movie_matrix.corrwith(movie_ratings_series)

corr_df = pd.DataFrame(similar_movies, columns=['correlation'])

corr_df.dropna(inplace=True)

ratings_summary = movie_ratings.groupby('movie_title')['rating'].agg(['mean', 'count'])
corr_df = corr_df.join(ratings_summary)

recommended_movies = corr_df[corr_df['count'] > 100].sort_values('correlation', ascending=False)

recommended_movies.head(10)


  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


Unnamed: 0_level_0,correlation,mean,count
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Star Wars (1977),1.0,4.358491,583
"Empire Strikes Back, The (1980)",0.747981,4.20436,367
Return of the Jedi (1983),0.672556,4.00789,507
Raiders of the Lost Ark (1981),0.536117,4.252381,420
Austin Powers: International Man of Mystery (1997),0.377433,3.246154,130
"Sting, The (1973)",0.367538,4.058091,241
Indiana Jones and the Last Crusade (1989),0.350107,3.930514,331
Pinocchio (1940),0.347868,3.673267,101
"Frighteners, The (1996)",0.332729,3.234783,115
L.A. Confidential (1997),0.319065,4.161616,297


In [7]:
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file('ml-100k/u.data', reader=reader)

trainset, testset = train_test_split(data, test_size=0.2)

model = SVD()
model.fit(trainset)

predictions = model.test(testset)
print("RMSE:", accuracy.rmse(predictions))


RMSE: 0.9375
RMSE: 0.9374527503016737


In [9]:
uid = str(196)
iid = str(302)
pred = model.predict(uid, iid)
print(f"Predicted rating of user {uid} for movie {iid}: {pred.est}")


Predicted rating of user 196 for movie 302: 4.257870102076963
