In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the dataset
rating = pd.read_csv("u.data", sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'])
rating.head()


Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
#loading movies data
movie= pd.read_csv("u.item", sep="|", encoding="latin-1",
                     names=["item_id", "title", "release_date", "video_release_date", "IMDb_URL",
                            "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
                            "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
                            "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"])
movie.head()

Unnamed: 0,item_id,title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
#merging columns with specifying which id belong to which table to avoid conflicts
df = rating.merge(movie, left_on="movie_id", right_on="item_id")
user_matrix = df.pivot_table(index="user_id", columns="title", values="rating")

In [5]:
#computing similarty using pearson instead of cosine as cosine will fill the null reviews with zero which doesnt make sense in this case
user_similarity = user_matrix.T.corr()

In [6]:
# -----------------------------
def recommend_movies(user_id, k):

    # sorting similar users asc.
    similar_users = user_similarity[user_id].drop(user_id).dropna().sort_values(ascending=False)

    # ratings
    weighted_ratings = pd.Series(dtype=float)
    for sim_user, sim_score in similar_users.items():
        sim_user_ratings = user_matrix.loc[sim_user]
        weighted_ratings = weighted_ratings.add(sim_user_ratings * sim_score, fill_value=0)

    # removing already watched movies
    watched_movies = user_matrix.loc[user_id].dropna().index
    weighted_ratings = weighted_ratings.drop(index=watched_movies, errors="ignore")

    # Top n recommendations depending on the k gotten from the user
    recommendations = weighted_ratings.sort_values(ascending=False).head(k)
    return recommendations


In [7]:
#testing for a specific user
user_id = 6
k=5
recommendations = recommend_movies(user_id, k)

print(f"Top recommendations for User {user_id}:")
print(recommendations)

Top recommendations for User 6:
title
Return of the Jedi (1983)          401.581834
Scream (1996)                      326.979545
Titanic (1997)                     325.914674
Empire Strikes Back, The (1980)    284.463307
Air Force One (1997)               281.703917
dtype: float64
