In [81]:
# We will be using a MovieLens dataset. This dataset contains 100004 ratings across 1682 movies for 943 users. All selected users had at least rated 20 movies. We are going to build a recommendation engine which will suggest movies for a user which he hasn't watched yet based on the movies which he has already rated. We will be using k-nearest neighbour algorithm which we will implement from scratch

In [82]:
import pandas as pd

In [83]:
movie_file = 'Data/movies.csv'
movie_data = pd.read_csv(movie_file, usecols = [0, 1], encoding = 'utf-8')

In [84]:
rating_file = 'Data/rating.csv'
rating_info = pd.read_csv(rating_file, usecols = [0,1,2] )

In [85]:
movie_info = pd.merge(movie_data, rating_info, left_on = 'MovieID', right_on = 'MovieID')
movie_info.head()


Unnamed: 0,MovieID,MovieName,UserID,Rating
0,1,Toy Story (1995),308,4
1,1,Toy Story (1995),287,5
2,1,Toy Story (1995),148,4
3,1,Toy Story (1995),280,4
4,1,Toy Story (1995),66,3


In [86]:
num_user = max(movie_info.UserID)
print(num_user)
num_movies = max(movie_info.MovieID)
print(num_movies)


943
1682


In [90]:
# how many movies were rated by each user 
# value_counts : If True then the object returned will contain the relative frequencies of the unique values.
movies_per_user = movie_info.UserID.value_counts()
movies_per_user.head()


405    737
655    685
13     636
450    540
276    518
Name: UserID, dtype: int64

In [None]:
# how many number of users rated each movie
users_per_movie = movie_info.MovieName.value_counts()
users_per_movie.head()

In [None]:
# Function to find top N favourite movies of a user
def fav_movies(current_user, N):
    fav_movies = pd.DataFrame.sort_values(movie_info[movie_info.UserID == current_user], ['Rating'], ascending = [0]) [:N]
    return list(fav_movies.MovieName)

print(fav_movies(176, 5))


In [None]:
# Let's create a matrix that has the user ids on one axis and the movie title on another axis. 
# Each cell will then consist of the rating the user gave to that movie. 

                                # Lets build recommendation engine now

# We will use a neighbour based collaborative filtering model.
# The idea is to use k-nearest neighbour algorithm to find neighbours of a user
# We will use their ratings to predict ratings of a movie not already rated by a current user.
# We will represent movies watched by a user in a vector - the vector will have values for all the movies in our dataset. If a user hasn't rated a movie, it would be represented as NaN.



user_movie_rating_matrix  = pd.pivot_table(movie_info, values = 'Rating', index=['UserID'], columns=['MovieID'])
user_movie_rating_matrix.head()

In [None]:
# Now, we will find the similarity between 2 users by using correlation
from scipy.spatial.distance import correlation
import numpy as np
def similarity(user1, user2):
    # normalizing user1 rating i.e mean rating of user1 for any movie
    # nanmean will return mean of an array after ignore NaN values 
    user1 = np.array(user1) - np.nanmean(user1) 
    user2 = np.array(user2) - np.nanmean(user2)
    
    # finding the similarity between 2 users
    # finding subset of movies rated by both the users
    common_movie_ids = [i for i in range(len(user1)) if user1[i] > 0 and user2[i] > 0]
    if(len(common_movie_ids) == 0):
        return 0
    else:
        user1 = np.array([user1[i] for i in common_movie_ids])
        user2 = np.array([user2[i] for i in common_movie_ids])
        return correlation(user1, user2)


In [91]:
# We will now use the similarity function to find the nearest neighbour of a current user
# nearest_neighbour_ratings function will find the k nearest neighbours of the current user and
# then use their ratings to predict the current users ratings for other unrated movies 
def nearest_neighbour_ratings(current_user, K):
     # Creating an empty matrix whose row index is userId and the value
    # will be the similarity of that user to the current user
    similarity_matrix = pd.DataFrame(index = user_movie_rating_matrix.index, 
                                    columns = ['similarity'])
    for i in user_movie_rating_matrix.index:
        # finding the similarity between user i and the current user and add it to the similarity matrix
        similarity_matrix.loc[i] = similarity(user_movie_rating_matrix.loc[current_user],
                                             user_movie_rating_matrix.loc[i])
        # Sorting the similarity matrix in descending order
    similarity_matrix = pd.DataFrame.sort_values(similarity_matrix,
                                                ['similarity'], ascending= [0])
    # now we will pick the top k nearest neighbour
    # neighbour_movie_ratings : ratings of movies of neighbors
    # user_movie_rating_matrix : ratings of each user for every movie
    # predicted_rating : Averge where rating is NaN
    nearest_neighbours = similarity_matrix[:K]
    neighbour_movie_ratings = user_movie_rating_matrix.loc[nearest_neighbours.index]
     # This is empty dataframe placeholder for predicting the rating of current user using neighbour movie ratings
    predicted_movie_rating = pd.DataFrame(index = user_movie_rating_matrix.columns, columns = ['rating'])
 # Iterating all movies for a current user
    for i in user_movie_rating_matrix.columns:
        # by default, make predicted rating as the average rating of the current user
        predicted_rating = np.nanmean(user_movie_rating_matrix.loc[current_user])
          # j is user , i is movie
        for j in neighbour_movie_ratings.index:
            # if user j has rated the ith movie
            if(user_movie_rating_matrix.loc[j,i] > 0):# If there is some rating  # nearest_neighbours.loc[j, 'similarity']) / nearest_neighbours['similarity'].sum(): Finding Similarity score
                predicted_rating += ((user_movie_rating_matrix.loc[j,i] - np.nanmean(user_movie_rating_matrix.loc[j])) *
                                                    nearest_neighbours.loc[j, 'similarity']) / nearest_neighbours['similarity'].sum()

        predicted_movie_rating.loc[i, 'rating'] = predicted_rating

    return predicted_movie_rating
    

In [94]:
# Predicting top N recommendations for a current user
def top_n_recommendations(current_user, N):
    predicted_movie_rating = nearest_neighbour_ratings(current_user, 10)
    movies_already_watched = list(user_movie_rating_matrix.loc[current_user]
                                  .loc[user_movie_rating_matrix.loc[current_user] > 0].index)
    
    predicted_movie_rating = predicted_movie_rating.drop(movies_already_watched)
    
    top_n_recommendations = pd.DataFrame.sort_values(predicted_movie_rating, ['rating'], ascending=[0])[:N]
    
    top_n_recommendation_titles = movie_data.loc[movie_data.MovieID.isin(top_n_recommendations.index)]

    return list(top_n_recommendation_titles.MovieName)

In [95]:
# finding out the recommendations for a user
current_user = 140
print("User's favorite movies are : ", fav_movies(current_user, 5),
      "\nUser's top recommendations are: ", top_n_recommendations(current_user, 3))

  dist = 1.0 - uv / np.sqrt(uu * vv)


User's favorite movies are :  ['English Patient, The (1996)', "Ulee's Gold (1997)", 'Fly Away Home (1996)', 'Chasing Amy (1997)', 'Soul Food (1997)'] 
User's top recommendations are:  ['Star Wars (1977)', 'Shawshank Redemption, The (1994)', 'Godfather, The (1972)']
