## Problem Statement
***
We have a movie lens database and our objective is to apply various kinds of recommendation techniques from scratch and find out similarities between the users, most popular movies, and personalized recommendations for the targeted user based on user based collaborative filtering.

In [20]:
# Importing the required libraries.
import pandas as pd
import numpy as np
from math import pow, sqrt

## Loading users, movie and ratings

In [21]:
# Reading users dataset into a pandas dataframe object.
users = pd.read_csv('data/users.dat', sep='::', names=['user_id', 'age', 'sex', 'occupation', 'zip_code'], 
                    encoding='latin-1', engine='python')
users

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [3]:
# Reading ratings dataset into a pandas dataframe object.
ratings = pd.read_csv('data/ratings.dat', sep='::', names=['user_id', 'movie_id', 'rating', 'unix_timestamp'],
                      encoding='latin-1', engine='python')
ratings

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [22]:
# Reading movies dataset into a pandas dataframe object.
movies = pd.read_csv('data/movies.dat', sep='::', names=['movie_id', 'movie_title', 'genre'], 
                     encoding='latin-1', engine='python')
movies

Unnamed: 0,movie_id,movie_title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


### Writing generally used getter functions in the implementation
Here, we have written down a few getters so that we do not need to write down them again adn again and it also increases readability and reusability of the code.

In [24]:
#Function to get the rating given by a user to a movie.
def get_rating_(userid,movieid):
    return (ratings.loc[(ratings.user_id==userid) & (ratings.movie_id == movieid),'rating'].iloc[0])

#Function to get the rating given by a user to a movie.
def get_rating_timestamp_(userid,movieid):
    return (ratings.loc[(ratings.user_id==userid) & (ratings.movie_id == movieid),'unix_timestamp'].iloc[0])

# Function to get the list of all movie ids the specified user has rated.
def get_movieids_(userid):
    return (ratings.loc[(ratings.user_id==userid),'movie_id'].tolist())

# Function to get the movie titles against the movie id.
def get_movie_title_(movieid):
    return (movies.loc[(movies.movie_id == movieid),'movie_title'].iloc[0])

# Function to get the movie genre against the movie id.
def get_movie_genre_(movieid):
    return (movies.loc[(movies.movie_id == movieid),'genre'].iloc[0])

## Create user-item rating matrix
Given $m$ movies, and $n$ users...

Construct a rating matrix $R$ of size $m\times n$

$R[i,j]$ is rating given by $j^{th}$ user to $i^{th}$ movie

Normalize the rating matrix to zero mean

In [26]:
max_user_id = ratings.user_id.max()
max_movie_id = ratings.movie_id.max()

rating_matrix = np.ndarray(shape=(max_movie_id, max_user_id),dtype=np.uint8)
rating_matrix[ratings.movie_id.values-1, ratings.user_id.values-1] = ratings.rating.values
print(rating_matrix)
normalised_matrix = rating_matrix - np.asarray([(np.mean(rating_matrix, 1))]).T
print(normalised_matrix)

[[5 0 0 ... 0 0 3]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[ 3.57400662 -1.42599338 -1.42599338 ... -1.42599338 -1.42599338
   1.57400662]
 [-0.37152318 -0.37152318 -0.37152318 ... -0.37152318 -0.37152318
  -0.37152318]
 [-0.23874172 -0.23874172 -0.23874172 ... -0.23874172 -0.23874172
  -0.23874172]
 ...
 [-0.03278146 -0.03278146 -0.03278146 ... -0.03278146 -0.03278146
  -0.03278146]
 [-0.02582781 -0.02582781 -0.02582781 ... -0.02582781 -0.02582781
  -0.02582781]
 [-0.24288079 -0.24288079 -0.24288079 ... -0.24288079 -0.24288079
  -0.24288079]]


## Perform SVD of user-item rating matrix

Tranpose rating matrix $A = R^T$

Perform SVD on rating matrix: $A  = U \Sigma V^T$

$U$ is user matrix of size $n \times n$

$V$ is movie item matrix of size $m \times m$

$\Sigma$ consists of $min(m,n)$ singular values

<img src="https://dorukkilitcioglu.com/assets/images/posts/representation-learning-2/svd.png" width="250" height="170">

In [27]:
A = normalised_matrix.T / np.sqrt(rating_matrix.shape[0] - 1)
user_matrix, singular_values, movie_matrix = np.linalg.svd(A)
print(movie_matrix.shape)
print(singular_values.shape)
print(user_matrix.shape)

(3952, 3952)
(3952,)
(6040, 6040)


In [28]:
num_features = 50  # take only 50 features 
movie_features = movie_matrix.T[:, :num_features]
user_features = user_matrix[:, :num_features]
print(movie_features.shape)
print(user_features.shape)

(3952, 50)
(6040, 50)


In [29]:
# identify top matches based on cosine similarity
def top_cosine_similarity(data, query_vector, top_n=10):
    dot_product = np.dot(query_vector, data.T) # dot product
    norm_a = np.linalg.norm(query_vector)
    norm_b = np.linalg.norm(data,axis=1)
    similarity = dot_product / ((norm_a * norm_b) + 1e-6) # cosine similarity
    sort_indexes = np.argsort(-similarity) # sort indices in drecreasing order of similarity
    return sort_indexes[:top_n] +1 # return top N indices

In [30]:
movie_id = 2
query_vector = movie_features[movie_id - 1, :]
top_cosine_similarity(movie_features, query_vector ,10) # find top 10 movies similar to movie id 2

array([   2, 3489,   60, 2162,  653, 1654, 2161,  317, 1583, 1848],
      dtype=int64)

In [32]:
user_id = 10
query_vector = user_features[user_id - 1, :]
top_cosine_similarity(user_features, query_vector, 10) # find top 10 similar users for user id 1

array([  10, 1120, 3013, 2242, 3512, 4958, 3259,   44, 4048, 5447],
      dtype=int64)

## Content-based Recommendations
Given a user $u$

1. Find $N$  movies $c_1,c_2,...,c_N$ recently watched and liked by user.

2. Create a user profile as the average of feature vectors of movies: $u = \frac{c_1 + c_2 + ... + c_N}{N}$

3. Given profile vector $u$, recomend $K$ similar movie titles $r_1, r_2, ..., r_K$ 
    based on their similarity with user profile: $\frac{\langle u, r_i \rangle}{\Vert u \Vert \cdot \Vert r_i \Vert}$

4. Remove movies previously watched by user  before recommending.

<img src="https://cdn-gcp.new.marutitech.com/medium_964bd023_recommendation_engine_2_63a641ad68.png" width="250" height="170">

In [33]:
def get_content_based_recommendations(user_id, top_n_movies, num_recommendations):
    movies_watched_by_user = get_movieids_(user_id)
    movies_liked_by_user = [(movie_id, get_rating_timestamp_(user_id, movie_id)) 
                            for movie_id in movies_watched_by_user if get_rating_(user_id, movie_id) in [4,5]]

    sorted(movies_liked_by_user, key= lambda x: -x[1])
    recent_movies_liked_by_user = [movie_id for (movie_id, timestamp) in movies_liked_by_user]
    
    user_profile_vector = [movie_features[movie_id-1, :] for movie_id in recent_movies_liked_by_user]
    user_profile_vector = np.mean(user_profile_vector, axis=0)

    movie_recommendations_content = list(top_cosine_similarity(movie_features, user_profile_vector, num_recommendations))

    movie_recommendations_content = set(movie_recommendations_content) - set(movies_watched_by_user)
    
    return recent_movies_liked_by_user, movie_recommendations_content

In [34]:
user_id = 20

# content-based parameters
top_n_movies = 10
num_recommendations = 25

recent_movies_liked_by_user, movie_recommendations_content = get_content_based_recommendations(user_id, 
                                                                                               top_n_movies, 
                                                                                               num_recommendations)

print("{} Movies liked by user {} recently\n".format(top_n_movies, user_id))
for id in recent_movies_liked_by_user[:top_n_movies]:
    print(get_movie_title_(id), get_movie_genre_(id))

print("\nMovie recommendations similar to movies watched by user are:\n")
for id in list(movie_recommendations_content):
    print(get_movie_title_(id), get_movie_genre_(id))

10 Movies liked by user 20 recently

Mission: Impossible (1996) Action|Adventure|Mystery
Terminator 2: Judgment Day (1991) Action|Sci-Fi|Thriller
Superman II (1980) Action|Adventure|Sci-Fi
Out of Sight (1998) Action|Crime|Romance
Matrix, The (1999) Action|Sci-Fi|Thriller
Predator (1987) Action|Sci-Fi|Thriller
There's Something About Mary (1998) Comedy
Star Trek: The Motion Picture (1979) Action|Adventure|Sci-Fi
Fifth Element, The (1997) Action|Sci-Fi
Gladiator (2000) Action|Drama

Movie recommendations similar to movies watched by user are:

Last of the Mohicans, The (1992) Action|Romance|War
Jurassic Park (1993) Action|Adventure|Sci-Fi
Dream Man (1995) Thriller
U-571 (2000) Action|Thriller
Run Lola Run (Lola rennt) (1998) Action|Crime|Romance
Face/Off (1997) Action|Sci-Fi|Thriller
Heat (1995) Action|Crime|Thriller
Total Recall (1990) Action|Adventure|Sci-Fi|Thriller
Hunt for Red October, The (1990) Action|Thriller
Men in Black (1997) Action|Adventure|Comedy|Sci-Fi
X-Men (2000) Action|

## Collaborative Recommendations
Given a user $u$

1. Find $N$ users $u_1,u_2,...,u_N$ similar to given user based on cosine similarity.

2. Create a user profile as the average of feature vectors of similar users: $u = \frac{u_1 + u_2 + ... + u_N}{N}$

3. Given profile vector $u$, recomend $K$ similar movie titles $r_1, r_2, ..., r_K$ 
    based on their similarity with user profile: $\frac{\langle u, r_i \rangle}{\Vert u \Vert \cdot \Vert r_i \Vert}$

4. Remove movies previously watched by user  before recommending.

<img src="https://cdn-gcp.new.marutitech.com/medium_f35bf62d_recommendation_engine_1_a609c2f06e.png" width="250" height="170">

In [35]:
def get_collaorative_recommendations(user_id, top_n_users, num_recommendations):
    movies_watched_by_user = get_movieids_(user_id)
    similar_user_ids = top_cosine_similarity(user_features, user_features[user_id-1, :], top_n_users)
    
    user_profile_vector = [user_features[uid - 1, :] for uid in similar_user_ids]
    user_profile_vector = np.mean(user_profile_vector, axis=0)
        
    movie_recommendations_collaborative = list(top_cosine_similarity(movie_features, user_profile_vector, num_recommendations))

    movie_recommendations_collaborative = set(movie_recommendations_collaborative) - set(movies_watched_by_user)

    
    return movie_recommendations_collaborative

In [36]:
user_id = 20

# collborative-based parameters
top_n_users = 15
num_recommendations = 25

movie_recommendations_collaborative = get_collaorative_recommendations(user_id, top_n_users, num_recommendations)

print("\nMovies recommendations based on similar user interests are:\n")
for id in list(movie_recommendations_collaborative):
    print(get_movie_title_(id), get_movie_genre_(id))



Movies recommendations based on similar user interests are:

Last of the Mohicans, The (1992) Action|Romance|War
Hillbillys in a Haunted House (1967) Comedy
Negotiator, The (1998) Action|Thriller
Full Tilt Boogie (1997) Documentary
Price of Glory (2000) Drama
Face/Off (1997) Action|Sci-Fi|Thriller
Sunchaser, The (1996) Drama
Brighton Beach Memoirs (1986) Comedy
Diebinnen (1995) Drama
Uninvited Guest, An (2000) Drama
Date with an Angel (1987) Comedy|Fantasy
Mariachi, El (1992) Action|Thriller
Bustin' Loose (1981) Comedy
Hunt for Red October, The (1990) Action|Thriller
Thelma & Louise (1991) Action|Drama
In the Line of Fire (1993) Action|Thriller
U-571 (2000) Action|Thriller
Total Recall (1990) Action|Adventure|Sci-Fi|Thriller
Under the Rainbow (1981) Comedy


In [37]:
set(movie_recommendations_collaborative).intersection(movie_recommendations_content)

{826, 1408, 1573, 1610, 2916, 3555}