In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise
from scipy.sparse import csr_matrix
import sklearn
import pickle

In [2]:
# DRY - Don't repeat yourself!
# place a utils.py in the same folder as the notebook
# from utils import example_query, create_user_vector, create_rating_matrix

# for calculating recommendations
example_query = {
    # movieId, rating
    4470:5, 
    48:5,
    594:5,
    27619:5,
    152081:5,
    595:5,
    616:5,
    1029:5
}

# Neighborhood Based Filtering for Recommender Systems
---

> The key idea is that the rating of u for a new item i is likely to be similar to that of another user v,if u and v have rated other items in a similar way. Likewise,u is likely to rate two items i and j in a similar fashion, if other users have given similar ratings to these two items.

##### Use ratings of similar users (or items) to predict what you like! But: How can we measure similarity/distance? 

- Cosine Similariy/Distance (works good for sparse high dimensional data)
- Jaccard Similarity/Distance (only works on binarized vectors)
- Pearson Correlation/Distance (cosine similarity on centered vectors)
- Euclidian Distance/Similarity (not good for sparse high dimensional data)

You find many more metrics here:https://docs.scipy.org/doc/scipy/reference/spatial.distance.html

In [3]:
cd ml-latest-small/

[Errno 2] No such file or directory: 'ml-latest-small/'
/Users/stefan/Documents/euclidean-eukalyptus/euclidean-eukalyptus-student-code/week_10/notebooks


In [4]:
ls

exploratory_analysis_worksheet.ipynb    nmf_recommender.pkl
matrix_factorization_filled.ipynb       recommender_systems_intro_filled.ipynb
neighborhood_based_filtering.ipynb


In [6]:
ratings = pd.read_csv('../data/ml-latest-small/ratings.csv')
movies = pd.read_csv('../data/ml-latest-small/movies.csv')
ratings.head()


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
# movies liked by our test user
movies.set_index('movieId').loc[example_query.keys()]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
4470,Ariel (1988),Drama
48,Pocahontas (1995),Animation|Children|Drama|Musical|Romance
594,Snow White and the Seven Dwarfs (1937),Animation|Children|Drama|Fantasy|Musical
27619,"Lion King 1½, The (2004)",Adventure|Animation|Children|Comedy
152081,Zootopia (2016),Action|Adventure|Animation|Children|Comedy
595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
616,"Aristocats, The (1970)",Animation|Children
1029,Dumbo (1941),Animation|Children|Drama|Musical


---
## 1. Model Development

### Preprocessing (same as for the NMF model!)

- filter out movies rated by less than 20/ 50 / 100 ... users
- filter out movies with an average rating lower than 2
- create a sparse user item matrix

In [8]:
# place a utils.py in the same folder as the notebook
# from utils import ratings, get_ratings_matrix

# R = get_ratings_matrix(ratings)

In [9]:
# calculate the number of ratings per movie
ratings_per_movie = ratings.groupby('movieId')['rating'].count()
# filter for movies with more than 20 ratings and extract the index
popular_movies = ratings_per_movie.loc[ratings_per_movie>30]
# filter the ratings matrix and only keep the popular movies
ratings = ratings.loc[ratings['movieId'].isin(popular_movies.index)]

# Initialize a sparse user-item rating matrix
# (data, (row_ind, col_ind)
R = csr_matrix((ratings['rating'],(ratings['userId'],ratings['movieId'])))


In [10]:
R

<611x152082 sparse matrix of type '<class 'numpy.float64'>'
	with 57358 stored elements in Compressed Sparse Row format>

### Training (new!)

- initialize the model: pick a distance metric
- fit it to the user item matrix: only stores the data and doesn't do further. all the calculations take place later!

In [11]:
# which metrics can we use for sparse matrics?
sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute'])

['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'precomputed']

In [12]:
# initialize the unsupervised model
model = NearestNeighbors(metric='cosine')

# fit it to the user-item rating matrix
model.fit(R)

NearestNeighbors(metric='cosine')

### Save the trained model on your hard drive

In [13]:
with open('./distance_recommender.pkl', 'wb') as file:
    pickle.dump(model, file)

In [None]:
ls

---
## 2. Model deployment: Make recommendations for a new user

### Read the model from hard drive

In [None]:
with open('./distance_recommender.pkl', 'rb') as file:
    model = pickle.load(file)

In [None]:
ls

### Receive a user query

In [14]:
example_query

{4470: 5, 48: 5, 594: 5, 27619: 5, 152081: 5, 595: 5, 616: 5, 1029: 5}

### Construct a user vector (same as before!)

we need the same input as was used during training!

In [16]:
def create_user_vec(query, R):
    data = list(query.values())             # the ratings of the new user
    # we use just a single row 0 for this user
    row_ind = [0]*len(data)
    # the columns (=movieId) of the ratings
    col_ind = list(query.keys())
    data, row_ind, col_ind

    # new user vector: needs to have the same format as the training data
    user_vec = csr_matrix((data, (row_ind, col_ind)), shape=(1, R.shape[1]))
    return user_vec


In [17]:
R.shape

(611, 152082)

In [19]:
user_vec = create_user_vec(example_query,R)

### Calculate the score (new!)

1. find the neighborhood of $n$ similar users
2. use their ratings to calculate a score

In [20]:
# calculates the distances to all other users in the data!
distances, userIds = model.kneighbors(user_vec, n_neighbors=10, return_distance=True)

# sklearn returns a list of predictions - extract the first and only value of the list
distances = distances[0]
userIds = userIds[0]

In [21]:
distances, userIds

(array([0.8011156 , 0.80974514, 0.82597507, 0.8367268 , 0.84172777,
        0.84708378, 0.84988048, 0.85005996, 0.85067004, 0.85089198]),
 array([476, 563,  43, 484,   5, 138,  58, 170,  20, 216]))

In [22]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [23]:
# only look at ratings for users that are similar!
neighborhood = ratings.set_index('userId').loc[userIds]
neighborhood

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
476,1,4.0,835021447
476,2,4.0,835021693
476,10,3.0,835021420
476,11,3.0,835021635
476,32,4.0,835021513
...,...,...,...
216,3977,4.0,974733513
216,3994,5.0,975598752
216,3996,4.0,982169907
216,4025,2.0,982169965


In [24]:
scores = neighborhood.groupby('movieId')['rating'].mean()
scores

movieId
1         4.1
2         3.0
3         4.0
5         4.0
7         5.0
         ... 
91500     3.5
92259     3.5
112552    4.5
119145    4.0
134853    4.0
Name: rating, Length: 430, dtype: float64

In [None]:
# calculate the summed up rating for each movie
# summing up introduces a bias for popular movies
# averaging introduces bias for movies only seen by few users in the neighboorhood

### Give recommendations (same as before!)

In [None]:
example_query.keys()

In [None]:
# give a zero score to movies the user has allready seen
scores.loc[scores.index.isin(example_query.keys())] = 0 


In [None]:
# sort the scores from high to low 
scores.sort_values(ascending=False,inplace=True)

In [None]:
# get the movieIds of the top 10 entries
scores_10 = scores.head(10)

In [None]:
# let's see the recommendations!

recommendations = movies.set_index('movieId').loc[scores_10.index]
recommendations


---
## 3. Project Task: neighborhood based recommender function

- Collect different example queries for "typical" users (e.g. a horror movie buff) and try out the algorithm
- Set the number of neighbors to a very high or low number. What happens to the recommendations?
- Implement a recommender function that recommends movies to a new user based on the NearestNeighbor model!


- ⭐ **Bonus**: Calculate the score using a weighted sum or average. Use the distances to the other users as weights
- ⭐ **Bonus**: Use the method to find and recommend similar movies! Hint: Run the model on the transposed user item rating matrix.
- ⭐ **Bonus**: First use NMF to reduce the dimensionality of the sparse user item matrix. Then run neighborhood based recommendation on the dense matrix.

In [None]:
# collaborative filtering = look at ratings only!
def recommend_neighborhood(query, model, ratings, k=10):
    """
    Filters and recommends the top k movies for any given input query based on a trained nearest neighbors model. 
    Returns a list of k movie ids.
    """
    # 1. candiate generation
    
    # construct a user vector
    
   
    # 2. scoring
    
    # find n neighbors
    
    # calculate their average rating
    
    
    # 3. ranking
    
    # filter out movies allready seen by the user
    
    # return the top-k highst rated movie ids or titles
    
    return [364, 372, 43, 34, 243]

In [None]:
# recommender.py
# from recommender import recommend_neighborhood