# Collaborative Filtering with KNN
Author: Shiyi Wang

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from pprint import pprint

Load dataset

In [6]:
# Load the pre-processed ratings table (columns shown in the output below:
# rating, name, user_id, recipe_id).
# NOTE: unpickling can execute arbitrary code - only load files you trust.
data = pd.read_pickle(filepath_or_buffer='../data/processed_data.pkl')
data

Unnamed: 0,rating,name,user_id,recipe_id
0,4,white bean green chile pepper soup,3787,16642
1,5,white bean green chile pepper soup,95286,16642
3,5,baked potato toppings,14502,34897
4,5,baked potato toppings,6559,34897
5,4,sugared raspberries,5690,49598
...,...,...,...,...
1132360,5,easy microwave hot fudge topping,215681,33795
1132361,0,garlic clove chicken,11621,22415
1132364,5,pot roast with port stove top,17831,32160
1132365,4,pot roast with port stove top,5947,32160


Truncate the dataset to its first 150,000 rows so that the cells below (pivot table and KNN fit) stay manageable to run.

In [7]:
# Keep only the first MAX_ROWS ratings so the dense pivot table built below
# stays manageable.
# A positional slice is used instead of `drop(data.index[MAX_ROWS:])`: drop
# works on index *labels*, so it would also remove earlier rows that happen to
# share a label with a later one. `.copy()` detaches the result from the full
# frame, matching drop()'s "new object" behaviour.
MAX_ROWS = 150000
data = data.iloc[:MAX_ROWS].copy()

Build the user × recipe rating matrix (one row per `user_id`, one column per recipe `name`, missing ratings filled with 0)

In [4]:
# Pivot the long ratings table into a wide matrix: one row per user_id, one
# column per recipe name, cell = rating. User/recipe pairs with no rating
# become 0 via fillna.
tmat = (
    data
    .pivot_table(values='rating', index='user_id', columns='name')
    .fillna(0)
)

Fit a k-nearest-neighbours model (cosine distance, brute-force search) on the user rows of the rating matrix

In [5]:
from sklearn.neighbors import NearestNeighbors

# Brute-force cosine-distance neighbour search over the rows of tmat
# (one row per user); fit() returns the estimator itself.
knn = NearestNeighbors(algorithm='brute', metric='cosine').fit(tmat.values)

# For every user row, fetch its 6 closest rows (distances and row positions).
user_neigh_dist, user_neigh_ind = knn.kneighbors(tmat.values, n_neighbors=6)

# Display positions first, then distances.
(user_neigh_ind, user_neigh_dist)

(array([[    0, 36197, 13474, 35677, 31509, 13303],
        [    1, 23541, 16523, 28017, 17658,  8647],
        [    2,  7859,  1719, 13140, 28748, 11980],
        ...,
        [51789,  8714, 34531, 34528, 34530, 34529],
        [34527, 34526, 34530, 34529, 34528, 34532],
        [29425, 51791, 44030, 31429, 45898, 26819]]),
 array([[0.00000000e+00, 7.54854831e-01, 7.54854831e-01, 7.54854831e-01,
         7.54854831e-01, 7.54854831e-01],
        [3.33066907e-16, 8.46297276e-01, 8.55087682e-01, 8.55087682e-01,
         8.56515424e-01, 8.69578913e-01],
        [0.00000000e+00, 4.54455274e-01, 4.54455274e-01, 5.63564220e-01,
         5.63564220e-01, 5.63564220e-01],
        ...,
        [0.00000000e+00, 9.00031235e-01, 1.00000000e+00, 1.00000000e+00,
         1.00000000e+00, 1.00000000e+00],
        [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
         1.00000000e+00, 1.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.0

In [2]:
def findSimilarUsers(user, n=5):
    """Print and return the ``n`` users whose rating rows are closest to ``user``.

    Uses two notebook-level objects: ``tmat`` (the user x recipe rating matrix,
    whose index holds the user ids) and ``knn`` (the fitted neighbour model).

    Parameters
    ----------
    user : user id exactly as it appears in ``tmat.index``.
    n : int, default 5
        Number of neighbours to report.

    Returns
    -------
    (distances, rows) : tuple of two 1-D numpy arrays
        ``distances[i]`` is the distance ``knn`` reported for the i-th
        neighbour. ``rows[i]`` is that neighbour's row position in ``tmat``
        **plus one** (1-based). This is the historical return contract of this
        function; subtract 1 before indexing ``tmat.values``.

    Raises
    ------
    KeyError
        If ``user`` has no row in ``tmat``.
    """
    if user not in tmat.index:
        raise KeyError('user ' + str(user) + ' has no row in the rating matrix')

    # tmat rows are ordered by user-id *label*; the row position must be looked
    # up and is not `user - 1`.
    user_row = tmat.index.get_loc(user)

    # One extra neighbour is requested because the query user is normally its
    # own nearest neighbour. Never request more rows than the matrix has.
    k = min(n + 1, len(tmat.index))
    neigh_users_dist, neigh_users_ind = knn.kneighbors(
        tmat.values[[user_row]], n_neighbors=k)
    neigh_users_dist = neigh_users_dist[0]
    neigh_users_ind = neigh_users_ind[0]

    # Drop the query user wherever it landed: with tied distances (identical
    # or all-zero rows) it is not guaranteed to be in first place.
    others = neigh_users_ind != user_row
    neigh_users_dist = neigh_users_dist[others][:n]
    neigh_users_ind = neigh_users_ind[others][:n]

    print('The top ' + str(n) + ' most similar users of user ' + str(user) + ' are:')
    for rank, (row, dist) in enumerate(zip(neigh_users_ind, neigh_users_dist), start=1):
        # Report the real user id stored in the index, not the row number.
        print('No.' + str(rank) + ": User ID: " + str(tmat.index[row]) + ", with distance " + str(dist))

    print("\n")

    return neigh_users_dist, neigh_users_ind + 1

In [3]:
def getRecommendations(num_recipes_recommended, avg_rating, userId):
    """Print (and return) the best-scoring recipes the user has not rated yet.

    Uses two notebook-level objects: ``tmat`` (its columns are the recipe
    names, in the same order as ``avg_rating``) and ``data`` (the long ratings
    table with ``user_id`` and ``name`` columns).

    Parameters
    ----------
    num_recipes_recommended : int
        Maximum number of recipes to recommend. It is additionally capped by
        the number of recipes with a positive score; 0 or less yields an empty
        list.
    avg_rating : 1-D array-like of float
        One aggregated score per column of ``tmat``.
    userId : user id whose already-rated recipes are excluded.

    Returns
    -------
    list
        Recommended recipe names, best score first. The same list is
        pretty-printed.
    """
    avg_rating = np.asarray(avg_rating, dtype=float)

    # Rank columns by score, best first. A stable sort keeps ties in column
    # order so the result is deterministic.
    ranked_ind = np.argsort(-avg_rating, kind='stable')
    # Keep only recipes with a positive score. Filtering by value (instead of
    # searching for the position of one particular zero entry) also works when
    # there is no zero at all and never lets a zero/NaN score slip through.
    ranked_ind = ranked_ind[avg_rating[ranked_ind] > 0]

    # Cannot recommend more recipes than were ranked.
    num_recipes_recommended = min(len(ranked_ind), num_recipes_recommended)

    # Recipes this user already rated; a set gives O(1) membership tests.
    seen = set(data.loc[data['user_id'] == userId, 'name'])

    recommended_recipes = []
    if num_recipes_recommended > 0:
        for recipe in tmat.columns[ranked_ind]:
            if recipe in seen:
                continue
            recommended_recipes.append(recipe)
            if len(recommended_recipes) == num_recipes_recommended:
                break

    pprint(recommended_recipes)
    return recommended_recipes

In [4]:
def recommend(userId, num_similar_users, num_recipes_recommended):
    """Print recipe recommendations for ``userId`` based on similar users.

    Steps: show what the user already rated, look up the nearest users with
    ``findSimilarUsers``, combine those users' rating rows into one score per
    recipe, and hand the scores to ``getRecommendations`` for printing.

    Parameters
    ----------
    userId : user id to recommend for.
    num_similar_users : int
        Number of neighbouring users whose ratings are combined.
    num_recipes_recommended : int
        Maximum number of recipes to print.

    Returns
    -------
    None
        Everything is reported through printed output.
    """
    print("User " + str(userId) + " has rated the following recipes: ")
    pprint(list(data[data['user_id'] == userId]['name']))
    print("\n")

    neigh_users_dist, neigh_users_ind = findSimilarUsers(userId, num_similar_users)

    # findSimilarUsers returns row positions shifted by +1; shift them back
    # before indexing tmat.values, otherwise every neighbour row is off by one.
    neigh_rows = np.asarray(neigh_users_ind, dtype=int) - 1

    # Weight each neighbour by similarity (1 - distance) so that closer users
    # count more. Weighting by the raw distance would favour the least similar
    # neighbour. Clip guards against tiny negative values from rounding.
    similarity = np.clip(1.0 - np.asarray(neigh_users_dist, dtype=float), 0.0, None)
    total_similarity = similarity.sum()
    if total_similarity > 0:
        weights = similarity / total_similarity
    elif similarity.size:
        # Every neighbour has zero similarity: fall back to a plain average
        # instead of dividing by zero.
        weights = np.full(similarity.shape, 1.0 / similarity.size)
    else:
        weights = similarity

    # Weighted average rating per recipe: (k,) @ (k, n_recipes) -> (n_recipes,)
    avg_rating = weights @ tmat.values[neigh_rows]

    print("Based on other users rating, we recommend:")

    getRecommendations(num_recipes_recommended, avg_rating, userId)


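Example run (a manual smoke test, not an automated unit test). Run every cell above, in order, before this one: the `NameError: name 'knn' is not defined` in the saved output below appears to come from a kernel session in which the KNN cell had not been executed.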

In [8]:
# Smoke run: 10 recommendations for user 3787 drawn from its 5 nearest users.
recommend(
    userId=3787,
    num_similar_users=5,
    num_recipes_recommended=10,
)

User 3787 has rated the following recipes: 
['white bean   green chile pepper soup',
 'black beans with mango sauce',
 'warm spinach salad',
 'roasted asparagus   shiitake mushrooms',
 'low carb key lime pie',
 'slow cooker hoppin  john',
 'west indian pumpkin soup']




NameError: name 'knn' is not defined