# Collaborative Filtering with KNN
Author: Shiyi Wang

In [2]:
import pandas as pd
import numpy as np
from collections import Counter
from pprint import pprint

Load dataset

In [5]:
data = pd.read_pickle('../data/processed_data.pkl')

Unnamed: 0,user_id,recipe_id,rating,name
0,38094,40893,4,white bean green chile pepper soup
1,1293707,40893,5,white bean green chile pepper soup
3,126440,85009,5,baked potato toppings
4,57222,85009,5,baked potato toppings
5,52282,120345,4,sugared raspberries
...,...,...,...,...
1132360,2002357020,82303,5,easy microwave hot fudge topping
1132361,102526,54493,0,garlic clove chicken
1132364,157126,78003,5,pot roast with port stove top
1132365,53932,78003,4,pot roast with port stove top


Trancates dataset size for proper runs.

In [3]:
data = data.drop(data.index[100000:])

Calculate transposed matrix

In [4]:
tmat = data.pivot_table(index = 'user_id', columns = 'name',values = 'rating').fillna(0)

Apply KNN to train our model

In [5]:
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(tmat.values)
user_neigh_dist, user_neigh_ind = knn.kneighbors(tmat.values, n_neighbors=6)
user_neigh_ind, user_neigh_dist

(array([[    0,  9133, 28108, 26964, 27122, 32317],
        [    1, 13332,  7111,  2923, 18836, 15419],
        [    2, 10672, 22745,  8582, 26058, 20475],
        ...,
        [18632, 26995, 16361,  8428, 35910, 10344],
        [25782, 25780, 25783, 25785, 25781, 25779],
        [29973, 38671, 33452, 34736, 12792, 21308]]),
 array([[0.        , 0.69343033, 0.69343033, 0.69343033, 0.69343033,
         0.69343033],
        [0.        , 0.820626  , 0.8385634 , 0.85321519, 0.85354174,
         0.85354174],
        [0.        , 0.2       , 0.2       , 0.2       , 0.2       ,
         0.2       ],
        ...,
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        ],
        [1.        , 1.        , 1.        , 1.        , 1.        ,
         1.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        ]]))

Pack into API

In [6]:
def findSimilarUsers(user, n=5):
        # get neigh_users_dist, neigh_users_ind 
        neigh_users_dist, neigh_users_ind = knn.kneighbors(
            np.asarray([tmat.values[user - 1]]), n_neighbors=n + 1)
        # header helper print
        print('The top ' + str(n) +  ' most similar users of user ' + str(user) + ' are:')
        # iterate all and print the distances
        for i in range(1, len(neigh_users_dist[0])):
            print('No.' + str(i) + ": User ID: " + str(neigh_users_ind[0][i]+1) + ", with distance " + str(neigh_users_dist[0][i]))

        print("\n")

        return neigh_users_dist.flatten()[1:], neigh_users_ind.flatten()[1:] + 1

In [7]:
def getRecommendations(num_recipes_recommended, avg_rating, userId):
        # clean up the zero ratings
        zero_rating = np.where(avg_rating == 0)[0][-1]
        ranked_ind = np.argsort(avg_rating)[::-1]
        ranked_ind = ranked_ind[:list(ranked_ind).index(zero_rating)]
        # check input validity by comparing with recipes we have
        num_recipes_recommended = min(len(ranked_ind), num_recipes_recommended)
        # store seen recipes in a list
        seen = list(data[data['user_id'] == userId]['name'])
        recipes = list(tmat.columns[ranked_ind])
        # recommended count
        count = 0
        # store recommended recipes in the list
        recommended_recipes = []
        for recipe in recipes:
            # only append if not seen
            if recipe not in seen:
                recommended_recipes.append(recipe)
                count += 1
            if count == num_recipes_recommended:
                break

        pprint(recommended_recipes)

In [8]:
def recommend(userId, num_similar_users, num_recipes_recommended):

    print("User " + str(userId) + " has rated the following recipes: ")
    pprint(list(data[data['user_id'] == userId]['name']))
    print("\n")
    
    # retrieve neigh_users_dist and neigh_users_ind
    neigh_users_dist, neigh_users_ind = findSimilarUsers(userId, num_similar_users)
    # weight each distance based on the total distances
    weighted_user_neigh_dist = neigh_users_dist / np.sum(neigh_users_dist)
    # Broadcasting
    weighted_user_neigh_dist = weighted_user_neigh_dist[:, np.newaxis] + np.zeros(len(tmat.columns))
    # Calculate the average rating
    avg_rating =  (weighted_user_neigh_dist * tmat.values[neigh_users_ind]).sum(axis=0)
    # helper print function
    print("Based on other users rating, we recommend:")
    
    getRecommendations(num_recipes_recommended, avg_rating, userId)


Unit Test

In [9]:
recommend(userId = 38094, num_similar_users = 5, num_recipes_recommended = 5)

User 38094 has rated the following recipes: 
['white bean   green chile pepper soup',
 'black beans with mango sauce',
 'warm spinach salad',
 'roasted asparagus   shiitake mushrooms']


The top 5 most similar users of user 38094 are:
No.1: User ID: 6157, with distance 0.49748109237039395
No.2: User ID: 10430, with distance 0.5
No.3: User ID: 6995, with distance 0.7441777449516747
No.4: User ID: 16687, with distance 0.8222204663663055
No.5: User ID: 13401, with distance 0.8636138186025047


Based on other users rating, we recommend:
['chinese buffet style donuts',
 'bacon wrapped chicken  oamc',
 'taco bell taco sauce',
 'double chocolate pudding',
 'chocolate raspberry cobbler cake']
