In [1]:
# make necessary imports
from math import sqrt
import sys, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics as metrics

from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation, cosine
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_squared_error

from contextlib import contextmanager

In [2]:
# Toy user-item ratings matrix.
# rows    - users
# columns - items
# cells   - integer ratings on a 1-10 scale; 0 marks an item the user has not rated
ratings_M = np.asarray([[3,7,4,9,9,7],
                        [7,0,5,3,8,8],
                        [7,5,5,0,8,4],
                        [5,6,8,5,9,8],
                        [5,8,8,8,10,9],
                        [7,7,0,4,7,8]])

df_ratings_M = pd.DataFrame(ratings_M)

# Later cells pass the matrix around under the short name `M`; bind it here so
# they do not raise NameError on a fresh kernel (Restart & Run All).
M = df_ratings_M

In [3]:
# Notebook-wide defaults for the similarity metric and the neighbourhood size.
# (A `global` statement is a no-op at module level, so plain assignments suffice.)
# NOTE: functions below that declare `metric=metric` / `k=k` capture these values
# when their `def` cell runs; re-run those cells after changing the defaults.
metric = 'cosine'
k      = 3

In [4]:
df_ratings_M

Unnamed: 0,0,1,2,3,4,5
0,3,7,4,9,9,7
1,7,0,5,3,8,8
2,7,5,5,0,8,4
3,5,6,8,5,9,8
4,5,8,8,8,10,9
5,7,7,0,4,7,8


# (A) User-based Recommendation Systems

In [5]:
# Cosine similarity between every pair of users (rows of the ratings matrix).
# pairwise_distances returns cosine *distances*, so each similarity is
# obtained as 1 - distance.
cosine_sim = 1 - pairwise_distances(df_ratings_M, metric="cosine")

In [7]:
cosine_sim.shape

(6, 6)

In [6]:
cosine_sim

array([[1.        , 0.79926798, 0.77922652, 0.93462168, 0.97388994,
        0.88460046],
       [0.79926798, 1.        , 0.87474441, 0.90584982, 0.86614571,
        0.82703601],
       [0.77922652, 0.87474441, 1.        , 0.90951269, 0.86545388,
        0.85327496],
       [0.93462168, 0.90584982, 0.90951269, 1.        , 0.98934361,
        0.86561362],
       [0.97388994, 0.86614571, 0.86545388, 0.98934361, 1.        ,
        0.88164025],
       [0.88460046, 0.82703601, 0.85327496, 0.86561362, 0.88164025,
        1.        ]])

In [8]:
# Cosine similarity matrix
pd.DataFrame(cosine_sim)

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.799268,0.779227,0.934622,0.97389,0.8846
1,0.799268,1.0,0.874744,0.90585,0.866146,0.827036
2,0.779227,0.874744,1.0,0.909513,0.865454,0.853275
3,0.934622,0.90585,0.909513,1.0,0.989344,0.865614
4,0.97389,0.866146,0.865454,0.989344,1.0,0.88164
5,0.8846,0.827036,0.853275,0.865614,0.88164,1.0


In [9]:
# Pearson-correlation similarity between every pair of users (rows);
# as with cosine, the "correlation" metric is a distance, so similarity = 1 - distance.
pearson_sim = 1 - pairwise_distances(df_ratings_M, metric="correlation")

In [10]:
# Pearson correlation similarity matrix
pd.DataFrame(pearson_sim)

Unnamed: 0,0,1,2,3,4,5
0,1.0,-0.137446,-0.357398,0.208179,0.761905,0.27735
1,-0.137446,1.0,0.453897,0.51591,0.112456,0.218328
2,-0.357398,0.453897,1.0,0.451378,-0.042888,0.297373
3,0.208179,0.51591,0.451378,1.0,0.763325,-0.057739
4,0.761905,0.112456,-0.042888,0.763325,1.0,0.039621
5,0.27735,0.218328,0.297373,-0.057739,0.039621,1.0


#### Understanding `numpy.reshape`

In [11]:
z = np.array([[1, 2, 3, 4],
              [5, 6, 7, 8],
              [9, 10, 11, 12]])
z.shape

(3, 4)

In [12]:
# reshape(-1) flattens the array: the new shape is (12,), which holds the same
# 12 elements as the original (3, 4) array.
z.reshape(-1)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [13]:
# reshape(-1, 1): the column count is fixed at 1 and the row count is inferred,
# giving shape (12, 1) - again compatible with the original (3, 4) array.
z.reshape(-1, 1)

array([[ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9],
       [10],
       [11],
       [12]])

In [14]:
# reshape(-1, 2): rows inferred, 2 columns -> shape (6, 2).
z.reshape(-1, 2)

array([[ 1,  2],
       [ 3,  4],
       [ 5,  6],
       [ 7,  8],
       [ 9, 10],
       [11, 12]])

In [15]:
# Now leave the column count unknown instead:
# reshape(1, -1) fixes 1 row and infers the columns -> shape (1, 12).

z.reshape(1, -1)

array([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12]])

In [16]:
# reshape(2, -1): 2 rows, columns inferred -> shape (2, 6).
z.reshape(2, -1)

array([[ 1,  2,  3,  4,  5,  6],
       [ 7,  8,  9, 10, 11, 12]])

In [17]:
# reshape(3, -1): 3 rows, columns inferred -> shape (3, 4), i.e. the original shape.
z.reshape(3, -1)

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12]])

In [18]:
# Finally, at most one dimension may be left unknown: passing -1 for both,
# i.e. a new shape of (-1, -1), raises a ValueError.
# z.reshape(-1, -1)

... back to main code 

In [23]:
# Nearest-neighbour lookup of the users most similar to a given user.
# The similarities agree with those derived from pairwise_distances above.
def findksimilarusers(user_id, ratings, metric = metric, k=3):
    """Print and return the nearest neighbours of one user.

    A brute-force NearestNeighbors model is fitted on `ratings` and queried
    with the row at position ``user_id - 1``. ``k + 1`` neighbours are
    requested because the query row is itself part of the fitted data; the
    entry whose index maps back to `user_id` is left out of the printout.

    Parameters
    ----------
    user_id : 1-based user number (row ``user_id - 1`` of `ratings`).
    ratings : DataFrame with one row per user and one column per item.
    metric  : distance metric forwarded to NearestNeighbors.
    k       : number of neighbours to report.

    Returns
    -------
    (similarities, indices) : `similarities` is the flattened array of
        ``1 - distance`` values; `indices` is the index array exactly as
        returned by ``kneighbors`` (0-based row positions, query row included).
    """
    knn = NearestNeighbors(metric = metric, algorithm = 'brute')
    knn.fit(ratings)

    query_row = ratings.iloc[user_id-1, :].values.reshape(1, -1)
    distances, indices = knn.kneighbors(query_row, n_neighbors = k+1)

    # distances -> similarities
    similarities = 1 - distances.flatten()

    print ('{0} most similar users for User {1}:\n'.format(k, user_id))

    # The printed rank is the position in the neighbour list.
    for rank, (neighbour, similarity) in enumerate(zip(indices.flatten(), similarities.flatten())):
        if neighbour+1 != user_id:
            print ('{0}: User {1}, with similarity of {2}'.format(rank, neighbour+1, similarity))

    return similarities, indices

In [24]:
similarities,indices = findksimilarusers(1, df_ratings_M, metric='cosine')

3 most similar users for User 1:

1: User 5, with similarity of 0.9738899354018393
2: User 4, with similarity of 0.934621684178377
3: User 6, with similarity of 0.8846004572297814


In [25]:
similarities,indices = findksimilarusers(1, df_ratings_M, metric='correlation')

3 most similar users for User 1:

1: User 5, with similarity of 0.7619047619047619
2: User 6, with similarity of 0.2773500981126146
3: User 4, with similarity of 0.20817945092665124


In [113]:
# Predict the rating for a user-item pair with the user-based approach.
def predict_userbased(user_id,
                      item_id,
                      ratings,
                      metric = metric,
                      k=k):
    """Predict a rating as the user's mean plus a weighted neighbour deviation.

    For every neighbour returned by `findksimilarusers` (other than the target
    user) the deviation of that neighbour's rating of the item from the
    neighbour's own mean rating is weighted by the similarity. The weighted
    average deviation is added to the target user's mean rating and rounded.

    Parameters
    ----------
    user_id : 1-based user number.
    item_id : 1-based item number.
    ratings : DataFrame with one row per user and one column per item.
    metric  : distance metric forwarded to `findksimilarusers`.
    k       : number of neighbours to use.

    Returns
    -------
    int : the rounded predicted rating (also printed).
    """
    # similar users according to the requested metric
    similarities, indices = findksimilarusers(user_id, ratings, metric, k)

    # Positional lookup (user ids are 1-based), consistent with the .iloc
    # lookups used for the neighbours below and inside findksimilarusers.
    mean_rating = ratings.iloc[user_id-1, :].mean()

    wtd_sum = 0
    sum_wt  = 0

    for neighbour, similarity in zip(indices.flatten(), similarities):
        if neighbour+1 == user_id:
            continue
        ratings_diff = ratings.iloc[neighbour, item_id-1] - np.mean(ratings.iloc[neighbour, :])
        wtd_sum += ratings_diff * similarity
        # Accumulate the weights of the neighbours actually used instead of
        # assuming the target user is in the list with similarity exactly 1.
        sum_wt += similarity

    if sum_wt == 0:
        # No usable neighbour weight: fall back to the user's own mean rating.
        prediction = int(round(mean_rating))
    else:
        prediction = int(round(mean_rating + (wtd_sum/sum_wt)))

    print ('\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction))

    return prediction

In [114]:
# Predict user 3's rating of item 4 (arguments: user_id, item_id - both 1-based).
predict_userbased(3, 4, M);

4 most similar users for User 3:

1: User 4, with similarity of 0.9095126893401909
2: User 2, with similarity of 0.8747444148494656
3: User 5, with similarity of 0.8654538781497916
4: User 6, with similarity of 0.853274963343837

Predicted rating for user 3 -> item 4: 3


# (B) Item-based Recommendation Systems

In [115]:
# Nearest-neighbour lookup of the items most similar to a given item.
# The user-item matrix is transposed first, so that
# rows    - denote items
# columns - users
# cells   - ratings
def findksimilaritems(item_id, ratings, metric=metric, k=k):
    """Print and return the nearest neighbours of one item.

    `ratings` (users x items) is transposed, a brute-force NearestNeighbors
    model is fitted on the item rows and queried with the row at position
    ``item_id - 1``. ``k + 1`` neighbours are requested because the query row
    is itself part of the fitted data; the entry whose index maps back to
    `item_id` is left out of the printout.

    Parameters
    ----------
    item_id : 1-based item number.
    ratings : DataFrame with one row per user and one column per item.
    metric  : distance metric forwarded to NearestNeighbors.
    k       : number of neighbours to report.

    Returns
    -------
    (similarities, indices) : `similarities` is the flattened array of
        ``1 - distance`` values; `indices` is the index array exactly as
        returned by ``kneighbors`` (0-based item positions, query row included).
    """
    item_vectors = ratings.T

    knn = NearestNeighbors(metric = metric, algorithm = 'brute')
    knn.fit(item_vectors)

    query_row = item_vectors.iloc[item_id-1, :].values.reshape(1, -1)
    distances, indices = knn.kneighbors(query_row, n_neighbors = k+1)
    similarities = 1-distances.flatten()

    print ('{0} most similar items for item {1}:\n'.format(k, item_id))

    # The printed rank is the position in the neighbour list.
    for rank, (neighbour, similarity) in enumerate(zip(indices.flatten(), similarities.flatten())):
        if neighbour+1 != item_id:
            print ('{0}: Item {1} :, with similarity of {2}'.format(rank, neighbour+1, similarity))

    return similarities, indices

In [116]:
similarities, indices=findksimilaritems(3,M)

4 most similar items for item 3:

1: Item 5 :, with similarity of 0.9183361255345219
2: Item 6 :, with similarity of 0.8747597730381951
3: Item 1 :, with similarity of 0.8103647462221737
4: Item 4 :, with similarity of 0.7969178003023933


In [118]:
# Predict the rating for a user-item pair with the item-based approach.
def predict_itembased(user_id,
                      item_id,
                      ratings,
                      metric = metric,
                      k=k):
    """Predict a rating as the similarity-weighted mean of the user's ratings
    of the items most similar to `item_id`.

    Parameters
    ----------
    user_id : 1-based user number.
    item_id : 1-based item number.
    ratings : DataFrame with one row per user and one column per item.
    metric  : distance metric forwarded to `findksimilaritems`.
    k       : number of neighbouring items to use.

    Returns
    -------
    int : the rounded predicted rating (also printed).
    """
    # Similar items for the requested metric and neighbourhood size. Both are
    # forwarded explicitly; previously they were accepted but silently ignored.
    similarities, indices = findksimilaritems(item_id, ratings, metric, k)

    wtd_sum = 0
    sum_wt  = 0

    for neighbour, similarity in zip(indices.flatten(), similarities):
        if neighbour+1 == item_id:
            continue
        wtd_sum += ratings.iloc[user_id-1, neighbour] * similarity
        # Accumulate the weights of the neighbours actually used instead of
        # assuming the target item is in the list with similarity exactly 1.
        sum_wt += similarity

    prediction = int(round(wtd_sum/sum_wt))

    print ('\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction))

    return prediction

In [119]:
# Predict user 1's rating of item 3 (arguments: user_id, item_id - both 1-based).
prediction = predict_itembased(1, 3, M)

4 most similar items for item 3:

1: Item 5 :, with similarity of 0.9183361255345219
2: Item 6 :, with similarity of 0.8747597730381951
3: Item 1 :, with similarity of 0.8103647462221737
4: Item 4 :, with similarity of 0.7969178003023933

Predicted rating for user 1 -> item 3: 7


In [120]:
# Compute the adjusted cosine similarity matrix for items.
def computeAdjCosSim(M):
    """Return the item-item adjusted cosine similarity matrix.

    Each rating is centred on the mean of its row (user) - the mean is taken
    over the whole row, zeros included - and two items are compared only over
    the rows in which both ratings are non-zero.

    All look-ups are positional, so the result does not depend on the index
    or column labels of `M`.

    Parameters
    ----------
    M : DataFrame (or anything ``pd.DataFrame`` accepts) with one row per
        user and one column per item; 0 is treated as "not rated".

    Returns
    -------
    pandas.DataFrame : square, symmetric matrix with default integer labels,
        ones on the diagonal, and 0 wherever the denominator is zero (for
        example when two items share no co-rated rows).
    """
    frame   = pd.DataFrame(M)
    values  = frame.to_numpy(dtype=float)
    n_items = values.shape[1]

    rated      = values != 0
    row_means  = frame.mean(axis=1).to_numpy(dtype=float)
    deviations = values - row_means[:, np.newaxis]

    # Identity start: diagonal is 1 and pairs with a zero denominator stay 0.
    sim_matrix = np.eye(n_items)

    for i in range(n_items):
        for j in range(i + 1, n_items):
            co_rated = rated[:, i] & rated[:, j]
            dev_i = deviations[co_rated, i]
            dev_j = deviations[co_rated, j]

            den = (np.sum(dev_i ** 2) ** 0.5) * (np.sum(dev_j ** 2) ** 0.5)
            if den != 0:
                # The matrix is symmetric, so fill both triangles at once.
                sim_matrix[i, j] = sim_matrix[j, i] = np.sum(dev_i * dev_j) / den

    return pd.DataFrame(sim_matrix)

In [121]:
adjcos_sim = computeAdjCosSim(M)

In [122]:
adjcos_sim

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.2369,0.4213,-0.5191,-0.1259,0.0101
1,0.2369,1.0,-0.8052,0.0857,0.2373,0.5206
2,0.4213,-0.8052,1.0,-0.7679,-0.2305,-0.0536
3,-0.5191,0.0857,-0.7679,1.0,-0.2991,-0.6446
4,-0.1259,0.2373,-0.2305,-0.2991,1.0,0.5992
5,0.0101,0.5206,-0.0536,-0.6446,0.5992,1.0


In [124]:
# Find the items most similar to item_id using adjusted cosine similarity.
def findksimilaritems_adjcos(item_id, ratings, k=k):
    """Print and return the top ``k + 1`` adjusted-cosine matches for an item.

    The column of the adjusted cosine matrix belonging to `item_id` is sorted
    in descending order and truncated to ``k + 1`` entries; the entry that
    maps back to `item_id` itself is left out of the printout.

    Parameters
    ----------
    item_id : 1-based item number.
    ratings : user-item ratings, passed on to `computeAdjCosSim`.
    k       : number of neighbouring items to report.

    Returns
    -------
    (similarities, indices) : the similarity values as an array and the
        matching 0-based item labels as a pandas Index, both in descending
        order of similarity.
    """
    ranked = computeAdjCosSim(ratings)[item_id-1].sort_values(ascending=False)
    top    = ranked.iloc[:k+1]

    similarities = top.values
    indices      = top.index

    print ('{0} most similar items for item {1}:\n'.format(k,item_id))

    # The printed rank is the position in the sorted list.
    for rank, (neighbour, similarity) in enumerate(zip(indices, similarities)):
        if neighbour+1 != item_id:
            print ('{0}: Item {1} :, with similarity of {2}'.format(rank, neighbour+1, similarity))

    return similarities ,indices

In [125]:
# Items most similar to item 3 (1-based item_id) under adjusted cosine similarity.
similarities, indices = findksimilaritems_adjcos(3, M)

4 most similar items for item 3:

1: Item 1 :, with similarity of 0.42126273187113467
2: Item 6 :, with similarity of -0.05363989048891138
3: Item 5 :, with similarity of -0.2305213582693748
4: Item 4 :, with similarity of -0.7679410465751941


- This function predicts the rating for specified user-item combination for __adjusted cosine__ item-based approach

- As the adjusted cosine similarities range from -1 to +1, the predicted rating can sometimes be negative or greater than the maximum value

- Hack to deal with this: 
    - Rating is set to min if prediction is negative, 
    - Rating is set to max if prediction is above max

In [126]:
def predict_itembased_adjcos(user_id,
                             item_id,
                             ratings,
                             k=k):
    """Predict a rating with the item-based approach and adjusted cosine
    similarities, clamped to the valid 1-10 rating range.

    Parameters
    ----------
    user_id : 1-based user number.
    item_id : 1-based item number.
    ratings : DataFrame with one row per user and one column per item.
    k       : number of neighbouring items to use (forwarded to
              `findksimilaritems_adjcos`).

    Returns
    -------
    int : the rounded, clamped predicted rating (also printed).
    """
    # similar items based on adjusted cosine similarity
    similarities, indices = findksimilaritems_adjcos(item_id, ratings, k)

    wtd_sum = 0
    sum_wt  = 0

    for neighbour, similarity in zip(indices, similarities):
        if neighbour+1 == item_id:
            continue
        wtd_sum += ratings.iloc[user_id-1, neighbour] * similarity
        # Accumulate the weights of the neighbours actually used instead of
        # assuming the target item is in the list with similarity exactly 1.
        sum_wt += similarity

    # NOTE(review): the similarities are signed, so they can cancel and leave
    # sum_wt zero or very small. Dividing by the sum of absolute similarities
    # is the usual remedy - confirm before changing, as it alters predictions.
    prediction = int(round(wtd_sum/sum_wt))

    # Clamp to the rating scale. 0 is not a valid rating (it marks "unrated"
    # in the matrix), so anything below 1 is raised to the minimum.
    if prediction < 1:
        prediction = 1
    elif prediction > 10:
        prediction = 10
    print ('\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction) )

    return prediction

In [127]:
prediction=predict_itembased_adjcos(3,4,M)

4 most similar items for item 4:

1: Item 2 :, with similarity of 0.08574143411490752
2: Item 5 :, with similarity of -0.2990588277904165
3: Item 1 :, with similarity of -0.5190852688949424
4: Item 6 :, with similarity of -0.6445502869540708

Predicted rating for user 3 -> item 4: 6


In [128]:
adjcos_sim

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.2369,0.4213,-0.5191,-0.1259,0.0101
1,0.2369,1.0,-0.8052,0.0857,0.2373,0.5206
2,0.4213,-0.8052,1.0,-0.7679,-0.2305,-0.0536
3,-0.5191,0.0857,-0.7679,1.0,-0.2991,-0.6446
4,-0.1259,0.2373,-0.2305,-0.2991,1.0,0.5992
5,0.0101,0.5206,-0.0536,-0.6446,0.5992,1.0


The function below uses the prediction functions above to recommend items with the selected approach.

A recommendation is made if the predicted rating for an item is greater than or equal to 6 and the item has not already been rated.

In [129]:
def recommendItem(user_id, item_id, ratings):
    """Show a dropdown of CF approaches and print a recommendation verdict.

    When an approach is picked, the rating of `item_id` for `user_id` is
    predicted with that approach. The item is reported as already rated if
    its stored rating is non-zero, recommended if the prediction is at
    least 6, and not recommended otherwise.

    Parameters
    ----------
    user_id : 1-based user number; must be an int within the rows of `ratings`.
    item_id : 1-based item number; must be an int within the columns of `ratings`.
    ratings : DataFrame with one row per user and one column per item.

    Returns
    -------
    None. Invalid ids only print a message; no widget is shown.
    """
    n_users, n_items = ratings.shape

    # Check the type first: comparing a non-int (e.g. a str) with `<` would raise.
    if type(user_id) is not int or user_id < 1 or user_id > n_users:
        print ('Userid does not exist. Enter numbers from 1-{0}'.format(n_users))
        return
    if type(item_id) is not int or item_id < 1 or item_id > n_items:
        print ('Itemid does not exist. Enter numbers from 1-{0}'.format(n_items))
        return

    ids = ['User-based CF (cosine)',
           'User-based CF (correlation)',
           'Item-based CF (cosine)',
           'Item-based CF (adjusted cosine)']

    # Widget sizing goes through `layout`; Dropdown has no `width` trait.
    approach = widgets.Dropdown(options=ids,
                                value=ids[0],
                                description='Select Approach',
                                layout=widgets.Layout(width='500px'))

    def on_change(change):
        """Predict with the newly selected approach and print the verdict."""
        clear_output(wait=True)
        if approach.value == 'User-based CF (cosine)':
            prediction = predict_userbased(user_id, item_id, ratings, 'cosine')
        elif approach.value == 'User-based CF (correlation)':
            prediction = predict_userbased(user_id, item_id, ratings, 'correlation')
        elif approach.value == 'Item-based CF (cosine)':
            # Pass the metric explicitly so the result matches the label.
            prediction = predict_itembased(user_id, item_id, ratings, 'cosine')
        else:
            prediction = predict_itembased_adjcos(user_id, item_id, ratings)

        # Positional lookup (row = user, column = item); non-zero means rated.
        if ratings.iloc[user_id-1, item_id-1] != 0:
            print ('Item already rated')
        elif prediction >= 6:
            print ('\nItem recommended')
        else:
            print ('Item not recommended')

    # React to changes of the selected value only, not to every trait change.
    approach.observe(on_change, names='value')
    display(approach)

In [130]:
# Sanity check: an out-of-range user id (-1) is rejected with a message.
recommendItem(-1,3,M)

Userid does not exist. Enter numbers from 1-6


In [133]:
recommendItem(3,4,M)
