# II. Recommender Systems

## 1. Basics of Recommendation Algorithm

In [64]:
from scipy.spatial.distance import cosine
import sklearn.metrics as metrics
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation, cosine
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_squared_error
import numpy as np
from numpy import nan as NaN

In [112]:
M_verbose = pd.DataFrame({
    "Desperados": [4, 1, NaN, 4],
    "Guinness": [3.0, 2.0, 2.0, 3.0],
    "Chimay Triple": [2, 3, 1, NaN],
    "Leffe": [3, 1, NaN, NaN]
})

M_verbose.index = ['ICT', 'Med', 'Business', 'Enviro']

M = np.array([
    [4, 3.0, 2, 3], 
    [1, 2, 3, 1],
    [NaN, 2, 1, NaN],
    [4, 3, NaN, NaN]
])

M_df = pd.DataFrame(data=arr)

### Compute similarities

#### Cosine

In [126]:
M[:,1]

array([3., 2., 2., 3.])

In [1]:
import math

def cosine_similarity(v1,v2, metric='cosine'):
    #metric: cosine or correlation
    if metric == 'correlation':
        v1 = v1 - np.nanmean(v1)
        v2 = v2 - np.nanmean(v2)
    "compute similarity of v1 to v2: (v1 dot v2)/{||v1||*||v2||)"
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(len(v1)):
        x = v1[i]; y = v2[i]
        if np.isnan(x) or np.isnan(y): continue
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y
    return sumxy/math.sqrt(sumxx*sumyy)

def sim_matrix(M, dimension='user', metric='cosine'):
    N = M.shape[0] if dimension == 'user' else M.shape[1]
    sim = np.zeros([N,N])
    for i in range(N):
        for j in range(N):
            if i == j:
                sim[i,j] = 0 #Cancel out the effect of self-similarity in the sums later
                continue
            if dimension == 'user':
                # If user, vector 1 and vector 2 = current iterations (i, j) of users to compare
                v1, v2 = M[i,:], M[j,:]
            else:
                # If item, vector 1 and vector 2 = current iterations (i, j) of drinks to compare
                v1, v2 = M[:,i], M[:,j]
                
            sim[i][j] = cosine_similarity(v1,v2,metric)
    return sim

In [161]:
cosine_similarity(M[0,:], M[2,:], 'cosine')

0.9922778767136677

In [162]:
M[0,:]

array([4., 3., 2., 3.])

In [163]:
M[2,:]

array([nan,  2.,  1., nan])

In [164]:
sim_matrix(M, 'user')

array([[0.        , 0.79582243, 0.99227788, 1.        ],
       [0.79582243, 0.        , 0.86824314, 0.89442719],
       [0.99227788, 0.86824314, 0.        , 1.        ],
       [1.        , 0.89442719, 1.        , 0.        ]])

In [165]:
sim_matrix(M, 'item')

array([[0.        , 0.9649505 , 0.73994007, 0.99705449],
       [0.9649505 , 0.        , 0.90748521, 0.96476382],
       [0.73994007, 0.90748521, 0.        , 0.78935222],
       [0.99705449, 0.96476382, 0.78935222, 0.        ]])

#### Pearson

In [166]:
cosine_similarity(M[0,:], M[2,:], 'correlation')

0.7071067811865475

In [167]:
sim_matrix(M, 'user', 'correlation')

array([[ 0.        , -0.85280287,  0.70710678,  0.70710678],
       [-0.85280287,  0.        , -0.5547002 , -0.89442719],
       [ 0.70710678, -0.5547002 ,  0.        , -1.        ],
       [ 0.70710678, -0.89442719, -1.        ,  0.        ]])

In [168]:
sim_matrix(M, 'item', 'correlation')

array([[ 0.        ,  0.94280904, -0.89442719,  0.9486833 ],
       [ 0.94280904,  0.        ,  0.        ,  1.        ],
       [-0.89442719,  0.        ,  0.        , -0.70710678],
       [ 0.9486833 ,  1.        , -0.70710678,  0.        ]])

### a) Compute the missing rating in this table using user-based collaborative filtering (CF). (Use cosine similarity, then use Pearson similarity). Assume taking all neighbors

In [169]:
# Axis 0 = rows, axis 1 = 
print("Medicine's ratings")
print(M[1,:])

print("Guiness's ratings")
print(M[:,1])

Medicine's ratings
[1. 2. 3. 1.]
Guiness's ratings
[3. 2. 2. 3.]


### def user_cf(M, metric='cosine'):
    pred = np.copy(M)
    n_users, n_items = M.shape
    # Average rating each user gave beers (ignoring nans)
    avg_ratings = np.nanmean(M, axis=1)
    sim_users = sim_matrix(M, 'user', metric)
    
    
    for i in range(n_users):
        for j in range(n_items):
            if np.isnan(M[i,j]):

                # Can include self sim because 0 will cancel itself
                similarities = sim_users[i]
                item_ratings = M[:, j]
                
                numerator = np.nansum(similarities * (item_ratings - avg_ratings))
                denominator = np.nansum(similarities)
                mean_rating = avg_ratings[i]
                
#                 print('\nMean rating: ', mean_rating)
#                 print('Numerator: ', numerator)
#                 print('Denominator: ', denominator)
                
                pred[i,j] = mean_rating + (numerator / denominator)

    return pred

In [206]:
print("User-based CF (Cosine): \n" + str(pd.DataFrame(user_cf(M, 'cosine'))))
print("User-based CF (Pearson): \n" + str(pd.DataFrame(user_cf(M, 'correlation'))))

User-based CF (Cosine): 
          0    1         2         3
0  4.000000  3.0  2.000000  3.000000
1  1.000000  2.0  3.000000  1.000000
2  1.794036  2.0  1.000000  1.272355
3  4.000000  3.0  3.368034  3.268237
User-based CF (Pearson): 
          0    1         2         3
0  4.000000  3.0  2.000000  3.000000
1  1.000000  2.0  3.000000  1.000000
2  0.764822  2.0  1.000000  1.009169
3  4.000000  3.0  4.616077  2.935013


### b) Similarly, computing the missing rating using item-based CF.

In [214]:
def item_cf(M, metric='cosine'):
    pred = np.copy(M)
    n_users, n_items = M.shape
    avg_ratings = np.nanmean(M, axis=0)
    sim_items = sim_matrix(M, 'item', metric)
    for i in range(n_users):
        for j in range(n_items):
            if np.isnan(M[i,j]):
                
                similarities = sim_items[j]

                users_other_ratings = M[i,:]

                mean_rating = avg_ratings[j]
                numerator = np.nansum((sim_items[j] * (users_other_ratings - avg_ratings)))
                denominator = np.nansum(sim_items[j])
                
                pred[i,j] = mean_rating + (numerator / denominator)
                
    return pred

In [215]:
print("Item-based CF (Cosine): \n" + str(pd.DataFrame(item_cf(M, 'cosine'))))
print("Item-based CF (Pearson): \n" + str(pd.DataFrame(item_cf(M, 'correlation'))))

Item-based CF (Cosine): 
         0    1         2         3
0  4.00000  3.0  2.000000  3.000000
1  1.00000  2.0  3.000000  1.000000
2  2.54758  2.0  1.000000  1.537748
3  4.00000  3.0  2.489861  2.537748
Item-based CF (Pearson): 
          0    1         2        3
0  4.000000  3.0  2.000000  3.00000
1  1.000000  2.0  3.000000  1.00000
2  3.424268  2.0  1.000000  2.16681
3  4.000000  3.0  2.558482  3.16681


## 2. Evaluating Recommendation Algorithms

### Predictive Accuracy

In [218]:
M_result = np.asarray([[4,3,2,3], 
                [1,2,3,1],
                [1,2,1,2],
                [4,3,2,4]])
pd.DataFrame(M_result)

Unnamed: 0,0,1,2,3
0,4,3,2,3
1,1,2,3,1
2,1,2,1,2
3,4,3,2,4


In [221]:
from math import sqrt
def evaluateRS(ratings, groundtruth, method='user_cf', metric='cosine'):
    #method: user_cf and item_cf, metric: cosine and correlation
    if method == 'user_cf':
        prediction = user_cf(ratings, metric)
    else:
        prediction = item_cf(ratings, metric)
    MSE = mean_squared_error(prediction, groundtruth)
    RMSE = round(sqrt(MSE),3)
    print("RMSE using {0} approach ({2}) is: {1}".format(method, RMSE, metric))
    print(pd.DataFrame(prediction))
    return

In [222]:
evaluateRS(M, M_result)
evaluateRS(M, M_result, metric='correlation')
evaluateRS(M, M_result, method='item_cf')
evaluateRS(M, M_result, method='item_cf', metric='correlation')

RMSE using user_cf approach (cosine) is: 0.472
          0    1         2         3
0  4.000000  3.0  2.000000  3.000000
1  1.000000  2.0  3.000000  1.000000
2  1.794036  2.0  1.000000  1.272355
3  4.000000  3.0  3.368034  3.268237
RMSE using user_cf approach (correlation) is: 0.751
          0    1         2         3
0  4.000000  3.0  2.000000  3.000000
1  1.000000  2.0  3.000000  1.000000
2  0.764822  2.0  1.000000  1.009169
3  4.000000  3.0  4.616077  2.935013
RMSE using item_cf approach (cosine) is: 0.558
         0    1         2         3
0  4.00000  3.0  2.000000  3.000000
1  1.00000  2.0  3.000000  1.000000
2  2.54758  2.0  1.000000  1.537748
3  4.00000  3.0  2.489861  2.537748
RMSE using item_cf approach (correlation) is: 0.657
          0    1         2        3
0  4.000000  3.0  2.000000  3.00000
1  1.000000  2.0  3.000000  1.00000
2  3.424268  2.0  1.000000  2.16681
3  4.000000  3.0  2.558482  3.16681


### Ranking Accuracy

In [232]:
import scipy.stats as stats

def evaluate_rank(ratings, groundtruth, n_users, method='user_cf', metric='cosine'):
    #metric: cosine vs correlation
    if method == 'user_cf':
        prediction = user_cf(ratings, metric)
    else:
        prediction = item_cf(ratings, metric)
    
    avg_tau = 0

    for i in range(n_users):
        tau, p_value = stats.kendalltau(M_result[i,:], prediction[i,:])
        avg_tau += tau
    avg_tau = avg_tau / n_users
    clear_output(wait=True)
    return avg_tau

cosine_user = evaluate_rank(M, M_result, 4)
correlation_user = evaluate_rank(M, M_result, 4, metric='correlation')
cosine_item = evaluate_rank(M, M_result, 4, method='item')
correlation_item = evaluate_rank(M, M_result, 4, method='item', metric='correlation')

print("Rank accuracy user with cosine metric: ", cosine_user)
print("Rank accuracy user with correlation metric:", correlation_user)
print("Rank accuracy item with cosine metric: ", cosine_item)
print("Rank accuracy item with correlation metric: ", correlation_item)


Rank accuracy user with cosine metric:  0.6477056190747297
Rank accuracy user with correlation metric: 0.56719350585564
Rank accuracy item with cosine metric:  0.6369306393762916
Rank accuracy item with correlation metric:  0.7282177322938193
