In [1]:
import sklearn
import numpy as np
import pandas as pd
import time

In [2]:
# Load the train and test splits; the first CSV column is used as the row index.
train_data, test_data = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('./data/small_train_data.csv', './data/small_test_data.csv')
)

In [3]:
# Number of distinct user ids and movie ids present in the training split.
n_users = len(train_data['userId'].unique())
n_items = len(train_data['movieId'].unique())

In [4]:
# Display the distinct-user and distinct-movie counts as a (n_users, n_items) tuple.
n_users, n_items

(671, 8384)

In [5]:
# Build the user-movie rating matrix as a sparse nested dict:
#   user_based[userId][movieId] -> rating
user_based = {}
for record in train_data.itertuples():
    # record[0] is the DataFrame index; the next three fields are read by position.
    userId, movieId, rating = record[1], record[2], record[3]
    # Create the per-user dict on first sight, then store (or overwrite) the rating.
    user_based.setdefault(userId, {})[movieId] = rating
user_based

{1: {31: 2.5,
  1029: 3.0,
  1061: 3.0,
  1129: 2.0,
  1263: 2.0,
  1287: 2.0,
  1293: 2.0,
  1339: 3.5,
  1343: 2.0,
  1371: 2.5,
  1953: 4.0,
  2105: 4.0,
  2150: 3.0,
  2193: 2.0,
  2294: 2.0,
  2455: 2.5,
  2968: 1.0,
  3671: 3.0},
 2: {10: 4.0,
  17: 5.0,
  39: 5.0,
  47: 4.0,
  50: 4.0,
  62: 3.0,
  110: 4.0,
  144: 3.0,
  150: 5.0,
  153: 4.0,
  161: 3.0,
  165: 3.0,
  168: 3.0,
  185: 3.0,
  186: 3.0,
  208: 3.0,
  222: 5.0,
  223: 1.0,
  225: 3.0,
  235: 3.0,
  248: 3.0,
  253: 4.0,
  261: 4.0,
  265: 5.0,
  266: 5.0,
  272: 3.0,
  273: 4.0,
  292: 3.0,
  296: 4.0,
  300: 3.0,
  317: 2.0,
  319: 1.0,
  339: 3.0,
  349: 4.0,
  350: 4.0,
  356: 3.0,
  357: 3.0,
  364: 3.0,
  367: 3.0,
  370: 2.0,
  371: 3.0,
  377: 3.0,
  410: 3.0,
  454: 4.0,
  457: 3.0,
  468: 4.0,
  474: 2.0,
  480: 4.0,
  485: 3.0,
  497: 3.0,
  500: 4.0,
  508: 4.0,
  509: 4.0,
  515: 4.0,
  527: 4.0,
  539: 3.0,
  551: 5.0,
  552: 3.0,
  585: 5.0,
  586: 3.0,
  587: 3.0,
  588: 3.0,
  589: 5.0,
  590: 5.0,

In [6]:
def dotProduct(d1, d2):
    """
    Sparse dot product of two vectors stored as dicts.

    @param dict d1: sparse vector, mapping a key to its numeric weight.
    @param dict d2: sparse vector in the same format as d1.
    @return: the sum of d1[k] * d2[k] over the keys the two dicts share
             (0 when they share none).
    """
    # Walk the shorter dict and look each key up in the longer one, so the work
    # is proportional to the smaller vector. On a length tie, d2 is walked.
    if len(d1) < len(d2):
        longer, shorter = d2, d1
    else:
        longer, shorter = d1, d2

    total = 0
    for key, weight in shorter.items():
        # Keys missing from the longer dict contribute 0.
        total += longer.get(key, 0) * weight
    return total

In [20]:
# Square zero matrix with one row and one column per user id, up to the largest
# id found in the training data.
user_similarities = np.zeros((train_data.userId.max(),) * 2)

In [21]:
# Fill the raw (unnormalized) similarity matrix: entry [i-1, j-1] is the dot
# product of the rating vectors of users i and j. The `id - 1` indexing assumes
# user ids are 1-based, so the scan starts at 1; starting at 0 would make
# `i - 1` wrap around to the last row if an id 0 were ever present.
max_user_id = train_data.userId.max()  # computed once, not per outer iteration
for i in range(1, max_user_id + 1):
    if i not in user_based:
        continue
    # The dot product is symmetric, so compute each pair once (j >= i) and
    # mirror it into the other half of the matrix.
    for j in range(i, max_user_id + 1):
        if j not in user_based:
            continue
        similarity = dotProduct(user_based[i], user_based[j])
        user_similarities[i-1, j-1] = similarity
        user_similarities[j-1, i-1] = similarity

In [22]:
user_similarities

array([[  128. ,     0. ,     0. , ...,    16. ,     0. ,     9. ],
       [    0. ,   897. ,   101.5, ...,    16. ,   119. ,   140. ],
       [    0. ,   101.5,   600.5, ...,    34. ,    67. ,   148. ],
       ..., 
       [   16. ,    16. ,    34. , ...,   389. ,    20. ,    61. ],
       [    0. ,   119. ,    67. , ...,    20. ,   431. ,   205.5],
       [    9. ,   140. ,   148. , ...,    61. ,   205.5,  1665.5]])

In [23]:
# normalize: turn the raw dot products into cosine similarities by dividing
# entry [i, j] by norm_i * norm_j. The diagonal holds each user's dot product
# with itself, so its square root is that user's vector norm.
norms = np.array([np.sqrt(user_similarities.diagonal())])  # shape (1, n)
# A user id with no training ratings has an all-zero row/column and a zero norm.
# Dividing by 1 there keeps those entries at 0 instead of producing 0/0 = NaN
# (and a RuntimeWarning); entries for users with ratings are unaffected.
safe_norms = np.where(norms > 0, norms, 1.0)
user_similarities = user_similarities/safe_norms/safe_norms.T

In [24]:
user_similarities

array([[ 1.        ,  0.        ,  0.        , ...,  0.07170347,
         0.        ,  0.0194924 ],
       [ 0.        ,  1.        ,  0.13829718, ...,  0.02708625,
         0.19138685,  0.11454061],
       [ 0.        ,  0.13829718,  1.        , ...,  0.07034733,
         0.13169815,  0.14799018],
       ..., 
       [ 0.07170347,  0.02708625,  0.07034733, ...,  1.        ,
         0.0488446 ,  0.07578491],
       [ 0.        ,  0.19138685,  0.13169815, ...,  0.0488446 ,
         1.        ,  0.24254977],
       [ 0.0194924 ,  0.11454061,  0.14799018, ...,  0.07578491,
         0.24254977,  1.        ]])

In [25]:
# Mean of every rating in the training split.
overall_mean = train_data['rating'].mean()
overall_mean

3.553313639150549

In [26]:
# Predict each test rating as the similarity-weighted average of the ratings
# that other training users gave to the same movie.
predictions = []
# Read the user and movie ids straight from the first two columns. Going
# through test_data.iloc[i][k] builds a whole row Series per iteration, relies
# on a deprecated positional fallback of Series[int], and (when the frame mixes
# integer ids with float ratings) upcasts the ids to float, which current NumPy
# rejects as an array index.
for test_user, test_movie in zip(test_data.iloc[:, 0], test_data.iloc[:, 1]):
    test_user, test_movie = int(test_user), int(test_movie)

    # User never seen in training: fall back to the global mean rating.
    if test_user not in user_based:
        predictions.append(overall_mean)
        continue

    sum_rating_similarity = 0.
    sum_similarity = 0.
    for user, movies in user_based.items():
        # Only *other* users who actually rated this movie contribute.
        if (user == test_user) or (test_movie not in movies):
            continue
        sim = user_similarities[user-1, test_user-1]
        sum_similarity += sim
        sum_rating_similarity += sim*movies[test_movie]

    if sum_similarity == 0:
        # No similar user rated the movie: use this user's own mean rating.
        user_mean = np.mean(list(user_based[test_user].values()))
        predictions.append(user_mean)
        continue

    pred_rating = sum_rating_similarity / sum_similarity
    predictions.append(pred_rating)



In [27]:
# Root-mean-squared error of the predictions against the held-out ratings.
# Series.as_matrix() was removed in pandas 1.0; np.asarray works on every
# pandas version. np.mean replaces the Python-level sum(...)/len(...).
actual_ratings = np.asarray(test_data['rating'])
test_error = np.sqrt(np.mean((np.asarray(predictions) - actual_ratings)**2))
print('CF simple model RMSE:', test_error)

CF simple model RMSE: 0.995787050051
