# Lab 8: Recommender System

In this assignment, we will study how to do user-based collaborative filtering and item-based collaborative filtering. 

## 1. Dataset

In this assignment, we will use the MovieLens-100K dataset. It includes about 100,000 ratings from roughly 1,000 users on roughly 1,700 movies.  

In [1]:
from math import sqrt
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics.pairwise import linear_kernel
from sklearn.neighbors import NearestNeighbors


# 1. load data
# Only the first three tab-separated columns of the rating files are used.
rating_fields = ['user_id', 'movie_id', 'rating']

user_ratings_train = pd.read_csv('./ml-100k/u1.base', sep='\t',
                                 names=rating_fields, usecols=[0, 1, 2])
user_ratings_test = pd.read_csv('./ml-100k/u1.test', sep='\t',
                                names=rating_fields, usecols=[0, 1, 2])
movie_info = pd.read_csv('./ml-100k/u.item', sep='|',
                         names=['movie_id', 'title'], usecols=[0, 1],
                         encoding="ISO-8859-1")

# Attach the movie title to every rating; with no explicit key, merge joins
# on the column the two frames share (movie_id).
user_ratings_train = pd.merge(movie_info, user_ratings_train)
user_ratings_test = pd.merge(movie_info, user_ratings_test)

# 2. get the rating matrix. Each row is a user, and each column is a movie.
user_ratings_train = user_ratings_train.pivot_table(
    index=['user_id'], columns=['title'], values='rating')
user_ratings_test = user_ratings_test.pivot_table(
    index=['user_id'], columns=['title'], values='rating')

# 3. give both matrices the same users (rows) and titles (columns), so that a
# cell position means the same thing in train and test; cells that did not
# exist before the reindex become NaN.
all_users = user_ratings_train.index.union(user_ratings_test.index)
all_titles = user_ratings_train.columns.union(user_ratings_test.columns)

user_ratings_train = user_ratings_train.reindex(index=all_users, columns=all_titles)
user_ratings_test = user_ratings_test.reindex(index=all_users, columns=all_titles)

print(user_ratings_train.shape)
print(user_ratings_test.shape)

(943, 1664)
(943, 1664)


## Task 1. User-based CF

* Use Pearson correlation to get the similarity between different users.
* Based on the obtained similarity score, predict the ratings. You can use 5 nearest neighbors or 10 nearest neighbors.
* Compute MAE for the testing set.

In [2]:
# NOTE: scoring every pair of users one at a time with this function was too slow.
REG_TERM = 0.0001  # added under each square root to avoid division by zero


def get_sim_score(user_ratings_1, user_ratings_2):
    """
    Find the similarity between two users using the Pearson correlation coefficient.

    Only items rated by both users (non-NaN in both vectors) contribute to the
    sums, while each user's mean is taken over all of that user's own ratings.

    Parameters
    ----------
    user_ratings_1, user_ratings_2 : 1-D array-like of float
        Ratings of the two users, aligned item by item; NaN marks a missing rating.

    Returns
    -------
    float
        A number between -1 and 1; 0.0 when the users have no rated item in common.

    Raises
    ------
    AssertionError
        If the two inputs differ in shape or are not one-dimensional.
    """
    ratings_1 = np.asarray(user_ratings_1, dtype=float)
    ratings_2 = np.asarray(user_ratings_2, dtype=float)

    assert ratings_1.shape == ratings_2.shape, "user rating arrays are not the same shape"
    assert len(ratings_1.shape) == 1, "user ratings must be a row vector"

    # step 1: keep only the items rated by both users
    rated_by_both = ~(np.isnan(ratings_1) | np.isnan(ratings_2))
    if not rated_by_both.any():
        # nothing to compare: numerator and both sums of squares are zero
        return 0.0

    # step 2: deviations of the common ratings from each user's overall mean
    dev_1 = ratings_1[rated_by_both] - np.nanmean(ratings_1)
    dev_2 = ratings_2[rated_by_both] - np.nanmean(ratings_2)

    # step 2.1: numerator and the two halves of the denominator
    numerator = np.sum(dev_1 * dev_2)
    denom_left = np.sum(dev_1 ** 2)
    denom_right = np.sum(dev_2 ** 2)

    # REG_TERM keeps the denominator non-zero when a user's common ratings all
    # equal that user's mean
    return numerator / (np.sqrt(denom_left + REG_TERM) * np.sqrt(denom_right + REG_TERM))
        
def build_network(users_data):
    """
    Build the user-user adjacency matrix with `get_sim_score`.

    Parameters
    ----------
    users_data : 2-D array-like
        Rating matrix with one row per user; NaN marks a missing rating.

    Returns
    -------
    np.ndarray
        Square 2-D array where network[i, j] is the similarity between
        user i and user j.
    """
    # Convert once so that indexing with an integer always yields row i,
    # whatever 2-D container was passed in.
    ratings = np.asarray(users_data, dtype=float)
    num_users = ratings.shape[0]

    network = np.zeros((num_users, num_users))

    for i in range(num_users):
        user_a = ratings[i]
        # Only the upper triangle (including the diagonal) is computed; each
        # score is mirrored into the lower triangle.
        for j in range(i, num_users):
            network[i, j] = network[j, i] = get_sim_score(user_a, ratings[j])
    return network

In [8]:
# NearestNeighbors does not support NaN values, so every missing rating is
# replaced by the mean rating of the user it belongs to. The frame is
# transposed so that users become columns for fillna with the per-user means,
# then transposed back.
mean_ratings_by_user = user_ratings_train.mean(axis=1)
user_ratings_train_nan_filled = user_ratings_train.T.fillna(mean_ratings_by_user).T

# Pearson correlation between every pair of users. DataFrame.corr correlates
# columns, hence the transpose that puts users in the columns.
network = user_ratings_train_nan_filled.T.corr(method='pearson').values

# Fit Nearest Neighbors on the rows of the correlation matrix and query 5
# neighbors for every user.
# NOTE(review): this appears to rank neighbors by the distance between whole
# rows of `network` rather than by the Pearson score itself — confirm that
# this is the intended notion of "nearest" for Task 1.
NearestNeighborsModel = NearestNeighbors(n_neighbors=5).fit(network)
neighbors_distance, neighbors_ind = NearestNeighborsModel.kneighbors()





(array([[1.82210603, 1.82315166, 1.8453587 , 1.85445431, 1.86752731],
        [1.79348311, 2.03267231, 2.09907993, 2.15566404, 2.17168512],
        [2.19775037, 2.22105738, 2.27585726, 2.28638963, 2.29474009],
        ...,
        [1.96094157, 2.24777056, 2.25003148, 2.26817103, 2.30781886],
        [2.08757611, 2.16821242, 2.17778076, 2.18709838, 2.22735997],
        [1.74872221, 1.75528182, 1.7959616 , 1.80103684, 1.80128025]]),
 array([[885, 845, 290, 775,  12],
        [650, 122, 369, 295,  33],
        [687, 184, 278, 807, 171],
        ...,
        [340, 755, 304, 368, 107],
        [933, 404, 806, 397, 372],
        [845, 290, 726, 708, 452]]))

## Task 2. Item-based CF
* Use cosine similarity to get the similarity between different items.
* Based on the obtained similarity score, predict the ratings. You can use 5 nearest neighbors or 10 nearest neighbors.
* Compute MAE for the testing set.

In [4]:
# Display the test rating matrix (one row per user, one column per movie title).
user_ratings_test

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,,,,,,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,,,,...,,,,,,,,,4.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,,,...,,,,,,,,,,
940,,,,,,,,,,,...,,,,,,,,,,
941,,,,,,,,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,
