# Lab 8: Recommender System

In this assignment, we will study how to do user-based collaborative filtering and item-based collaborative filtering. 

## 1. Dataset

In this assignment, we will use MovieLens-100K dataset. It includes about 100,000 ratings from 1000 users on 1700 movies.  

In [1]:
from math import sqrt
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics.pairwise import linear_kernel
from sklearn.neighbors import NearestNeighbors


# 1. load data
user_ratings_train = pd.read_csv('./ml-100k/u1.base',
                            sep='\t',names=['user_id','movie_id','rating'], usecols=[0,1,2])

user_ratings_test = pd.read_csv('./ml-100k/u1.test',
                            sep='\t',names=['user_id','movie_id','rating'], usecols=[0,1,2])

movie_info =  pd.read_csv('./ml-100k/u.item', 
                          sep='|', names=['movie_id','title'], usecols=[0,1],
                          encoding="ISO-8859-1")

user_ratings_train = pd.merge(movie_info, user_ratings_train)
user_ratings_test = pd.merge(movie_info, user_ratings_test)

# 2. get the rating matrix. Each row is a user, and each column is a movie.
user_ratings_train = user_ratings_train.pivot_table(index=['user_id'],
                                        columns=['title'],
                                        values='rating')

user_ratings_test = user_ratings_test.pivot_table(index=['user_id'],
                                        columns=['title'],
                                        values='rating')




user_ratings_train = user_ratings_train.reindex(
                            index=user_ratings_train.index.union(user_ratings_test.index), 
                            columns=user_ratings_train.columns.union(user_ratings_test.columns) )

user_ratings_test = user_ratings_test.reindex(
                            index=user_ratings_train.index.union(user_ratings_test.index), 
                            columns=user_ratings_train.columns.union(user_ratings_test.columns) )

print(user_ratings_train.shape)
print(user_ratings_test.shape)

(943, 1664)
(943, 1664)


## Task 1. User-based CF

* Use pearson correlation to get the similarity between different users.
* Based on the obtained similarity score, predict the ratings. You can use 5 nearest neighbors or 10 nearest neighbors.
* Compute MAE for the testing set.

In [2]:
# need to replace nan values since NN doesnt support NaN values
mean_ratings_by_user = user_ratings_train.mean(axis=1)
user_ratings_train_nan_filled = user_ratings_train.T.fillna(mean_ratings_by_user).T # fill magic i found on SOF

# calc pearson value for user
network = user_ratings_train_nan_filled.T.corr(method='pearson').values

# fit Nearest Neighbors with network data
USER_CF_NEIGHBOR_COUNT = 10
NearestNeighborsModel = NearestNeighbors(n_neighbors=USER_CF_NEIGHBOR_COUNT).fit(network)

# run NN on the self dataset signified by X=None
neighbors_distance, neighbors_ind = NearestNeighborsModel.kneighbors(X=None)            

In [3]:
from sklearn.metrics import mean_absolute_error

# preprare train and test matrices
user_data_train = user_ratings_train_nan_filled.values
user_data_test  = user_ratings_test.values

# input for mean_absolute_error
truth, pred  = [], []

# loop over each value of the test set
for user_id, user_ratings in enumerate(user_data_test):
    for video_id, video_rating in enumerate(user_ratings):
        # ignore null test ratings
        if np.isnan(user_data_test[user_id, video_id]): continue
        
        # get the neighbors of current user to predict
        neighbors = neighbors_ind[user_id]
        
        # get the ratings given by the neighbors via train
        neighbor_ratings = user_data_train[neighbors]
        
        # get rating for the video
        video_ratings = neighbor_ratings[:, video_id]
        
        # get biases for each user
        biases    = mean_ratings_by_user.values[neighbors]
        self_bias = mean_ratings_by_user.values[user_id]
        
        # get simarity for each user
        sim_scores = network[user_id][neighbors]
        
        # compute full score
        score = self_bias + (np.sum((np.multiply(sim_scores, video_ratings - biases))) / np.sum(sim_scores))
        
        # save to compute error later
        truth.append(user_data_test[user_id, video_id])
        pred.append(score)
        

MAE = mean_absolute_error(truth, pred)
print(f'MAE for User-based CF is {MAE} with nneighbors={USER_CF_NEIGHBOR_COUNT}')  

MAE for User-based CF is 0.8051374780803943 with nneighbors=10


## Task 2. Item-based CF
* Use cosine similarity to get the similarity between different items.
* Based on the obtained similarity score, predict the ratings. You can use 5 nearest neighbors or 10 nearest neighbors.
* Compute MAE for the testing set.

In [4]:
from sklearn.metrics.pairwise import pairwise_distances

# transpose to give features to movies to use for pred
items_data_train = user_ratings_train.T

# fill averages with the mean rating received for that movie. 
item_data_train_mean_filled = items_data_train.T.fillna(items_data_train.mean(axis=1)).T

# remove rows with no data
movies_no_body_watched = []
num_entries_still_missing = item_data_train_mean_filled.isna().sum(axis=1)

# scan for movies with no ratings
for movie_name, num_missing in num_entries_still_missing.items():
    if num_missing: movies_no_body_watched.append(movie_name)
        
# drop them from the model
item_data_train_mean_filled_valid_movies = item_data_train_mean_filled.drop(movies_no_body_watched)
print(f"Item data shape after dropping movies nobody watched: {item_data_train_mean_filled_valid_movies.shape}")

# use cosine_similarity from sklearn and feed it to pandas to perform corr
network = pairwise_distances(item_data_train_mean_filled_valid_movies, metric='cosine')

# create model based on corrs between movies
ITEM_FC_NEIGHBOR_COUNT = 10
ItemNearestNeighborsModel = NearestNeighbors(n_neighbors=ITEM_FC_NEIGHBOR_COUNT).fit(item_data_train_mean_filled_valid_movies.values)
dists, nbrs = ItemNearestNeighborsModel.kneighbors()

Item data shape after dropping movies nobody watched: (1633, 943)


In [5]:
# full dataset
item_data_test = user_ratings_test.T

# dataset with movies nobody as rated removed
item_data_test_valid_movies = item_data_test.drop(movies_no_body_watched).values

# ref to a .values numpy array for speed
item_data_train_mean_filled_valid_movies_values = item_data_train_mean_filled_valid_movies.values

# input for mean_absolute_error
truth, pred  = [], []

# loop over each value of the test set
for video_id, test_user_ratings in enumerate(item_data_test_valid_movies):
    
    # get item neighbors of given movie
    neighbors = nbrs[video_id] # this is the neigbhor hood of similar movies
    
    for user_id, truth_val in enumerate(test_user_ratings):
        # pass over null test values
        if np.isnan(item_data_test_valid_movies[video_id, user_id]): continue
            
        # save truth value
        truth.append(truth_val)
        
        # get ratings fron neighbors
        given_ratings = item_data_train_mean_filled_valid_movies_values[neighbors, user_id]
        
        # get correlations
        sim_scores = network[video_id][neighbors]
        
        # get user ratings for neighbors
        user_ratings = item_data_train_mean_filled_valid_movies_values[neighbors, user_id]
        
        # calc pred
        sum_scores = np.sum(sim_scores)
        
        # if the sum of scores (denom) is non zero assign a score other wise just assign average of user ratings
        if sum_scores:
            rating_pred = np.sum(np.multiply(sim_scores, user_ratings)) / sum_scores
        else:
            rating_pred = mean_ratings_by_user[user_id]
        
        pred.append(rating_pred)

In [6]:
MAE = mean_absolute_error(truth, pred)
print(f'MAE for Item-based CF is {MAE} with nneighbors={ITEM_FC_NEIGHBOR_COUNT}')  

MAE for Item-based CF is 0.8257884888390413 with nneighbors=10


# 