# Lab 10: Recommender System

In this assignment, we will study how to do user-based collaborative filtering and item-based collaborative filtering. 

## 1. Dataset

In this assignment, we will use MovieLens-100K dataset. It includes about 100,000 ratings from 1000 users on 1700 movies.  

In [265]:
from math import sqrt
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics.pairwise import linear_kernel
from sklearn.neighbors import NearestNeighbors


# 1. load data
user_ratings_train = pd.read_csv('./ml-100k/u1.base',
                            sep='\t',names=['user_id','movie_id','rating'], usecols=[0,1,2])

user_ratings_test = pd.read_csv('./ml-100k/u1.test',
                            sep='\t',names=['user_id','movie_id','rating'], usecols=[0,1,2])

movie_info =  pd.read_csv('./ml-100k/u.item', 
                          sep='|', names=['movie_id','title'], usecols=[0,1],
                          encoding="ISO-8859-1")

user_ratings_train = pd.merge(movie_info, user_ratings_train)
user_ratings_test = pd.merge(movie_info, user_ratings_test)

# 2. get the rating matrix. Each row is a user, and each column is a movie.
user_ratings_train = user_ratings_train.pivot_table(index=['user_id'],
                                        columns=['title'],
                                        values='rating')

user_ratings_test = user_ratings_test.pivot_table(index=['user_id'],
                                        columns=['title'],
                                        values='rating')




user_ratings_train = user_ratings_train.reindex(
                            index=user_ratings_train.index.union(user_ratings_test.index), 
                            columns=user_ratings_train.columns.union(user_ratings_test.columns) )

user_ratings_test = user_ratings_test.reindex(
                            index=user_ratings_train.index.union(user_ratings_test.index), 
                            columns=user_ratings_train.columns.union(user_ratings_test.columns) )

print(user_ratings_train.shape)
print(user_ratings_test.shape)

(943, 1664)
(943, 1664)


In [266]:
print(type(user_ratings_train))
user_ratings_train[:10]
# tem_matrix = user_ratings_train.to_numpy()
# print(len(tem_matrix))
# print(len(tem_matrix[0]))
# size [0 to 942] [0 to 1663]
# print(tem_matrix[0][0])
# print(tem_matrix[942][1663])
# print(tem_matrix[943][1664])

<class 'pandas.core.frame.DataFrame'>


title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,5.0,,,3.0,4.0,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,4.0,,,...,,,,4.0,,,,,,
6,,,,4.0,,,,,,,...,,,,,,,,,,
7,,,,4.0,,,5.0,5.0,,4.0,...,,,,,,,3.0,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,4.0,...,,,,,,,,,,
10,,,,5.0,,,,,,,...,,,,,,,,,,


In [267]:
print(type(user_ratings_test))
user_ratings_test[:10]

<class 'pandas.core.frame.DataFrame'>


title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,,,,,,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,,,,...,,,,,,,,,4.0,
6,,,,,,,,5.0,,,...,,,,4.0,,,,,,
7,,,,,,,,,,,...,,,,5.0,3.0,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,5.0,,4.0,...,,,,,,,,,,


## Task 1. User-based CF

* Use pearson correlation to get the similarity between different users.
* Based on the obtained similarity score, predict the ratings. You can use 5 nearest neighbors or 10 nearest neighbors.
* Compute MAE for the testing set.

In [268]:
# your code
# The columns contain each movie
# The rows contain each user
n = 5
df = user_ratings_train.T.corr()
df = df.fillna(-10) 
#Set the corr score between the same user to 0
for i in range(1, df.shape[0]+1):
    df[i][i] = -10



means = user_ratings_train.mean(axis=1)
user_ratings_train = user_ratings_train.fillna(-10)
train_matrix = user_ratings_train.to_numpy()

movie_names = user_ratings_test.columns
base_score = means[1]

prediction_matrix = []
tem_pred_list = []


for id in range(1, 944):
    result = df[id].values.argsort()
    similar_users=result[-n:]
    weighted_score = 0
    corr_values_sum = 0
    base_score = means[id]
    for movie_index in range(1664):
        for user in similar_users:
            corr_value = df.iloc[id-1, user] 
            mean_user_score = means[user+1] 
            if(user_ratings_train[movie_names[movie_index]][user+1]!=-10):
                weighted_score += corr_value * (user_ratings_train[movie_names[movie_index]][user+1]-mean_user_score)
            corr_values_sum += corr_value
            final_scores = base_score + weighted_score/corr_values_sum
        tem_pred_list.append(final_scores)
        weighted_score=0
        corr_values_sum=0
    prediction_matrix.append(tem_pred_list)
    tem_pred_list = []




#### The cell below is just for testing purpose, checking different index boundaries

In [315]:

# print(df)
# size [0 to 942] [0 to 942]
# print(df.iloc[942, 942])

# user_ratings_train[movie_name][] size [1 to 943]
# user_ratings_train[movie_names[3]][1]

# size 1 to 943
# df[943]

In [286]:
# Storing the prediction matrix for task 1 just in case if my notebook shutdowns unexpectly
%store prediction_matrix

Stored 'prediction_matrix' (list)


In [317]:
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error

# Turn the testing data to a numpy array
test_matrix = user_ratings_test.to_numpy()
# Get the indices that contains finite numbers
non_nan_indices_in_test = np.where(~np.isnan(test_matrix))
# Turn the prediction_matrix (943 by 1664 list) to a numpy array
prediction_numpy_array = np.array(prediction_matrix)

print("The finite ratings in the test set/ground truth: ", test_matrix[non_nan_indices_in_test])
print("The finite ratings in the predicted set: ", prediction_numpy_array[non_nan_indices_in_test])
print("The mae for task1 User_Based CF is {}".format(mae(test_matrix[non_nan_indices_in_test], prediction_numpy_array[non_nan_indices_in_test])))

The finite ratings in the test set/ground truth:  [2. 3. 3. ... 3. 3. 5.]
The finite ratings in the predicted set:  [3.67164179 3.67164179 3.67164179 ... 3.38461538 3.42424242 3.96875   ]
The mae for task1 User_Based CF is 0.8511030455580283


## Task 2. Item-based CF
* Use cosine similarity to get the similarity between different items.
* Based on the obtained similarity score, predict the ratings. You can use 5 nearest neighbors or 10 nearest neighbors.
* Compute MAE for the testing set.

In [345]:
# your code
# TA mercy: we can use pearson correlation instead of cosine similarity to make the implementation easier
# use train set to compute similarity score and perform the corresponding formula
# compute the MAE between the prediction built upon the train set and the actual rating in test set

# Reload the test to avoid variable conflicts in the previous cells
# 1. load data
user_ratings_train = pd.read_csv('./ml-100k/u1.base',
                            sep='\t',names=['user_id','movie_id','rating'], usecols=[0,1,2])

user_ratings_test = pd.read_csv('./ml-100k/u1.test',
                            sep='\t',names=['user_id','movie_id','rating'], usecols=[0,1,2])

movie_info =  pd.read_csv('./ml-100k/u.item', 
                          sep='|', names=['movie_id','title'], usecols=[0,1],
                          encoding="ISO-8859-1")

user_ratings_train = pd.merge(movie_info, user_ratings_train)
user_ratings_test = pd.merge(movie_info, user_ratings_test)

# 2. get the rating matrix. Each row is a user, and each column is a movie.
user_ratings_train = user_ratings_train.pivot_table(index=['user_id'],
                                        columns=['title'],
                                        values='rating')

user_ratings_test = user_ratings_test.pivot_table(index=['user_id'],
                                        columns=['title'],
                                        values='rating')




user_ratings_train = user_ratings_train.reindex(
                            index=user_ratings_train.index.union(user_ratings_test.index), 
                            columns=user_ratings_train.columns.union(user_ratings_test.columns) )

user_ratings_test = user_ratings_test.reindex(
                            index=user_ratings_train.index.union(user_ratings_test.index), 
                            columns=user_ratings_train.columns.union(user_ratings_test.columns) )


In [366]:
n = 5
df2 = user_ratings_train.corr()
df2 = df2.fillna(-10) 
#Set the corr score between the same iteam to 0
for i in range(1664):
    df2.iloc[i, i]=-10

user_ratings_train = user_ratings_train.fillna(-10)
train_matrix = user_ratings_train.to_numpy()

movie_names = user_ratings_train.columns
prediction_matrix2 = []
tem_pred_list2 = []

# Get the top 5 similar ratings and iterate through for each movie in each user
for x in range(943):
    for y in range(1664):
        result = df2.iloc[y].values.argsort()
        similar_items=result[-n:]
        weighted_score = 0
        corr_values_sum = 0
        for item in similar_items:
            corr_value = df2.iloc[y, item] 
            # if the the user did not rate such movie, then just skip it
            if(user_ratings_train.iloc[x, item]!=-10):
                weighted_score += corr_value * (user_ratings_train.iloc[x, item])
            corr_values_sum += corr_value
            final_scores = base_score + weighted_score/corr_values_sum
        tem_pred_list2.append(final_scores)
        weighted_score=0
        corr_values_sum=0
    prediction_matrix2.append(tem_pred_list2)
    tem_pred_list2 = []


In [381]:
%store prediction_matrix2

Stored 'prediction_matrix2' (list)


In [385]:
from sklearn.metrics import mean_absolute_error


mae = mean_absolute_error

# Turn the testing data to a numpy array
test_matrix = user_ratings_test.to_numpy()
# Get the indices that contains finite numbers
non_nan_indices_in_test = np.where(~np.isnan(test_matrix))
# Turn the prediction_matrix (943 by 1664 list) to a numpy array
prediction2_numpy_array = np.array(prediction_matrix2)

print("The finite ratings in the test set/ground truth: ", test_matrix[non_nan_indices_in_test])
print("The finite ratings in the predicted set: ", prediction_numpy_array[non_nan_indices_in_test])
print("The mae for task2 Item_Based CF is {}".format(mae(test_matrix[non_nan_indices_in_test], prediction2_numpy_array[non_nan_indices_in_test])))


The finite ratings in the test set/ground truth:  [2. 3. 3. ... 3. 3. 5.]
The finite ratings in the predicted set:  [3.67164179 3.67164179 3.67164179 ... 3.38461538 3.42424242 3.96875   ]
The mae for task2 Item_Based CF is 1.302001500312819
