# Install and load necesary packages

In [1]:
# Please don't change this cell

import pandas as pd
import numpy as np  

import warnings
warnings.filterwarnings("ignore")

## Load the dataset using pandas

In [2]:
# Please don't change this cell

df = pd.read_csv('ml-100k/u.data', names=['user_id', 'item_id', 'rating', 'timestamp'], sep='\t')

# obtain top 500 users and top 500 items
user_ids = df.groupby('user_id').count().sort_values(by='rating', ascending=False).head(500).index
item_ids = df.groupby('item_id').count().sort_values(by='rating', ascending=False).head(500).index
df = df[(df['user_id'].isin(user_ids)) & (df['item_id'].isin(item_ids))]

df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
1,186,302,3,891717742
3,244,51,2,880606923
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467


# Split dataset

## Randomly select one rating from each user as test set

In [3]:
# Please don't change this cell

# remap user and item ID
df['user_id'] = df.groupby('user_id').ngroup()
df['item_id'] = df.groupby('item_id').ngroup()

test_df = df.groupby('user_id').sample(1, random_state=1024)
train_df = df[~df.index.isin(test_df.index)]

In [4]:
# Please don't change this cell

n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
avg_num = df.groupby('user_id').size().mean()
density = df.shape[0] / (n_users * n_items)
min_ratings = df.rating.min()
max_ratings = df.rating.max()

print("The number of users: {}" .format(n_users))
print("The number of items: {}" .format(n_items))
print("Avg. # of rated Items/User: {}" .format(avg_num))
print("Density of data: {}" .format(density))
print("Ratings Range: {} - {}" .format(min_ratings, max_ratings))

The number of users: 500
The number of items: 500
Avg. # of rated Items/User: 129.914
Density of data: 0.259828
Ratings Range: 1 - 5


In [5]:
# Please don't change this cell

# Convert the format of datasets to matrices
# Train dataset
df_zeros = pd.DataFrame({
    'user_id': np.tile(np.arange(0, n_users), n_items), 
    'item_id': np.repeat(np.arange(0, n_items), n_users), 
    'rating': 0})
train_ds = df_zeros.merge(train_df, 
                          how='left', 
                          on=['user_id', 'item_id']).fillna(0.).pivot_table(
                              values='rating_y', 
                              index='user_id', 
                              columns='item_id').values
                           
# Test dataset
test_ds = df_zeros.merge(test_df, 
                         how='left', 
                         on=['user_id', 'item_id']).fillna(0.).pivot_table(
                             values='rating_y', 
                             index='user_id', 
                             columns='item_id').values

print("Construct the rating matrix based on train_df:")
print(train_ds)

print("Construct the rating matrix based on test_df:")
print(test_ds)

Construct the rating matrix based on train_df:
[[5. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [4. 3. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 4. 0.]]
Construct the rating matrix based on test_df:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Utils

In [6]:
# Please don't change this cell
EPSILON = 1e-9

def user_corr(imputed_train_ds):
    '''
    Function for calculating user's similarity
    '''
    active_user_pearson_corr = np.zeros((imputed_train_ds.shape[0], imputed_train_ds.shape[0]))

    # Compute Pearson Correlation Coefficient of All Pairs of Users between active set and training dataset
    for i, user_i_vec in enumerate(imputed_train_ds):
        for j, user_j_vec in enumerate(imputed_train_ds):

            # ratings corated by the current pair od users
            mask_i = user_i_vec > 0
            mask_j = user_j_vec > 0

            # corrated item index, skip if there are no corrated ratings
            corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
            if len(corrated_index) == 0:
                continue

            # average value of user_i_vec and user_j_vec
            mean_user_i = np.sum(user_i_vec) / (np.sum(np.clip(user_i_vec, 0, 1)) + EPSILON)
            mean_user_j = np.sum(user_j_vec) / (np.sum(np.clip(user_j_vec, 0, 1)) + EPSILON)

            # compute pearson corr
            user_i_sub_mean = user_i_vec[corrated_index] - mean_user_i
            user_j_sub_mean = user_j_vec[corrated_index] - mean_user_j

            r_ui_sub_r_i_sq = np.square(user_i_sub_mean)
            r_uj_sub_r_j_sq = np.square(user_j_sub_mean)

            r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))
            r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))

            sim = np.sum(user_i_sub_mean * user_j_sub_mean) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)
            active_user_pearson_corr[i][j] = sim

    return active_user_pearson_corr

def predict(test_ds, imputed_train_ds, user_corr, k=20):
    '''
    Function for predicting ratings in test_ds
    '''

    # Predicting ratings of test set
    predicted_ds = np.zeros_like(test_ds)

    for (i, j), rating in np.ndenumerate(test_ds):

        if rating > 0:

            # only predict ratings on test set
            sim_user_ids = np.argsort(user_corr[i])[-1:-(k + 1):-1]

            #==================user-based==================#
            # the coefficient values of similar users
            sim_val = user_corr[i][sim_user_ids]

            # the average value of the current user's ratings
            sim_users = imputed_train_ds[sim_user_ids]
            
            mask_rateditem_user = imputed_train_ds[i] != 0
            num_rated_items = mask_rateditem_user.astype(np.float32)
            user_mean = np.sum(imputed_train_ds[i, mask_rateditem_user]) / (num_rated_items.sum() + EPSILON)

            mask_nei_rated_items = sim_users != 0
            num_rated_per_user = mask_nei_rated_items.astype(np.float32)
            num_per_user = num_rated_per_user.sum(axis=1)

            sum_per_user = sim_users.sum(axis=1)
            sim_user_mean = sum_per_user / (num_per_user + EPSILON)
            
            mask_rated_j = sim_users[:, j] > 0
                            
            # sim(u, v) * (r_vj - mean_v)
            sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])
            
            user_based_pred = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)

            predicted_ds[i, j] = np.clip(user_based_pred, 0, 5)
            
    return predicted_ds

def evaluate(test_ds, predicted_ds):
    '''
    Function for evaluating on MAE and RMSE
    '''
    # MAE
    mask_test_ds = test_ds > 0
    MAE = np.sum(np.abs(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32))

    # RMSE
    RMSE = np.sqrt(np.sum(np.square(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32)))

    return MAE, RMSE

# Baseline - KNN based recommendation (Similarity Metric: Pearson Correlation Coefficient)

In [7]:
# Please don't change this cell

user_pearson_corr = user_corr(train_ds)
predicted_ds = predict(test_ds, train_ds, user_pearson_corr, k=20)

In [8]:
# Please don't change this cell

MAE, RMSE = evaluate(test_ds, predicted_ds)

print("===================== Baseline Result =====================")
print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0.8471711011333851, RMSE: 1.092846045041526


# Your Solution
(Put all your implementation for your solution in the following cell only)

In [9]:
# Write your code here
# You are required to implement the existing solution in the given report here. 
# Then, evaluate your implementation by predicting the ratings in the test set (test_ds).
# Finally, save the corresponding MAE and RMSE of your implementation 
# into the following defined corresponding variable. 

# A. Item's Popularity Computation
# - Calculating number of times each item has been rated
P = np.sum(np.count_nonzero(train_ds, axis=0))

# B. User's Similarity Computation and KNN Selection
# creating a matrix for item similarity
user_corr = np.zeros((n_items, n_items))

for a, user_a_vec in enumerate(train_ds):
    for u, user_u_vec in enumerate(train_ds):
        # mask returns true if rating is there (so greater than 0)
        mask_a = user_a_vec > 0
        mask_u = user_u_vec > 0

#   a) Ratings data making up
#      STEP 1:
#      - Calculating the union set of items voted by user a or user b
#      - Stores the index where either user a or user u has rated the item
        union_set = np.union1d(np.where(mask_a), np.where(mask_u))
    
        # if there isn't at least one item rated by either user a or user u, move on to the next
        if len(union_set) == 0:
            continue

#      STEP 2:
#      - For each item in the union set, if user a or user u haven't rated the item, predict
#        the missing value as the average rating of the corresponding user

#      - average ratings by of user_a_vec and user_u_vec
        mean_user_a = np.sum(user_a_vec) / (np.sum(np.clip(user_a_vec, 0, 1)) + EPSILON)
        mean_user_u = np.sum(user_u_vec) / (np.sum(np.clip(user_u_vec, 0, 1)) + EPSILON)

        # if value is not rated from union set, replacing it with the corresponding average rating
        for index in union_set:
            if user_a_vec[index] == 0:
                user_a_vec[index] = mean_user_a
            if user_u_vec[index] == 0:
                user_u_vec[index] = mean_user_u
        
        # new vector with only the items in the union set and the replaced missing values
        new_user_a_vec = user_a_vec[union_set]
        new_user_u_vec = user_u_vec[union_set]

#   b) Similarity computation utilizing item’s popularity
#      - w = log(m/P) , where m = number of users
        m = n_users
        w = np.log(m / P)
        
        # calculating number for the formula
        numerator = np.sum(np.square(w) * (new_user_a_vec - mean_user_a) * (new_user_u_vec - mean_user_u))
        
        # caculating the square values for the denominator
        denominator_a = np.sqrt(np.sum(np.square(w) * np.square(new_user_a_vec - mean_user_a)))
        denominator_u = np.sqrt(np.sum(np.square(w) * np.square(new_user_u_vec - mean_user_u)))
        
        # multiplying both values to get final d
        denominator = denominator_a * denominator_u  + EPSILON
        
        # finding the similarity
        sim_a_u = numerator / denominator
        
        # adding similarity to the matrix
        user_corr[a][u] = sim_a_u

        
# creating matrix to store the prediction ratings
prediction_ratings = np.zeros_like(test_ds)

k = 20
for (a, u), rating in np.ndenumerate(test_ds):
#   c) K-Nearest Neighbours Selection
       # finding the similar items based on the k value
       # and sorts in descending order
       # gets the user ids
       # removed the user itself (as user is most similar to itself)
    knn_sim_users = np.argsort(user_corr[a])[-1:-(k + 1):-1]
    
    
#   C. Prediction and Recommendation for Active Users
#      - Calculate predictions based on the formula provided in the report
    # uses the similar items to calculate the coefficient values
    sim_values = user_corr[a][knn_sim_users]
    
    # calculates the mean for the rating of the current user
    sim_users = train_ds[knn_sim_users]
    
    # creates mask for items user a based on if they have rated an item or not
    mask_rateditem_user = train_ds[a] != 0
    # calculates number of items rated (first converts boolean to floats - True = 1.0, False = 0.0)
    num_rated_items = mask_rateditem_user.astype(np.float32).sum()
    # calculates the mean rating for the user
    user_mean = np.sum(train_ds[a, mask_rateditem_user]) / (num_rated_items + EPSILON)
    
    # created mask based on whether the mean rating for similar users is a zero or not
    mask_nei_rated_items = sim_users != 0
    # converts boolean values to floats (True = 1.0, False = 0.0)
    # then, calculates the number of items rated by the user
    num_per_user = mask_nei_rated_items.astype(np.float32).sum(axis = 1)
  
    
    # number of similar users per user
    sum_per_user = sim_users.sum(axis=1)
    # mean number of users
    sim_user_mean = sum_per_user / (num_per_user + EPSILON)
    
    mask_rated_u = sim_users[:, u] > 0
                            
    # sim(a, u) * (R(u, i) - mean of R(u))
    sim_r_sum_mean = sim_values[mask_rated_u] * (sim_users[mask_rated_u, u] - sim_user_mean[mask_rated_u])
            
    # calculated prediction (mean + sim(a, u) * R(u, i) - mean of R(u)) / sum(a,u))
    user_based_pred = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_values[mask_rated_u]) + EPSILON)
    
    # adds prediction to rating
    # as prediction needs to be between 0 and 5 and value smaller than 0 gets a 0 prediction
    # and any value greater than 5 gets a 5 prediction
    prediction_ratings[a, u] = np.clip(user_based_pred, 0, 5)
    
# use the evaluate function provided to get the MAE and RMSE values
MAE, RMSE = evaluate(test_ds, prediction_ratings)




## Print the MAE and RMSE of Your Implementation

In [10]:
# Please don't change this cell

print("===================== The MAE and RMSE of Your Implementation =====================")
print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0.7678874649435112, RMSE: 0.9650219034614862
