# Install and load necessary packages

In [1]:
# Please don't change this cell

import pandas as pd
import numpy as np  

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Please don't change this cell
# Read the tab-separated ratings file. Column names are passed explicitly via
# `names`, so the first line of the file is treated as data, not as a header.
df = pd.read_csv('ml-100k/u.data', names=['user_id', 'item_id', 'rating', 'timestamp'], sep='\t')

# Preview the first 20 rows of the ratings table.
df.head(20)

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467
8,305,451,3,886324817
9,6,86,3,883603013


# Split dataset
## Random Train and Test Split

In [3]:
from sklearn.model_selection import train_test_split

def build_rating_matrix(ratings_df, n_users, n_items):
    """Turn a long-format ratings table into a dense user x item matrix.

    Parameters
    ----------
    ratings_df : pandas.DataFrame
        Must provide the columns ``user_id``, ``item_id`` and ``rating``.
    n_users, n_items : int
        Number of rows and columns of the returned matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_users, n_items)``; entry ``[u - 1, i - 1]`` holds
        the rating user ``u`` gave item ``i`` and every other entry is 0.

    Notes
    -----
    Ids are used as 1-based positions, so this assumes they run from 1 to
    ``n_users`` / ``n_items`` without gaps -- TODO confirm for other datasets.
    """
    matrix = np.zeros((n_users, n_items))
    # index=False drops the DataFrame index so the fields can be read by column name.
    for row in ratings_df.itertuples(index=False):
        matrix[row.user_id - 1, row.item_id - 1] = row.rating
    return matrix


# Matrix dimensions: number of distinct users and items in the full dataset.
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print(str(n_users) + ' users')
print(str(n_items) + ' items')

# 80/20 random split; the fixed random_state keeps the split reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state = 10)

# Both matrices share the same shape so they can be compared entry by entry.
train_ds = build_rating_matrix(train_df, n_users, n_items)
test_ds = build_rating_matrix(test_df, n_users, n_items)

print("Construct the rating matrix based on train_df:")
print(train_ds)

print("Construct the rating matrix based on test_df:")
print(test_ds)

943 users
1682 items
Construct the rating matrix based on train_df:
[[0. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 0. 0.]]
Construct the rating matrix based on test_df:
[[5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Utils

In [4]:
# Please don't change this cell
# Small constant that other cells add to denominators; evaluate() itself does not use it.
EPSILON = 1e-9

def evaluate(test_ds, predicted_ds):
    '''
    Compute MAE and RMSE of predicted_ds against test_ds.

    Only the positions where test_ds is greater than 0 take part in the
    comparison; all other positions are ignored.

    Parameters:
        test_ds: array of held-out ratings.
        predicted_ds: array of predictions, indexed with the same boolean mask
            as test_ds (so it must have a compatible shape).

    Returns:
        (MAE, RMSE): the mean absolute error and the root mean squared error
        over the selected positions.

    NOTE(review): if test_ds has no entry greater than 0, both denominators
    are zero.
    '''
    # MAE
    # Boolean mask of the positions to score.
    mask_test_ds = test_ds > 0
    MAE = np.sum(np.abs(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32))

    # RMSE
    RMSE = np.sqrt(np.sum(np.square(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32)))

    return MAE, RMSE

# Your Solution

In [5]:
# Write your code here
# You are required to implement the required solution here.
# Then, evaluate your implementation by predicting the ratings in the test set (test_ds).
# Finally, save the corresponding MAE and RMSE of your implementation
# into the following defined corresponding variable.
# NOTE(review): in this notebook the solution lives in cell In [8], which assigns
# MAE and RMSE after the two reporting cells below have run, so those cells print 0.

MAE = 0 # 0 is an initial value, you need to update this with the actual performance of your implementation.
RMSE = 0 # 0 is an initial value, you need to update this with the actual performance of your implementation.



In [6]:
# Please don't change this cell

# Print whatever values the notebook-level MAE and RMSE variables hold at this point.
print("===================== The MAE and RMSE of Your Implementation =====================")
print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0, RMSE: 0


In [7]:
# Please don't change this cell

# Print whatever values the notebook-level MAE and RMSE variables hold at this point.
# NOTE(review): this cell is identical to the previous reporting cell.
print("===================== The MAE and RMSE of Your Implementation =====================")
print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0, RMSE: 0


In [8]:
import numpy as np
import pandas as pd

# Calculate the centered cosine similarity between users
def user_similarity_matrix(rating_matrix, delta, eps):
    """Significance-weighted, mean-centered cosine similarity between users.

    For users ``a`` and ``b`` the similarity is taken over the items both have
    rated (rating > 0). Each rating is centered on its own user's mean rating,
    the centered vectors are compared with a cosine, and the result is scaled
    by ``min(n_common, delta) / delta`` so that pairs with few common items
    count for less.

    This evaluates the same formula as a loop over all user pairs, using
    matrix products instead.

    Parameters
    ----------
    rating_matrix : numpy.ndarray
        User x item matrix; entries that are not > 0 are treated as unrated.
    delta : int or float
        Number of common items at which the significance weight reaches 1.
    eps : float
        Small constant added to denominators to avoid division by zero.

    Returns
    -------
    numpy.ndarray
        Square matrix with one row and column per user. Pairs without any
        common item get similarity 0.
    """
    rated = rating_matrix > 0
    rated_f = rated.astype(float)

    # Per-user mean: sum of the row divided by the row sum of the ratings
    # clipped to [0, 1] (the number of rated items when every rating is >= 1).
    n_rated = np.sum(np.clip(rating_matrix, 0, 1), axis=1)
    user_mean = np.sum(rating_matrix, axis=1) / (n_rated + eps)

    # Centered ratings, zero wherever the user has no rating, so products
    # between two users only pick up the items both of them rated.
    centered = np.where(rated, rating_matrix - user_mean[:, np.newaxis], 0.0)

    # cross[a, b]: sum over common items of the product of centered ratings.
    cross = centered @ centered.T

    # sq_common[a, b]: sum of a's squared centered ratings over the items that
    # b has rated too; its transpose is the matching quantity for b.
    sq_common = np.square(centered) @ rated_f.T
    norm_product = np.sqrt(sq_common) * np.sqrt(sq_common.T)

    sim = cross / (norm_product + eps)

    # significance weighting
    n_common = rated_f @ rated_f.T
    return (np.minimum(n_common, delta) / delta) * sim


n_users = train_ds.shape[0]
EPSILON = 1e-9
DELTA = 60  # number of common items at which a similarity gets full weight

cosine_matrix = user_similarity_matrix(train_ds, DELTA, EPSILON)

cosine_matrix


# create the matrices
# rating matrix (contains the actual rating for every user, for every movie they have rated)
# `ratings` is another name for train_ds, not a copy. train_ds was already
# filled from train_df when it was built, so it is not filled again here.
ratings = train_ds

# Binary matrix: [user_id_index(row), item_id_index(column)] = 1 where a training rating exists
ratingsBinary = np.zeros((n_users, n_items))
user_idx = train_df['user_id'].to_numpy() - 1  # ids are 1-based
item_idx = train_df['item_id'].to_numpy() - 1
ratingsBinary[user_idx, item_idx] = 1

ratings_df = pd.DataFrame(ratings)

# Calculations
# Per-user statistics reduce across the item axis (axis=1), giving one value
# per row/user. Reducing with axis=0 would give one value per item instead,
# which cannot be indexed by user.
userrate_rate = ratingsBinary.sum(axis=1)  # how many movies each user rated
ratings_sum = ratings.sum(axis=1)  # total sum of all ratings provided by each user

# Users without any training rating get an average of 0 instead of 0/0 = NaN.
ratings_userAvg = np.divide(
    ratings_sum,
    userrate_rate,
    out=np.zeros_like(ratings_sum),
    where=userrate_rate > 0,
)

# Normalise the ratings
# subtract the average rating of each user from that user's ratings;
# unrated positions stay at 0
ratings_subtractMean = np.where(
    ratingsBinary == 1,
    ratings - ratings_userAvg[:, np.newaxis],
    0.0,
)

ratings_subtractMean_df = pd.DataFrame(ratings_subtractMean)

# Item-to-item deviation matrix.
# dev_ji[i, j] blends two averages of (rating of j - rating of i), both taken
# over the users who rated items i and j:
#   * the plain mean, and
#   * a mean in which each co-rating user u is weighted by
#     w[u] = sum over all users v of exp(cosine_matrix[v, u]).
# All item pairs are handled at once with matrix products. The per-user weights
# w are computed a single time instead of once per item pair.
lambda_value = 0.6  # Adjust the value of lambda as desired

rated = (ratingsBinary == 1).astype(float)
rated_ratings = rated * ratings  # ratings kept only where ratingsBinary marks a rating

# w[u]: column sums of exp(similarity)
user_weight = np.exp(cosine_matrix).sum(axis=0)
weighted_rated = rated * user_weight[:, np.newaxis]

# co_counts[i, j]: number of users who rated both i and j
co_counts = rated.T @ rated

# sum_j[i, j]: sum of the ratings of j over the co-raters of (i, j); its
# transpose holds the matching sum of the ratings of i.
sum_j = rated.T @ rated_ratings
diff_sum = sum_j - sum_j.T

# Same sums with every user u scaled by w[u].
weighted_sum_j = weighted_rated.T @ rated_ratings
weighted_diff_sum = weighted_sum_j - weighted_sum_j.T
weight_total = weighted_rated.T @ rated

has_common = co_counts > 0

deviation_mean = np.divide(
    diff_sum, co_counts, out=np.zeros_like(diff_sum), where=has_common)

# NOTE(review): the weighted sum is divided by the total weight AND by the
# number of co-raters. A plain weighted mean would divide by the total weight
# only. The extra factor is kept as written -- confirm against the intended formula.
deviation_weighted = np.divide(
    weighted_diff_sum, weight_total * co_counts,
    out=np.zeros_like(weighted_diff_sum), where=has_common)

dev_ji = lambda_value * deviation_mean + (1 - lambda_value) * deviation_weighted
np.fill_diagonal(dev_ji, 0.0)  # an item has no deviation from itself

dev_ji_df = pd.DataFrame(dev_ji)
dev_ji_df


# Function to calculate predictions for all users
def prediction(ratings, dev_ji, lambda_val):
    """Predict a rating for every item a user has not rated yet.

    For user ``u`` and an unrated item ``j`` the prediction is the weighted
    average, over the items ``i`` that ``u`` has rated, of
    ``dev_ji[i, j] + ratings[u, i]``, where the weight of item ``i`` is the
    number of users who rated both ``i`` and ``j``.

    The co-rating counts are computed once for all item pairs and the sums are
    evaluated as matrix products.

    Parameters
    ----------
    ratings : numpy.ndarray
        User x item rating matrix; entries that are not > 0 count as unrated.
    dev_ji : numpy.ndarray
        Item x item matrix; ``dev_ji[i, j]`` is the average deviation of item
        ``j`` from item ``i``.
    lambda_val : float
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``ratings``. Items the user already rated, and
        items for which no rated item shares a co-rating user, are left at 0.
    """
    EPSILON = 1e-9

    ratings = np.asarray(ratings)
    dev_ji = np.asarray(dev_ji)

    rated = ratings > 0
    rated_f = rated.astype(float)

    # co_counts[i, j]: number of users who rated both item i and item j.
    co_counts = rated_f.T @ rated_f

    # numerator[u, j] = sum over items i rated by u of (dev_ji[i, j] + r_ui) * co_counts[i, j]
    numerator = rated_f @ (dev_ji * co_counts) + (ratings * rated_f) @ co_counts
    # denominator[u, j] = sum over items i rated by u of co_counts[i, j]
    denominator = rated_f @ co_counts

    # Only unrated items with at least one usable co-rating get a prediction.
    predictable = ~rated & (denominator != 0)

    prediction_forall = np.zeros(ratings.shape)
    prediction_forall[predictable] = numerator[predictable] / (denominator[predictable] + EPSILON)

    return prediction_forall

# Predict the unrated items of every user, then wrap the array in a DataFrame.
prediction_forall = prediction(ratings=ratings, dev_ji=dev_ji, lambda_val=lambda_value)
prediction_forall_df = pd.DataFrame(data=prediction_forall)

# Please don't change this cell
# NOTE(review): EPSILON and evaluate() below repeat the definitions from the
# Utils cell (In [4]) line for line; this copy replaces the earlier one when run.
EPSILON = 1e-9

def evaluate(test_ds, predicted_ds):
    '''
    Compute MAE and RMSE of predicted_ds against test_ds.

    Only the positions where test_ds is greater than 0 take part in the
    comparison; all other positions are ignored.

    Parameters:
        test_ds: array of held-out ratings.
        predicted_ds: array of predictions, indexed with the same boolean mask
            as test_ds (so it must have a compatible shape).

    Returns:
        (MAE, RMSE): the mean absolute error and the root mean squared error
        over the selected positions.
    '''
    # MAE
    # Boolean mask of the positions to score.
    mask_test_ds = test_ds > 0
    MAE = np.sum(np.abs(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32))

    # RMSE
    RMSE = np.sqrt(np.sum(np.square(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32)))

    return MAE, RMSE

# Score the predictions on the held-out ratings.
MAE, RMSE = evaluate(test_ds=test_ds, predicted_ds=prediction_forall)

# Banner line followed by the two metrics, one line each.
print(
    "===================== The MAE and RMSE of Your Implementation =====================",
    "MAE: {}, RMSE: {}".format(MAE, RMSE),
    sep="\n",
)


MAE: 0.7521237744120542, RMSE: 0.95928903880955
