# Install and load necesary packages

In [2]:
# Please don't change this cell

import pandas as pd
import numpy as np  

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Please don't change this cell
df = pd.read_csv('ml-100k/u.data', names=['user_id', 'item_id', 'rating', 'timestamp'], sep='\t')

df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


# Split dataset
## Random Train and Test Split

In [4]:
# please do not change this cell

from sklearn.model_selection import train_test_split

n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print(str(n_users) + ' users')
print(str(n_items) + ' items')

train_df, test_df = train_test_split(df, test_size=0.2, random_state = 10)
train_df, test_df

# Training Dataset
train_ds = np.zeros((n_users, n_items))
item_popularity = np.zeros(n_items)
for row in train_df.itertuples():
    train_ds[row[1]-1, row[2]-1] = row[3]
    item_popularity[row[2]-1] =  item_popularity[row[2]-1] + 1
#train_ds = pd.DataFrame(train_ds)

# Testing Dataset
testsize = 0
test_ds = np.zeros((n_users, n_items))
for row in test_df.itertuples():
    if item_popularity[row[2]-1] > 30:
        test_ds[row[1]-1, row[2]-1] = row[3]
        testsize = testsize + 1
#test_ds = pd.DataFrame(test_ds)

print("Construct the rating matrix based on train_df:")
print(train_ds)

print("Construct the rating matrix based on test_df:")
print(test_ds)

print("Testsize = " + str(testsize))

943 users
1682 items
Construct the rating matrix based on train_df:
[[0. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 0. 0.]]
Construct the rating matrix based on test_df:
[[5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Testsize = 17678


# Utils

In [5]:
# Please don't change this cell
# you can use this devaluate Utils here, and you can also implement your own MAE and RMSE calculation. 

EPSILON = 1e-9

def evaluate(test_ds, predicted_ds):
    '''
    Function for evaluating on MAE and RMSE
    '''
    # MAE
    mask_test_ds = test_ds > 0
    MAE = np.sum(np.abs(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32))

    # RMSE
    RMSE = np.sqrt(np.sum(np.square(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32)))

    return MAE, RMSE

# Your Solution

In [6]:
# Write your code here
# You are required to implement the required solution here. 
# Then, evaluate your implementation by predicting the ratings in the test set (test_ds).
# Finally, save the corresponding MAE and RMSE of your implementation 
# into the following defined corresponding variable. 
# Adjusted Euclidean Distance calculation
def adjusted_euclidean_distance_item(item1, item2, ratings_matrix, V_max, V_min):
    # Find the common users who have rated both item1 and item2
    common_users = ratings_matrix.loc[:, item1].dropna().index.intersection(ratings_matrix.loc[:, item2].dropna().index)
    # Calculate the number of common users
    m = len(common_users)
    # If there are no common users, return 0
    if m == 0:
        return 0
    # Calculate the Euclidean distance between the ratings of item1 and item2 by the common users
    dist = np.sqrt(np.sum((ratings_matrix.loc[common_users, item1] - ratings_matrix.loc[common_users, item2])**2))
    # Calculate the maximum possible Euclidean distance, which would occur if all common users rated one item with V_max and the other item with V_min
    dist_max = np.sqrt(m * (V_max - V_min)**2)
    # Return the Adjusted Euclidean Distance, which is 1 minus the ratio of the actual distance to the maximum possible distance
    return 1 - (dist / dist_max)

# Set the maximum and minimum possible ratings
V_max = 5
V_min = 1
# Initialize a matrix to store the Adjusted Euclidean Distance between each pair of items
np_item_aed_corr = np.zeros((n_items, n_items))
# Convert the training dataset into a DataFrame
train_ds_df = pd.DataFrame(train_ds)

# Loop over each pair of items
for i in range(n_items):
    for j in range(n_items):
        # Skip the case where the two items are the same
        if i != j:
            # Calculate the Adjusted Euclidean Distance between item i and item j
            sim = adjusted_euclidean_distance_item(i, j, train_ds_df, V_max, V_min)
            # Store the Adjusted Euclidean Distance in the matrix
            np_item_aed_corr[i][j] = sim
            
# display the matrix of Adjusted Euclidean Distances
np_item_aed_corr


Adjusted Euclidean Distance similarity matrix:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,0.0,0.425315,0.413839,0.417526,0.406535,0.385844,0.406144,0.395087,0.374987,0.400867,...,0.393664,0.392299,0.392681,0.392954,0.393664,0.393172,0.392681,0.392954,0.393992,0.393172
1,0.425315,0.0,0.677525,0.633198,0.680208,0.692141,0.453877,0.550392,0.470827,0.628262,...,0.711019,0.710217,0.711019,0.711593,0.711019,0.712053,0.711019,0.711593,0.71309,0.712053
2,0.413839,0.677525,0.0,0.598271,0.700876,0.748414,0.450791,0.540261,0.478333,0.646822,...,0.770167,0.769159,0.770167,0.770889,0.770167,0.771468,0.770167,0.770889,0.770167,0.771468
3,0.417526,0.633198,0.598271,0.0,0.614434,0.599179,0.456432,0.54497,0.484017,0.567447,...,0.613318,0.612719,0.615898,0.615467,0.614348,0.61409,0.613318,0.613747,0.614864,0.61409
4,0.406535,0.680208,0.700876,0.614434,0.0,0.745273,0.44479,0.533108,0.484339,0.639577,...,0.771033,0.770022,0.771033,0.771758,0.771033,0.77234,0.771033,0.771758,0.771033,0.77234


In [10]:
# Initialize a matrix to store the predicted ratings
np_predictions = np.zeros((n_users, n_items))
# Set the number of nearest neighbors to consider
K = 89

# Function to predict ratings using AED similarity
def predict_ratings(train_ds, np_item_aed_corr, test_ds, K):
    # Get the number of users and items
    n_users, n_items = train_ds.shape
    # Initialize a matrix to store the predicted ratings
    np_predictions = np.zeros((n_users, n_items))
    # Convert the training dataset into a DataFrame
    train_ds_df = pd.DataFrame(train_ds)

    # Loop over each user-item pair in the test dataset
    for (i, j), rating in np.ndenumerate(test_ds):
        # If the user has rated the item
        if rating > 0:
            # Get the IDs of the K most similar items to item j based on AED
            sim_item_ids = np.argsort(np_item_aed_corr[j])[-(K + 1):-1]
            # Get the AEDs of the K most similar items to item j
            sim_val = np_item_aed_corr[j][sim_item_ids]
            # Get the ratings of the K most similar items
            sim_items = train_ds_df.values[:, sim_item_ids].T
            # Calculate the mean rating of item j
            item_mean = np.sum(train_ds_df.values[:, j]) / (np.sum(np.clip(train_ds_df.values[:, j], 0, 1)) + EPSILON)
            # Calculate the mean rating of each of the K most similar items
            sim_item_mean = np.sum(sim_items, axis=1) / (np.sum(np.clip(sim_items, 0, 1), axis=1) + EPSILON)
            # Calculate the weighted sum of the deviations of the ratings of the K most similar items from their mean ratings
            sim_r_sum_mean = sim_val * (sim_items[:, i] - sim_item_mean)
            # Apply a weight to each deviation based on whether the user has rated the corresponding item
            w = np.clip(sim_items[:, i], 0, 1)
            sim_r_sum_mean *= w
            # Calculate the predicted rating for user i and item j by adding the mean rating of item j to the weighted sum of deviations
            np_predictions[i][j] = item_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val * w) + EPSILON)
            # Clip the predicted rating to be between 0 and 5
            np_predictions[i][j] = np.clip(np_predictions[i][j], 0, 5)
        else:
            # If the user has not rated the item, predict the user's average rating for other items
            user_mean = np.mean(train_ds[i][train_ds[i] > 0])
            np_predictions[i][j] = user_mean

    # Return the matrix of predicted ratings
    return np_predictions

# Predict ratings using AED similarity
np_predictions = predict_ratings(train_ds, np_item_aed_corr, test_ds, K)
# Evaluate the predictions using MAE and RMSE
MAE, RMSE = evaluate(test_ds, np_predictions)


MAE: 0.8048946319218885
RMSE: 1.0287592576312556


In [9]:
# Please don't change this cell

print("===================== The MAE and RMSE of Your Implementation =====================")
print("MAE: {}, RMSE: {}" .format(MAE, RMSE))



NameError: name 'MAE' is not defined

In [None]:
# Initialize variables to store the best K value and the lowest validation MAE and RMSE
best_K = None
lowest_validation_MAE = float('inf')
lowest_validation_RMSE = float('inf')

# Split the training data into a smaller training set and a validation set
train_df_small, validation_df = train_test_split(train_df, test_size=0.2, random_state = 10)

# Convert the smaller training set and validation set into rating matrices
train_ds_small = np.zeros((n_users, n_items))
for row in train_df_small.itertuples():
    train_ds_small[row[1]-1, row[2]-1] = row[3]

# Convert the validation set into a rating matrix
validation_ds = np.zeros((n_users, n_items))
for row in validation_df.itertuples():
    validation_ds[row[1]-1, row[2]-1] = row[3]
    
# Initilize an output to store the K value, MAE, and RMSE
output = []

# For each K value
for K in range(1, 101):
    # Train the model on the smaller training set
    np_predictions_small = predict_ratings(train_ds_small, np_item_aed_corr, validation_ds, K)

    # Evaluate the model on the validation set
    validation_MAE, validation_RMSE = evaluate(validation_ds, np_predictions_small)
    
    # Append the K value, MAE, and RMSE to the output DF
    output.append([K, validation_MAE, validation_RMSE])
    
    # If this MAE and RMSE are lower than the lowest validation MAE and RMSE seen so far
    if validation_MAE < lowest_validation_MAE and validation_RMSE < lowest_validation_RMSE:
        # Update the best K value and the lowest validation MAE and RMSE
        best_K = K
        lowest_validation_MAE = validation_MAE
        lowest_validation_RMSE = validation_RMSE    


output_df = pd.DataFrame(output, columns=['K', 'MAE', 'RMSE'])

# Print the best K value
print(f"Best K: {best_K}, Lowest Validation MAE: {lowest_validation_MAE}, Lowest Validation RMSE: {lowest_validation_RMSE}")
output_df