# Assignment 3

In [1]:
import pandas as pd
import numpy as np

# Load MovieLens 100K dataset into a dataframe of pandas
names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/u.data', sep='\t', names=names)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [2]:
# Select 500 most active users and 500 most active items from the dataset
n_most_active_users = 500
n_most_active_items = 500

user_ids = df.groupby('user_id').count().sort_values(by='rating', ascending=False).head(n_most_active_users).index
item_ids = df.groupby('item_id').count().sort_values(by='rating', ascending=False).head(n_most_active_items).index
df = df[(df['user_id'].isin(user_ids)) & (df['item_id'].isin(item_ids))]

In [3]:
# Map new internal ID for items
i_ids = df['item_id'].unique().tolist()
item_dict = dict(zip(i_ids, [i for i in range(len(i_ids))]))
df['item_id'] = df['item_id'].map(item_dict)

# Split Dataset

In [4]:
# The number of training users and active users
n_training_users = 300
n_active_users = n_most_active_users - n_training_users

# The number of GIVEN ratings for active users
GIVEN = 20

# Randomly select users from the most active users as training set
random_uids = np.random.choice(df.user_id.unique(), n_training_users, replace=False)
train_df = df[df['user_id'].isin(random_uids)]
# Map new internal ID for all users in the training set
u_ids = train_df['user_id'].unique().tolist()
user_dict = dict(zip(u_ids, [i for i in range(len(u_ids))]))
train_df['user_id'] = train_df['user_id'].map(user_dict)

# The rest of users are active users for testing
remain_df = df[~df['user_id'].isin(random_uids)]
# Map new internal ID for all active users
u_ids = remain_df['user_id'].unique().tolist()
user_dict = dict(zip(u_ids, [i for i in range(len(u_ids))]))
remain_df['user_id'] = remain_df['user_id'].map(user_dict)

# Randomly select GIVEN ratings for active users
active_df = remain_df.groupby('user_id').sample(n=GIVEN, random_state=1024)

test_df = remain_df[~remain_df.index.isin(active_df.index)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['user_id'] = train_df['user_id'].map(user_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  remain_df['user_id'] = remain_df['user_id'].map(user_dict)


In [5]:
# Convert the format of datasets to matrices
df_zeros = pd.DataFrame({'user_id': np.tile(np.arange(0, n_training_users), n_most_active_items), 'item_id': np.repeat(np.arange(0, n_most_active_items), n_training_users), 'rating': 0})
train_ds = df_zeros.merge(train_df, how='left', on=['user_id', 'item_id']).fillna(0.).pivot_table(values='rating_y', index='user_id', columns='item_id')

df_zeros = pd.DataFrame({'user_id': np.tile(np.arange(0, n_active_users), n_most_active_items), 'item_id': np.repeat(np.arange(0, n_most_active_items), n_active_users), 'rating': 0})
active_ds = df_zeros.merge(active_df, how='left', on=['user_id', 'item_id']).fillna(0.).pivot_table(values='rating_y', index='user_id', columns='item_id')
test_ds = df_zeros.merge(test_df, how='left', on=['user_id', 'item_id']).fillna(0.).pivot_table(values='rating_y', index='user_id', columns='item_id')

train_ds, active_ds, test_ds

(item_id  0    1    2    3    4    5    6    7    8    9    ...  490  491  492  \
 user_id                                                    ...                  
 0        0.0  2.0  0.0  4.0  0.0  4.0  4.0  0.0  0.0  2.0  ...  0.0  4.0  4.0   
 1        4.0  0.0  0.0  2.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
 2        0.0  0.0  0.0  0.0  5.0  0.0  0.0  0.0  0.0  4.0  ...  0.0  0.0  0.0   
 3        4.0  0.0  5.0  0.0  0.0  3.0  4.0  2.0  0.0  2.0  ...  0.0  2.0  0.0   
 4        4.0  0.0  5.0  0.0  1.0  0.0  3.0  2.0  0.0  0.0  ...  0.0  0.0  0.0   
 ...      ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
 295      0.0  0.0  5.0  4.0  0.0  1.0  0.0  0.0  0.0  1.0  ...  0.0  0.0  0.0   
 296      4.0  0.0  3.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
 297      0.0  0.0  0.0  5.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
 298      0.0  0.0  0.0  3.0  0.0  0.0  0.0  4.0  0.0  0.0  ...  0.0  0.0  5.0   
 299      0.0  0

In [6]:
# Predicting All Missing Data in training set
imputed_train_ds = train_ds.values.copy()

# Your implementation to predict the missing values
(Put all your implementation for your algorithm in the following cell only to handle the missing values; )

In [7]:
## Put all your implementation for your solutioin in this cell only to predict the missing values; 
## NOTE 1: DO NOT change anything in the rest of the cells in this framework, 
## otherwise the changes might cause errors and make your implementation invalid.

## Note 2: 
## The user-item rating matrix is imputed_train_ds, 
## and the missing values are those 0s in imputed_train_ds. 
## You are required to predict them by using the solution in the given report. 

## The following parameters are required in the given report, 
## which is named "Effective Missing Data Prediction for Collaborative Filtering", 
## and you will need to use them. But, please do not change their values. 
LAMBDA = 0.7    # λ
GAMMA = 10      # γ
DELTA = 10      # δ
ITA = 0.7       # η
THETA = 0.7     # θ
EPSILON = 1e-9


### Code written by Ram

# Matrix containing user-based relations
pearson_mat = np.zeros((len(df['user_id'].unique()),len(df['user_id'].unique())))

for i,user_vector1 in enumerate(imputed_train_ds):
    for j,user_vector2 in enumerate(imputed_train_ds):
        
        # Masking the vectors with rating 0
        mask_i = user_vector1>0
        mask_j = user_vector2>0
        
        # Finding intersection of items between users
        corratedItems = np.intersect1d(np.where(mask_i),np.where(mask_j))
        
        # Skipping if there are no common items
        if len(corratedItems)==0:
            continue
        
        # Removing items with 0 rating
        user_masked_vector1 = user_vector1[mask_i]
        user_masked_vector2 = user_vector2[mask_j]
        
        # Finding mean of every user
        mean_vector1 = np.sum(user_masked_vector1)/(np.sum(np.clip(user_vector1,0,1))+EPSILON)
        mean_vector2 = np.sum(user_masked_vector2)/(np.sum(np.clip(user_vector2,0,1))+EPSILON)
        
        # Subtracting mean
        subtracted_i = user_vector1[corratedItems]-mean_vector1
        subtracted_j = user_vector2[corratedItems]-mean_vector2
        
        # Multiplying the subtracted mean with each other
        numerator = subtracted_i * subtracted_j
        
        # Taking the summition
        numerator = np.sum(numerator)
        
        # Squaring the values for denominator
        squared_i = np.square(subtracted_i)
        squared_j = np.square(subtracted_j)
        
        # Taking the squareroot
        sqrt_i = np.sqrt(np.sum(squared_i))
        sqrt_j = np.sqrt(np.sum(squared_j))
        
        # Completing the denominator
        denominator = sqrt_i*sqrt_j
        
        # The similarity
        sim = numerator/(denominator+EPSILON)
        
        # The final similarity with significance weighting
        pearson_mat[i][j] = (min(len(corratedItems),GAMMA)/GAMMA)*sim
        
### Item-based interaction matrix
item_mat = np.zeros((len(df['item_id'].unique()),len(df['item_id'].unique())))

for i,item_i in enumerate(imputed_train_ds.T):
    
    # Masking the vectors with rating 0
    mask_i = item_i>0
                    
    for j,item_j in enumerate(imputed_train_ds.T):
        
        # Masking the vectors with rating 0
        mask_j = item_j>0
        
        # Finding intersection of items between users
        corratedIndices = np.intersect1d(np.where(mask_i),np.where(mask_i))
        
         # Skipping if there are no common items
        if(len(corratedIndices)==0):
            continue
        
        # Finding mean of every user
        mean_item_i = np.sum(item_i)/np.sum(np.clip(item_i,0,1))+EPSILON
        mean_item_j = np.sum(item_j)/np.sum(np.clip(item_j,0,1))+EPSILON
        
         # Subtracting mean from item ratings
        subtracted_item_i = item_i[corratedIndices] - mean_item_i
        subtracted_item_j = item_j[corratedIndices] - mean_item_j
        
        # Multiplying the subtracted mean with each other
        subtracted = subtracted_item_i*subtracted_item_j
        
        # Summing the subtracted values to get the numerator
        numerator = np.sum(subtracted)
        
        # Finding the squareroot of sum of the squares of subtracted values
        sqrt_i = np.sqrt(np.sum(np.square(subtracted_item_i)))
        sqrt_j = np.sqrt(np.sum(np.square(subtracted_item_j)))
        
        # Multiplying the squareroot to find the denominator
        denominator = sqrt_i*sqrt_j
        
        # Finding the similarity between the items
        sim = numerator/denominator+EPSILON
        
        # The item matrix with significance weighting
        item_mat[i][j] = (min(len(corratedIndices),DELTA)/DELTA)*sim
        
        
k=10
np_predictions = np.zeros((imputed_train_ds.shape[0],imputed_train_ds.shape[1]))
np_predictions2 = np.zeros((imputed_train_ds.shape[0],imputed_train_ds.shape[1]))

## Predictions

# User-based predictions
for (i,j),rating in np.ndenumerate(imputed_train_ds):
    if rating==0:
        
        # Sorting the similar users list in ascending order
        sim_users_list = np.argsort(pearson_mat[i])
        
        # Filtering the above list with values only above ITA
        sim_users_ITA = sim_users_list>ITA
        sim_users_list = sim_users_list[sim_users_ITA]
        
        # Taking out top k similar users
        sim_users_id = sim_users_list[-k:]
        
        # Storing values and users in different variables
        sim_val = pearson_mat[i][sim_users_id]
        sim_users = imputed_train_ds[sim_users_id]
        
        # Finding out the mean of current user
        curr_user_mean = np.sum(imputed_train_ds[i]) / (np.sum(np.clip(imputed_train_ds[i], 0, 1)) + EPSILON)
        
        # Finding the mean of the users used to compare with
        sim_users_mean = np.sum(sim_users,axis=1)/ (np.sum(np.clip(sim_users, 0, 1),axis=1) + EPSILON)
        
        masked_j = sim_users[:,j]>0
        
        # Subtracting mean from each user_item
        subtracted_mean = sim_users[masked_j,j]-sim_users_mean[masked_j]
        
        # Summing up the multiplication of all
        numerator = np.sum(sim_val[masked_j]*subtracted_mean)
        denominator = np.sum(sim_val[masked_j])
        
        # Filling the predictions matrix with the calculated values
        np_predictions[i][j] = curr_user_mean + (numerator/(denominator+EPSILON))
        np_predictions[i][j] = np.clip(np_predictions[i][j], 0, 5)
        
# Item-based predictions
for (i, j), rating in np.ndenumerate(imputed_train_ds):
    if rating == 0:
        
        # Sorting the similar users list in ascending order
        sim_items_list = np.argsort(item_mat[j])
        
        # Filtering the above list with values only above ITA
        sim_items_THETA = sim_items_list>THETA
        sim_items_list = sim_items_list[sim_items_THETA]
        
        # Taking out top k similar users
        sim_items_id = sim_items_list[-k:]
        
        # Storing values and users in different variables
        sim_val = item_mat[j][sim_items_id]
        sim_items = imputed_train_ds.T[sim_items_id]
        
        # Finding the mean of the current item
        item_mean = np.sum(imputed_train_ds.T[j]) / (np.sum(np.clip(imputed_train_ds.T[j], 0, 1)) + EPSILON)
        
        # Finding the mean of the users used to compare with
        sim_items_mean = np.sum(sim_items, axis=1) / (np.sum(np.clip(sim_items, 0, 1), axis=1) + EPSILON)
        
        # Clipping similar items
        clipped_val = np.clip(sim_items[:, i], 0, 1)
        
        # Assigning numerator
        numerator = sim_val * (sim_items[:, i] - sim_items_mean)
        
        # Filtering unrated items
        numerator*=clipped_val
        numerator = np.sum(numerator)
        
        # Concluding denominator
        denominator = (np.sum(sim_val * clipped_val))
                       
        np_predictions2[i][j] = item_mean + numerator /( denominator  + EPSILON)
        np_predictions2[i][j] = np.clip(np_predictions2[i][j], 0, 5)

## Inducing predicted values to missing values present in the data
prediction_mat = np.zeros((np_predictions.shape[0],np_predictions.shape[1]))

for (i,j),rating in np.ndenumerate(np_predictions):
    
    if(rating==0 and np_predictions2[i][j]!=0):
        prediction_mat[i][j] = np_predictions2[i][j]
        
    elif(rating!=0 and np_predictions2[i][j]==0):
        prediction_mat[i][j] = np_predictions[i][j]
    
    elif(rating!=0 and np_predictions2[i][j]!=0):
        prediction_mat[i][j] = (LAMBDA*np_predictions[i][j]) + ((1-LAMBDA)*np_predictions2[i][j])
        
    else:
        pass
    
imputed_train_ds = prediction_mat

# Evaluation

### Compute Pearson Correlation Coefficient of All Pairs of Items between active set and imputed training set

In [8]:
imputed_train_ds = pd.DataFrame(imputed_train_ds)
imputed_train_ds

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,4.581753,0.000000,3.660598,0.000000,3.467222,0.000000,0.000000,4.019763,3.569144,0.000000,...,3.018394,0.000000,0.000000,3.799147,0.000000,0.000000,3.053680,3.765139,3.449745,3.622115
1,0.000000,3.373278,4.939256,0.000000,3.687125,3.580253,4.444808,4.016255,4.038264,3.831367,...,2.419894,4.205243,3.785297,3.344544,4.161691,3.390603,1.558498,3.287280,3.643226,3.797516
2,4.183708,3.858869,3.892563,3.620879,0.000000,3.401975,3.779914,4.669088,3.020150,0.000000,...,1.986957,3.559077,3.813724,4.419235,3.882806,3.564738,3.374836,3.710950,3.606957,3.875243
3,0.000000,2.934184,0.000000,3.258543,3.529573,0.000000,0.000000,0.000000,3.645551,0.000000,...,1.731490,0.000000,3.473887,3.332913,3.673560,2.952169,1.841322,0.000000,3.291298,3.264462
4,0.000000,3.202663,0.000000,3.431336,0.000000,3.359742,0.000000,0.000000,2.970976,2.985576,...,3.039851,3.083481,3.082365,0.000000,0.000000,1.469846,3.003057,3.173929,2.412892,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,3.779814,2.268861,0.000000,0.000000,2.792787,0.000000,2.589836,2.662405,2.529035,0.000000,...,2.359504,1.797482,2.943358,2.674360,2.764667,1.458235,1.527491,3.177193,0.000000,2.639480
296,0.000000,3.500880,0.000000,3.246436,3.348096,4.120869,4.326042,3.468269,3.518608,3.986663,...,2.970124,2.397138,3.548340,3.312995,3.550024,2.661550,1.193254,3.267224,1.667009,3.465007
297,4.215449,4.385284,4.305475,0.000000,4.785224,4.447656,4.672000,4.723955,4.341439,4.271970,...,3.027945,3.935733,4.154304,3.760014,3.427416,3.641603,3.982757,4.600000,3.919506,3.916202
298,2.664052,2.956111,3.395053,0.000000,3.218173,1.861516,2.934168,0.000000,2.152368,3.417271,...,3.330574,3.012024,0.000000,3.043877,1.046334,0.000000,1.933137,3.601184,2.950766,2.660998


In [9]:
active_user_pearson_corr = np.zeros((active_ds.shape[0], train_ds.shape[0]))

# Compute Pearson Correlation Coefficient of All Pairs of Users between active set and imputed training set
for i, user_i_vec in enumerate(active_ds.values):
    for j, user_j_vec in enumerate(imputed_train_ds.values):
        
        # ratings corated by the current pair od users
        mask_i = user_i_vec > 0
        mask_j = user_j_vec > 0

        # corrated item index, skip if there are no corrated ratings
        corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
        if len(corrated_index) == 0:
            continue

        # average value of user_i_vec and user_j_vec
        mean_user_i = np.sum(user_i_vec) / (np.sum(np.clip(user_i_vec, 0, 1)) + EPSILON)
        mean_user_j = np.sum(user_j_vec) / (np.sum(np.clip(user_j_vec, 0, 1)) + EPSILON)

        # compute pearson corr
        user_i_sub_mean = user_i_vec[corrated_index] - mean_user_i
        user_j_sub_mean = user_j_vec[corrated_index] - mean_user_j

        r_ui_sub_r_i_sq = np.square(user_i_sub_mean)
        r_uj_sub_r_j_sq = np.square(user_j_sub_mean)

        r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))
        r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))

        sim = np.sum(user_i_sub_mean * user_j_sub_mean) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)

        # significance weighting
        weighted_sim = (min(len(corrated_index), GAMMA) / GAMMA) * sim

        active_user_pearson_corr[i][j] = weighted_sim

active_user_pearson_corr

array([[ 0.19433452,  0.04046047,  0.62382812, ...,  0.21078391,
         0.07517011, -0.5856483 ],
       [ 0.34948738,  0.50167079,  0.20798487, ...,  0.57602789,
         0.62314417,  0.21467716],
       [ 0.59471376,  0.27149825,  0.56048777, ...,  0.655147  ,
         0.81728031,  0.58021325],
       ...,
       [-0.34524592,  0.18517001, -0.00779502, ..., -0.14471385,
         0.08747431,  0.43615217],
       [ 0.39528873,  0.43940098,  0.07606676, ...,  0.16376734,
         0.18582829,  0.72253193],
       [ 0.28577773,  0.30786392,  0.12933753, ...,  0.42801929,
         0.35121057,  0.33231439]])

## Predict Ratings of Testing Set

In [10]:
K = 10

test_ds_pred = np.zeros_like(test_ds.values)

for (i, j), rating in np.ndenumerate(test_ds.values):

    if rating > 0:

        sim_user_ids = np.argsort(active_user_pearson_corr[i])[-1:-(K + 1):-1]

        #==================user-based==================#
        # the coefficient values of similar users
        sim_val = active_user_pearson_corr[i][sim_user_ids]

        # the average value of the current user's ratings
        sim_users = imputed_train_ds.values[sim_user_ids]
        user_mean = np.sum(active_ds.values[i]) / (np.sum(np.clip(active_ds.values[i], 0, 1)) + EPSILON)
        sim_user_mean = np.sum(sim_users, axis=1) / (np.sum(np.clip(sim_users, 0, 1), axis=1) + EPSILON)

        # select the users who rated item j
        mask_rated_j = sim_users[:, j] > 0
        
        # sim(u, v) * (r_vj - mean_v)
        sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])
        
        user_based_pred = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)
        user_based_pred = np.clip(user_based_pred, 0, 5)

        test_ds_pred[i][j] = user_based_pred
        
test_ds_pred

array([[0.        , 0.        , 0.        , ..., 0.        , 3.22804324,
        0.        ],
       [0.        , 0.        , 4.22202592, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 4.37742322,
        0.        ],
       ...,
       [4.29209929, 0.        , 4.6677238 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 3.44100324, 0.        , ..., 0.        , 0.        ,
        0.        ]])

## Compute MAE and RMSE

In [11]:
# MAE
MAE = np.sum(np.abs(test_ds_pred - test_ds.values)) / np.sum(np.clip(test_ds.values, 0, 1))

# RMSE
RMSE = np.sqrt(np.sum(np.square(test_ds_pred - test_ds.values)) / np.sum(np.clip(test_ds.values, 0, 1)))

print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0.755930374411552, RMSE: 0.9691529370013368
