In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'pandas'

In [None]:
def compute_prediction(userid, itemid, similarity_mtx, utility):
    """Predict one rating with item-based collaborative filtering.

    The prediction is the similarity-weighted average of the ratings the user
    has already given::

        sum(rating[j] * sim[itemid, j]) / sum(sim[itemid, j] for rated j)

    Parameters
    ----------
    userid : int
        User label, looked up in ``utility.columns``.
    itemid : int
        Item label, looked up in ``utility.index``.
    similarity_mtx : array-like, shape (n_items, n_items)
        Item-item similarities whose rows/columns follow the row order of
        ``utility``.
    utility : pandas.DataFrame
        Item x user rating matrix in which 0 means "not rated".

    Returns
    -------
    float
        The predicted rating. Falls back to the user's mean observed rating
        when the weighted average is undefined (this includes items that are
        absent from ``utility``), and to NaN when the user has no observed
        rating at all.
    """
    n_items = utility.shape[0]

    # Resolve ids by label rather than by ``id - 1`` position: as soon as one
    # id is missing from ``utility`` every later position is shifted, so the
    # positional shortcut reads the wrong row/column or runs out of bounds.
    if userid in utility.columns:
        user_rating = utility.loc[:, userid].to_numpy(dtype=float)
    else:
        user_rating = np.zeros(n_items)
    rated = user_rating > 0

    if itemid in utility.index:
        row = utility.index.get_loc(itemid)
        item_similarity = np.asarray(similarity_mtx, dtype=float)[row]
    else:
        # Unseen item: no similarity information, handled by the fallback below.
        item_similarity = np.zeros(n_items)

    numerate = np.dot(user_rating, item_similarity)
    denom = item_similarity[rated].sum()

    if denom == 0 or numerate == 0:
        # No usable neighbour: predict the user's average observed rating.
        return user_rating[rated].mean() if rated.any() else np.nan

    return numerate / denom

def compute_rmse(test_set, test_real, similarity_mtx, utility, pred_func, **kwargs):
    """Root-mean-square error of ``pred_func`` over a set of (user, item) pairs.

    Parameters
    ----------
    test_set : iterable
        Rows whose first two entries are ``userid`` and ``itemid``.
    test_real : array-like
        True ratings, one per row of ``test_set``.
    similarity_mtx, utility
        Passed through unchanged to ``pred_func``.
    pred_func : callable
        Called as ``pred_func(userid=..., itemid=..., similarity_mtx=...,
        utility=..., **kwargs)`` and expected to return a number.
    **kwargs
        Extra keyword arguments forwarded to ``pred_func``.

    Returns
    -------
    float
        The RMSE, or NaN when ``test_set`` is empty.

    Raises
    ------
    ValueError
        If the number of predictions and of true ratings differ.
    """
    pred = np.array(
        [
            pred_func(userid=row[0],
                      itemid=row[1],
                      similarity_mtx=similarity_mtx,
                      utility=utility,
                      **kwargs)
            for row in test_set
        ],
        dtype=float,
    )
    # Coerce explicitly so plain Python lists are accepted as well as arrays.
    actual = np.asarray(test_real, dtype=float)

    if actual.shape != pred.shape:
        # Fail loudly instead of letting numpy broadcast mismatched inputs.
        raise ValueError(
            f'test_real has shape {actual.shape} but {pred.shape[0]} predictions were made'
        )
    if pred.size == 0:
        return np.nan

    return np.sqrt(np.mean((actual - pred) ** 2))

In [None]:
# Locations of the ml-100k "u1" train / test files
train_path = '../data/ml-100k/u1.base'
test_path = '../data/ml-100k/u1.test'

# Both files are read as tab-separated text; column names are supplied here
rating_columns = ['userid', 'itemid', 'rating', 'timestamp']
df = pd.read_csv(train_path, sep='\t', names=rating_columns)
test = pd.read_csv(test_path, sep='\t', names=rating_columns)

# (userid, itemid) pairs to predict, and the ratings to compare against
test_set = test.loc[:, ['userid', 'itemid']].to_numpy()
test_real = test.loc[:, 'rating'].to_numpy()

# Item x user utility matrix; a missing rating is stored as 0
utility = df.pivot(index='itemid', columns='userid', values='rating').fillna(0)

# Item-item cosine similarity = 1 - cosine distance between rows of the utility matrix
similarity_mtx = 1 - squareform(pdist(utility, metric='cosine'))

In [None]:
# Baseline: RMSE of the plain similarity-weighted predictor on the test pairs
compute_rmse(test_set=test_set,
             test_real=test_real,
             similarity_mtx=similarity_mtx,
             utility=utility,
             pred_func=compute_prediction)

In [None]:
# take the top n similar items
def compute_prediction_v1(userid, itemid, similarity_mtx, utility, top_n):
    """Predict one rating from the ``top_n`` most similar items the user rated.

    Parameters
    ----------
    userid : int
        User label, looked up in ``utility.columns``.
    itemid : int
        Item label, looked up in ``utility.index``.
    similarity_mtx : array-like, shape (n_items, n_items)
        Item-item similarities whose rows/columns follow the row order of
        ``utility``.
    utility : pandas.DataFrame
        Item x user rating matrix in which 0 means "not rated".
    top_n : int or None
        Maximum number of neighbours to use. ``None`` or a negative value
        means "use every rated neighbour".

    Returns
    -------
    float
        Similarity-weighted average of the selected ratings. Falls back to the
        user's mean observed rating when no neighbour is usable, and to NaN
        when the user has no observed rating at all.
    """
    n_items = utility.shape[0]

    # Resolve ids by label rather than by ``id - 1`` position: a single id
    # missing from ``utility`` shifts every later position.
    if userid in utility.columns:
        user_rating = utility.loc[:, userid].to_numpy(dtype=float)
    else:
        user_rating = np.zeros(n_items)

    if itemid in utility.index:
        row = utility.index.get_loc(itemid)
        item_similarity = np.asarray(similarity_mtx, dtype=float)[row]
    else:
        item_similarity = np.zeros(n_items)

    # Keep only (rating, similarity) pairs where the user rated the item and
    # the similarity is non-zero.
    usable = (user_rating != 0) & (item_similarity != 0)
    ratings = user_rating[usable]
    sims = item_similarity[usable]

    # Most similar first; a stable sort keeps the original order among ties.
    order = np.argsort(-sims, kind='stable')
    if top_n is not None and top_n >= 0:
        # A negative bound would slice from the end and silently drop
        # neighbours, so only non-negative bounds are applied.
        order = order[:top_n]

    numerate = np.dot(ratings[order], sims[order])
    denom = sims[order].sum()

    if denom == 0 or numerate == 0:
        rated = user_rating > 0
        return user_rating[rated].mean() if rated.any() else np.nan

    return numerate / denom

In [None]:
# Sweep the neighbourhood size and keep one [top_n, rmse] row per setting
x_val = list(range(10, 25))
chart_val = [
    [x, compute_rmse(test_set, test_real, similarity_mtx, utility,
                     compute_prediction_v1, top_n=x)]
    for x in x_val
]

# Column 0 holds top_n, column 1 the matching RMSE
chart_val_np = np.array(chart_val)
plt.plot(chart_val_np[:, 0], chart_val_np[:, 1])

In [None]:
# Row of chart_val with the lowest RMSE (second entry of each row)
best_model = min(chart_val, key=lambda row: row[1])
print("****************",
      'best model:',
      f'top_n = {best_model[0]}',
      f'rmse = {best_model[1]}',
      "****************",
      sep='\n')

## Case Amplification

In [None]:
def compute_prediction_v2(userid, itemid, similarity_mtx, utility, top_n, amp):
    """Predict one rating using case-amplified similarities.

    Each similarity ``s`` is replaced by ``sign(s) * |s| ** amp`` before the
    ``top_n`` most similar rated items are averaged, so exponents above 1
    shrink weak similarities more than strong ones.

    Parameters
    ----------
    userid : int
        User label, looked up in ``utility.columns``.
    itemid : int
        Item label, looked up in ``utility.index``.
    similarity_mtx : array-like, shape (n_items, n_items)
        Item-item similarities whose rows/columns follow the row order of
        ``utility``.
    utility : pandas.DataFrame
        Item x user rating matrix in which 0 means "not rated".
    top_n : int or None
        Maximum number of neighbours to use. ``None`` or a negative value
        means "use every rated neighbour".
    amp : float
        Amplification exponent applied to the similarities.

    Returns
    -------
    float
        Weighted average of the selected ratings. Falls back to the user's
        mean observed rating when no neighbour is usable, and to NaN when the
        user has no observed rating at all.
    """
    n_items = utility.shape[0]

    # Resolve ids by label rather than by ``id - 1`` position: a single id
    # missing from ``utility`` shifts every later position.
    if userid in utility.columns:
        user_rating = utility.loc[:, userid].to_numpy(dtype=float)
    else:
        user_rating = np.zeros(n_items)

    if itemid in utility.index:
        row = utility.index.get_loc(itemid)
        item_similarity = np.asarray(similarity_mtx, dtype=float)[row]
    else:
        item_similarity = np.zeros(n_items)

    # Sign-preserving power: identical to ``s ** amp`` for s >= 0, but does not
    # produce NaN for a negative similarity combined with a fractional exponent.
    item_similarity = np.sign(item_similarity) * np.abs(item_similarity) ** amp

    # Keep only (rating, similarity) pairs where the user rated the item and
    # the amplified similarity is non-zero.
    usable = (user_rating != 0) & (item_similarity != 0)
    ratings = user_rating[usable]
    sims = item_similarity[usable]

    # Most similar first; a stable sort keeps the original order among ties.
    order = np.argsort(-sims, kind='stable')
    if top_n is not None and top_n >= 0:
        # top_n = -1 used to become [:-1], which drops the least similar
        # neighbour instead of keeping all of them.
        order = order[:top_n]

    numerate = np.dot(ratings[order], sims[order])
    denom = sims[order].sum()

    if denom == 0 or numerate == 0:
        rated = user_rating > 0
        return user_rating[rated].mean() if rated.any() else np.nan

    return numerate / denom

In [None]:
# Sweep the case-amplification exponent and keep one [amp, rmse] row per setting
chart_val = []
x_val = np.linspace(1,10,19)

for x in x_val:
    rmse = compute_rmse(test_set, test_real, similarity_mtx, utility, 
                        compute_prediction_v2, 
                        # None = no neighbour cap. The neighbour list is sliced
                        # with [:top_n], so -1 would drop the least similar
                        # neighbour instead of keeping all of them.
                        top_n = None,
                        amp = x)
    chart_val.append([x,rmse])

# Column 0 holds the exponent, column 1 the matching RMSE
chart_val_np = np.array(chart_val)
plt.plot(chart_val_np[:, 0], chart_val_np[:,1])

In [None]:
# Row of chart_val with the lowest RMSE (second entry of each row)
best_model = min(chart_val, key=lambda row: row[1])
print("****************",
      'best model:',
      f'amp = {best_model[0]}',
      f'rmse = {best_model[1]}',
      "****************",
      sep='\n')

## Default Voting

In [None]:
def default_voting_item_amp(df, similarity_mtx, threshold, amp):
    """Return a copy of ``similarity_mtx`` rescaled for frequently rated items.

    For every item that has more than ``threshold`` rows in ``df``, the item's
    column and then its row of the similarity matrix are multiplied by
    ``amp``. An entry whose row and column both belong to such items is
    therefore scaled by ``amp ** 2`` (this includes the diagonal).

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings in long format; only the ``itemid`` column is read.
    similarity_mtx : array-like, shape (n_items, n_items)
        Item-item similarity matrix. It is not modified.
    threshold : int
        An item is rescaled when its number of rows in ``df`` is strictly
        greater than this value.
    amp : float
        Multiplicative factor.

    Returns
    -------
    numpy.ndarray
        A new float matrix with the rescaled similarities.
    """
    new_sim = np.array(similarity_mtx, dtype=float, copy=True)

    # Number of rows per item; groupby returns the item ids in sorted order.
    item_count = df.groupby('itemid').size()
    popular = (item_count > threshold).to_numpy()

    if popular.size == new_sim.shape[0]:
        # One matrix row per distinct item in df: assume row k is the k-th
        # smallest item id (the order a pivot on itemid produces). This stays
        # correct when the ids have gaps, unlike ``itemid - 1``.
        positions = np.flatnonzero(popular)
    else:
        # Matrix not built from exactly these items: assume it is laid out
        # by ``itemid - 1`` -- TODO confirm against the caller.
        positions = item_count.index.to_numpy()[popular] - 1

    new_sim[:, positions] *= amp
    new_sim[positions, :] *= amp

    return new_sim

In [None]:
# Rows of df per item, ordered from least to most frequently rated
item_count = (
    df.groupby('itemid')
      .size()
      .reset_index(name='count')
      .sort_values('count')
)

# Histogram of how many items fall into each rating-count bucket
ax = item_count['count'].hist(bins=30, figsize=(10, 8))
ax.set_xlabel('count')
ax.set_ylabel('number of items')

In [None]:
# Items with more than `threshold` ratings get their similarities scaled by amp
threshold = 100
x_val = np.linspace(0.1, 2, 20)

# One [amp, rmse] row per scaling factor, scored with the baseline predictor
chart_val = [
    [amp, compute_rmse(test_set, test_real,
                       default_voting_item_amp(df, similarity_mtx, threshold, amp),
                       utility, compute_prediction)]
    for amp in x_val
]

# Column 0 holds amp, column 1 the matching RMSE
chart_val_np = np.array(chart_val)
plt.plot(chart_val_np[:, 0], chart_val_np[:, 1])

In [None]:
# Row of chart_val with the lowest RMSE (second entry of each row)
best_model = min(chart_val, key=lambda row: row[1])
print("****************",
      'best model:',
      f'amp = {best_model[0]}',
      f'rmse = {best_model[1]}',
      sep='\n')