In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt


In [20]:
train_data_matrix = np.loadtxt("./dataset/np_matrix_1.txt") 
print("shape: ",train_data_matrix.shape)

shape:  (2511, 4499)


# Similarity basd Collaborative Filter System



In [99]:
def get_union(vector1,vector2):
    new_vector1 = vector1[(vector1!=0 )& (vector2 != 0)]
    new_vector2 = vector2[(vector1!=0 )& (vector2 != 0)]
    return new_vector1,new_vector2

In [100]:
def cosine_similarity(vector1, vector2):
    new_vector1, new_vector2 = get_union(vector1,vector2)
    if len(new_vector1)==0 or len(new_vector2)==0:
        return 0
    dot_product = (new_vector1*new_vector2).sum()
    normA = ((new_vector1)**2).sum()
    normB = ((new_vector2)**2).sum()
    return round(dot_product / ((normA**0.5)*(normB**0.5)), 2)

In [101]:
def pearson_similarity(vector1, vector2):
    new_vector1, new_vector2 = get_union(vector1,vector2)
    if len(new_vector1)==0 or len(new_vector2)==0:
        return 0
    new_vector1_mean = new_vector1 - np.mean(new_vector1)
    new_vector2_mean = new_vector2 - np.mean(new_vector2)
    norm = np.linalg.norm(new_vector1_mean)*np.linalg.norm(new_vector2_mean)
    return np.dot(new_vector1_mean,new_vector2_mean)/norm

In [108]:
def user_sim(train_data_matrix, algo = "cosine"):
    user_similarity = np.zeros((train_data_matrix.shape[0], train_data_matrix.shape[0]))
    for i, cur in enumerate(user_similarity):
        for j,v in enumerate(cur):
            if user_similarity[i][j] == 0:
                if algo == "cosine":
                    user_similarity[i][j] = cosine_similarity(train_data_matrix[i], train_data_matrix[j])
                else:
                    user_similarity[i][j] = pearson_similarity(train_data_matrix[i], train_data_matrix[j])
                user_similarity[j][i] = user_similarity[i][j]
    print(user_similarity.max(), user_similarity.min())
    print (user_similarity.shape)
    print(user_similarity)
    return user_similarity

In [102]:
def item_sim(train_data_matrix, algo = "cosine"):
    train_data_matrix_t = train_data_matrix.T
    
    item_similarity = np.zeros((train_data_matrix.shape[1], train_data_matrix.shape[1]))
    for i, cur in enumerate(item_similarity):
        for j,v in enumerate(cur):
            if item_similarity[i][j] == 0:
                if algo == "cosine":
                    item_similarity[i][j] = cosine_similarity(train_data_matrix_t[i], train_data_matrix_t[j])
                else:
                    item_similarity[i][j] = pearson_similarity(train_data_matrix_t[i], train_data_matrix_t[j])
                item_similarity[j][i] = item_similarity[i][j]
    print(item_similarity.max(), item_similarity.min())
    print (item_similarity.shape)
    print(item_similarity)
    return item_similarity

In [104]:
def predict_user(ratings, similarity):
    mean_user_rating = ratings.mean(axis=1)
    ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
    
    pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff)/np.array([np.abs(similarity).sum(axis=1)]).T
    for line in pred:
        line[np.isnan(line)] = np.nanmean(line)
    return pred

In [122]:
def predict_item(ratings, similarity):
    pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    for line in pred:
        line[np.isnan(line)] = np.nanmean(line)
    return pred

In [127]:
def rmsefun(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

def maefun(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    absError = abs(prediction - ground_truth)
    return sum(absError) / len(absError)

In [72]:
sp_train_matrix = train_data_matrix[:100]
sp_test_matrix = train_data_matrix[100:200]

In [109]:
user_similarity = user_sim(sp_train_matrix)

1.0 0.0
(100, 100)
[[1.   0.   0.98 ... 0.97 0.98 0.84]
 [0.   1.   1.   ... 0.   0.   0.  ]
 [0.98 1.   1.   ... 0.98 0.98 1.  ]
 ...
 [0.97 0.   0.98 ... 1.   0.99 0.97]
 [0.98 0.   0.98 ... 0.99 1.   1.  ]
 [0.84 0.   1.   ... 0.97 1.   1.  ]]


In [129]:
user_pred = predict_user(sp_train_matrix, user_similarity)
print(user_pred)

[[-0.00423636 -0.00423636  0.08122311 ... -0.00423636 -0.00423636
  -0.00423636]
 [-0.08557457 -0.08557457  0.20854307 ... -0.08557457 -0.08557457
  -0.08557457]
 [-0.0228834  -0.0228834   0.05139279 ... -0.0228834  -0.0228834
  -0.0228834 ]
 ...
 [ 0.02555048  0.02555048  0.11502741 ...  0.02555048  0.02555048
   0.02555048]
 [-0.03496    -0.03496     0.02945759 ... -0.03496    -0.03496
  -0.03496   ]
 [-0.0256153  -0.0256153   0.03668156 ... -0.0256153  -0.0256153
  -0.0256153 ]]


In [131]:
user_pred.shape

(100, 4499)

In [130]:
user_rmse = rmsefun(user_pred, sp_test_matrix) # userbase rmse
user_mae = maefun(user_pred, sp_test_matrix) # userbase mae
print("user-base model RMSE : %.3f"%(user_rmse))
print("user-base model MAE : %.3f"%(user_mae))

user-base model RMSE : 3.250
user-base model MAE : 3.048


In [117]:
item_similarity = item_sim(sp_train_matrix)

1.0 0.0
(4499, 4499)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [123]:
item_pred = predict_item(sp_train_matrix, item_similarity)
print(item_pred)

[[0.34424976 0.34424976 0.5956721  ... 0.34424976 0.34424976 0.34424976]
 [0.01063755 0.01063755 0.05329922 ... 0.01063755 0.01063755 0.01063755]
 [0.23135058 0.23135058 0.25583626 ... 0.23135058 0.23135058 0.23135058]
 ...
 [0.47861012 0.47861012 0.33013538 ... 0.47861012 0.47861012 0.47861012]
 [0.12873536 0.12873536 0.24517642 ... 0.12873536 0.12873536 0.12873536]
 [0.14570407 0.14570407 0.54365206 ... 0.14570407 0.14570407 0.14570407]]


  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app


In [132]:
item_pred.shape

(100, 4499)

In [128]:
item_rmse = rmsefun(item_pred, sp_train_matrix) # itembase rmse
item_mae = maefun(item_pred, sp_train_matrix) # itembase mae
print("item_rmse model RMSE : %.3f"%(item_rmse))
print("item_rmse model MAE : %.3f"%(item_mae))

item_rmse model RMSE : 2.898
item_rmse model MAE : 2.552


In [None]:
# test_data_matrix = np.loadtxt("./dataset/np_matrix_2.txt") 
# print("shape: ",test_data_matrix.shape)