In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt


In [2]:
train_data_matrix = np.loadtxt("./dataset/sprase_train_2.txt") 
print("shape: ",train_data_matrix.shape)

test_data_matrix = np.loadtxt("./dataset/sprase_test_2.txt") 
print("shape: ",test_data_matrix.shape)

shape:  (131098, 4499)
shape:  (14566, 4499)


# Similarity basd Collaborative Filter System



In [3]:
def get_union(vector1,vector2):
    new_vector1 = vector1[(vector1!=0 )& (vector2 != 0)]
    new_vector2 = vector2[(vector1!=0 )& (vector2 != 0)]
    return new_vector1,new_vector2

In [4]:
def cosine_similarity(vector1, vector2):
    new_vector1, new_vector2 = get_union(vector1,vector2)
    if len(new_vector1)==0 or len(new_vector2)==0:
        return 0
    dot_product = (new_vector1*new_vector2).sum()
    normA = ((new_vector1)**2).sum()
    normB = ((new_vector2)**2).sum()
    return round(dot_product / ((normA**0.5)*(normB**0.5)), 2)

In [5]:
def pearson_similarity(vector1, vector2):
    new_vector1, new_vector2 = get_union(vector1,vector2)
    if len(new_vector1)==0 or len(new_vector2)==0:
        return 0
    new_vector1_mean = new_vector1 - np.mean(new_vector1)
    new_vector2_mean = new_vector2 - np.mean(new_vector2)
    norm = np.linalg.norm(new_vector1_mean)*np.linalg.norm(new_vector2_mean)
    return np.dot(new_vector1_mean,new_vector2_mean)/norm

In [6]:
def user_sim(train_data_matrix, algo = "cosine"):
    user_similarity = np.zeros((train_data_matrix.shape[0], train_data_matrix.shape[0]))
    for i, cur in enumerate(user_similarity):
        for j,v in enumerate(cur):
            if user_similarity[i][j] == 0:
                if algo == "cosine":
                    user_similarity[i][j] = cosine_similarity(train_data_matrix[i], train_data_matrix[j])
                else:
                    user_similarity[i][j] = pearson_similarity(train_data_matrix[i], train_data_matrix[j])
                user_similarity[j][i] = user_similarity[i][j]
    print(user_similarity.max(), user_similarity.min())
    print (user_similarity.shape)
    print(user_similarity)
    return user_similarity

In [7]:
def item_sim(train_data_matrix, algo = "cosine"):
    train_data_matrix_t = train_data_matrix.T
    
    item_similarity = np.zeros((train_data_matrix.shape[1], train_data_matrix.shape[1]))
    for i, cur in enumerate(item_similarity):
        for j,v in enumerate(cur):
            if item_similarity[i][j] == 0:
                if algo == "cosine":
                    item_similarity[i][j] = cosine_similarity(train_data_matrix_t[i], train_data_matrix_t[j])
                else:
                    item_similarity[i][j] = pearson_similarity(train_data_matrix_t[i], train_data_matrix_t[j])
                item_similarity[j][i] = item_similarity[i][j]
    print(item_similarity.max(), item_similarity.min())
    print (item_similarity.shape)
    print(item_similarity)
    return item_similarity

In [8]:
def predict_user(ratings, similarity):
    mean_user_rating = ratings.mean(axis=1)
    ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
    
    pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff)/np.array([np.abs(similarity).sum(axis=1)]).T
    for line in pred:
        line[np.isnan(line)] = np.nanmean(line)
    return pred

In [9]:
def predict_item(ratings, similarity):
    pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    for line in pred:
        line[np.isnan(line)] = np.nanmean(line)
    return pred

In [10]:
def rmsefun(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

def maefun(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    absError = abs(prediction - ground_truth)
    return sum(absError) / len(absError)

In [11]:
sp_train_matrix = train_data_matrix[:4499]
sp_test_matrix = test_data_matrix[:4499]

In [12]:
user_similarity = user_sim(sp_train_matrix)

1.0 0.0
(4499, 4499)
[[1.   0.97 0.93 ... 0.93 0.95 0.96]
 [0.97 1.   0.96 ... 0.96 0.95 0.95]
 [0.93 0.96 1.   ... 0.93 0.93 0.95]
 ...
 [0.93 0.96 0.93 ... 1.   0.92 0.92]
 [0.95 0.95 0.93 ... 0.92 1.   0.95]
 [0.96 0.95 0.95 ... 0.92 0.95 1.  ]]


In [13]:
user_pred = predict_user(sp_train_matrix, user_similarity)
print(user_pred)

[[ 0.53382661  0.28935212  1.35478952 ...  0.28494171  0.25181243
   0.2809621 ]
 [ 0.28753751  0.04383394  1.11065471 ...  0.03915908  0.00583597
   0.03423892]
 [ 0.40029968  0.15462279  1.21491216 ...  0.1499857   0.11714905
   0.14606903]
 ...
 [ 0.23867768 -0.00478189  1.0610564  ... -0.00912415 -0.04244602
  -0.01317067]
 [ 0.36171647  0.11843268  1.1864431  ...  0.11406682  0.08119862
   0.1103375 ]
 [ 0.21034627 -0.03500737  1.02895119 ... -0.04043912 -0.07248974
  -0.04387082]]


In [14]:
user_pred.shape

(4499, 4499)

In [15]:
user_rmse = rmsefun(user_pred, sp_test_matrix) # userbase rmse
user_mae = maefun(user_pred, sp_test_matrix) # userbase mae
print("user-base model RMSE : %.3f"%(user_rmse))
print("user-base model MAE : %.3f"%(user_mae))

user-base model RMSE : 2.887
user-base model MAE : 2.639


In [16]:
item_similarity = item_sim(sp_train_matrix)

1.0 0.0
(4499, 4499)
[[1.   0.98 0.95 ... 0.94 0.82 0.98]
 [0.98 1.   0.98 ... 0.95 0.95 0.99]
 [0.95 0.98 1.   ... 0.94 0.92 0.94]
 ...
 [0.94 0.95 0.94 ... 1.   0.93 0.93]
 [0.82 0.95 0.92 ... 0.93 1.   0.91]
 [0.98 0.99 0.94 ... 0.93 0.91 1.  ]]


In [17]:
item_pred = predict_item(sp_train_matrix, item_similarity)
print(item_pred)

[[0.3863222  0.37036744 0.38037886 ... 0.38559788 0.37399926 0.37246983]
 [0.13879973 0.13101939 0.13519077 ... 0.13759923 0.13221575 0.13201399]
 [0.25022545 0.23580179 0.24380928 ... 0.24854951 0.23817976 0.24175168]
 ...
 [0.08923463 0.08489892 0.08834521 ... 0.08903017 0.08594317 0.08553445]
 [0.21392312 0.2054787  0.21302803 ... 0.21340866 0.20746729 0.20685543]
 [0.05910673 0.055394   0.05795342 ... 0.05848939 0.05606327 0.05607349]]


In [18]:
item_pred.shape

(4499, 4499)

In [19]:
item_rmse = rmsefun(item_pred, sp_train_matrix) # itembase rmse
item_mae = maefun(item_pred, sp_train_matrix) # itembase mae
print("item_rmse model RMSE : %.3f"%(item_rmse))
print("item_rmse model MAE : %.3f"%(item_mae))

item_rmse model RMSE : 3.254
item_rmse model MAE : 3.002


In [None]:
# test_data_matrix = np.loadtxt("./dataset/np_matrix_2.txt") 
# print("shape: ",test_data_matrix.shape)