In [21]:
import pandas as pd
import numpy as np

In [22]:
df = pd.read_csv('ml-100k/u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])

In [23]:
n_users = df.user_id.unique().shape[0]  
n_items = df.item_id.unique().shape[0]

In [24]:
from sklearn import model_selection as cv

In [25]:
train_data, test_data = cv.train_test_split(df, test_size=0.25, random_state=42)

In [26]:
train_data

Unnamed: 0,user_id,item_id,rating,timestamp
98980,811,901,4,886377771
69824,804,755,3,879445305
9928,52,287,5,882922357
75599,735,181,4,876698604
95621,897,96,5,879990430
...,...,...,...,...
6265,216,231,2,880245109
54886,343,276,5,876403078
76820,437,475,3,880140288
860,284,322,3,885329671


In [27]:
user_item_matrix = np.zeros((n_users, n_items))
for row in train_data.itertuples():
    user_item_matrix[row[1]-1, row[2]-1] = row[3]
    
test_data_matrix = np.zeros((n_users, n_items))
for row in test_data.itertuples():
    test_data_matrix[row[1]-1, row[2]-1] = row[3]

In [28]:
from sklearn.metrics.pairwise import pairwise_distances

In [29]:
item_similarity = np.ones((n_items, n_items)) - pairwise_distances(user_item_matrix.T, metric='cosine')
user_similarity = np.ones((n_users,n_users))-pairwise_distances(user_item_matrix, metric='cosine')

### Collaborative Filtering

In [30]:
def cf_predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])    
    return pred

In [31]:
user_similarity.sum(axis=1).shape

(943,)

### A random walk method for alleviating the sparsity problem in collaborative filtering

In [32]:
alpha = 0.8
beta = 0.8
ones = np.eye(n_items)

In [44]:
def random_walk_predict(ratings, item_similarity):
    transition_matrix = np.zeros((n_items, n_items))

    for i in range(n_items):
        sum_ = item_similarity[i].sum()
        for j in range(n_items):
            transition_matrix[i][j] = beta*item_similarity[i][j]/sum_+(1-beta)/n_items

    p_tilde = np.linalg.pinv(ones-alpha*transition_matrix)
    final_rating = alpha*np.dot(ratings, np.dot(transition_matrix, p_tilde))
    print(transition_matrix)
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaled_prediction = (scaler.fit_transform(final_rating.T)*5).T
    return scaled_prediction

NameError: name 'transition_matrix' is not defined

In [34]:
user_based_prediction = cf_predict(user_item_matrix, user_similarity, type='user')
item_based_prediction = cf_predict(user_item_matrix, item_similarity, type='item')

In [46]:
randomwork_based_prediction = random_walk_predict(user_item_matrix, item_similarity)

[[3.75471545e-03 1.18899612e-03 1.03687883e-03 ... 1.18906064e-04
  1.18906064e-04 1.18906064e-04]
 [1.24034577e-03 3.92918443e-03 8.37370468e-04 ... 1.18906064e-04
  4.65613525e-04 1.18906064e-04]
 [1.44734878e-03 1.11102261e-03 5.38046139e-03 ... 1.18906064e-04
  1.18906064e-04 1.18906064e-04]
 ...
 [1.18906064e-04 1.18906064e-04 1.18906064e-04 ... 4.88769708e-02
  1.18906064e-04 1.18906064e-04]
 [1.18906064e-04 2.61679406e-03 1.18906064e-04 ... 1.18906064e-04
  2.75704313e-02 1.18906064e-04]
 [1.18906064e-04 1.18906064e-04 1.18906064e-04 ... 1.18906064e-04
  1.18906064e-04 8.00118906e-01]]


In [36]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, test_data_matrix):
    prediction = prediction[test_data_matrix.nonzero()].flatten()
    truth = test_data_matrix[test_data_matrix.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, truth))

In [45]:
print('rmse for user based collaborative: {}'.format(rmse(user_based_prediction, test_data_matrix)))
print('rmse for item based collaborative: {}'.format(rmse(item_based_prediction, test_data_matrix)))
print('rmse for random walk: {}'.format(rmse(randomwork_based_prediction, test_data_matrix)))

rmse for user based collaborative: 2.962115380864079
rmse for item based collaborative: 3.166875880606639
rmse for random walk: 1.3127336408309747
