In [8]:
# import libraries
import numpy as np
import pandas as pd

from surprise import KNNBasic, KNNWithMeans
from surprise import SVD, NMF
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split, KFold

In [2]:
# create example data for collaborative filtering
UserItemMatrix = np.array([np.array([5, np.nan, 4, np.nan, 1, np.nan, 3]),
                           np.array([4, 4, 4, np.nan, np.nan, np.nan, 1]),
                           np.array([5, 4, np.nan, 1, 2, np.nan, 3]),
                           np.array([1, 2, 1, 4, 3, 5, 2]),
                           np.array([np.nan, 1, np.nan, 3, 5, 5, np.nan]),
                           np.array([np.nan, 2, np.nan, np.nan, 4, 4, 2]),
                           np.array([5, np.nan, np.nan, 1, np.nan, np.nan, 2])
                          ])

In [3]:
UserItemMatrix

array([[ 5., nan,  4., nan,  1., nan,  3.],
       [ 4.,  4.,  4., nan, nan, nan,  1.],
       [ 5.,  4., nan,  1.,  2., nan,  3.],
       [ 1.,  2.,  1.,  4.,  3.,  5.,  2.],
       [nan,  1., nan,  3.,  5.,  5., nan],
       [nan,  2., nan, nan,  4.,  4.,  2.],
       [ 5., nan, nan,  1., nan, nan,  2.]])

In [4]:
# explore User-Item matrix
df = pd.DataFrame(UserItemMatrix, 
                  columns=['item1', 'item2', 'item3', 'item4', 'item5', 'item6', 'item7'])
df['user_id'] = list(df.index)
df

Unnamed: 0,item1,item2,item3,item4,item5,item6,item7,user_id
0,5.0,,4.0,,1.0,,3.0,0
1,4.0,4.0,4.0,,,,1.0,1
2,5.0,4.0,,1.0,2.0,,3.0,2
3,1.0,2.0,1.0,4.0,3.0,5.0,2.0,3
4,,1.0,,3.0,5.0,5.0,,4
5,,2.0,,,4.0,4.0,2.0,5
6,5.0,,,1.0,,,2.0,6


In [5]:
# transform data into appropriate form for library
df = df.melt(id_vars='user_id', value_name='rating')
df.dropna(inplace=True)
df.variable = df.variable.str.replace('item', '')
df

Unnamed: 0,user_id,variable,rating
0,0,1,5.0
1,1,1,4.0
2,2,1,5.0
3,3,1,1.0
6,6,1,5.0
8,1,2,4.0
9,2,2,4.0
10,3,2,2.0
11,4,2,1.0
12,5,2,2.0


In [6]:
# transform data into appropriate form for library (cont'd)
ratings_dict = {'itemID': df['variable'].astype(np.int),
               'userID': df['user_id'].astype(np.int),
               'rating': df['rating'].astype(np.float)}
df = pd.DataFrame(ratings_dict)

reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

In [11]:
# learn KNN-based algorithm model (user-based similarities)
sim_options = {'name': 'pearson'}
# other similarity option: 'msd', 'cosine'

algo = KNNBasic(sim_options=sim_options)

In [15]:
# KFold evaluation
n_splits=5
cv = KFold(n_splits)
acc = np.zeros(shape=(n_splits,))
fcp = np.zeros(shape=(n_splits,))
for i, (trainset, testset) in enumerate(cv.split(data)):
    algo.fit(train_set)
    
    predictions = algo.test(testset)
    
    acc[i] = accuracy.rmse(predictions)
    fcp[i] = accuracy.fcp(predictions)

print('Average performance')
print(np.mean(acc))
print(np.mean(fcp))

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.2548
FCP:  1.0000
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.3448
FCP:  1.0000
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.6016
FCP:  1.0000
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.4660
FCP:  1.0000
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.5992
FCP:  1.0000
Average performance
0.4532730353942995
1.0


In [19]:
# cross_validate
from surprise.model_selection import cross_validate

result = cross_validate(algo, data)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [18]:
result

{'test_rmse': array([1.59410496, 1.22898332, 1.22507143, 1.29223321, 1.43517711]),
 'test_mae': array([1.41666667, 1.05333333, 0.84      , 1.01333333, 1.36      ]),
 'fit_time': (0.0, 0.000997304916381836, 0.0, 0.0, 0.0),
 'test_time': (0.0, 0.0, 0.0, 0.000972747802734375, 0.0)}

# Practice

### Read Joke dataset (pd.read_csv('UserRatings1.csv'))

### Compare KNNWithMeans with pearson, SVD with 50 factors
##### metric = 'rmse'