In [1]:
# import libraries
import numpy as np
import pandas as pd

from surprise import KNNBasic, KNNWithMeans
from surprise import SVD, NMF
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split

ModuleNotFoundError: No module named 'surprise'

In [2]:
# Example ratings for collaborative filtering.
# Each row is one user, each column one item; np.nan marks "not rated".
UserItemMatrix = np.array([
    [5,      np.nan, 4,      np.nan, 1,      np.nan, 3],
    [4,      4,      4,      np.nan, np.nan, np.nan, 1],
    [5,      4,      np.nan, 1,      2,      np.nan, 3],
    [1,      2,      1,      4,      3,      5,      2],
    [np.nan, 1,      np.nan, 3,      5,      5,      np.nan],
    [np.nan, 2,      np.nan, np.nan, 4,      4,      2],
    [5,      np.nan, np.nan, 1,      np.nan, np.nan, 2],
])

In [3]:
# Show the raw matrix; nan marks a (user, item) pair that has no rating.
UserItemMatrix

array([[ 5., nan,  4., nan,  1., nan,  3.],
       [ 4.,  4.,  4., nan, nan, nan,  1.],
       [ 5.,  4., nan,  1.,  2., nan,  3.],
       [ 1.,  2.,  1.,  4.,  3.,  5.,  2.],
       [nan,  1., nan,  3.,  5.,  5., nan],
       [nan,  2., nan, nan,  4.,  4.,  2.],
       [ 5., nan, nan,  1., nan, nan,  2.]])

In [7]:
# Explore the user-item matrix as a labelled table: one column per item,
# plus a user_id column copied from the row index.
df = (
    pd.DataFrame(
        UserItemMatrix,
        columns=['item1', 'item2', 'item3', 'item4', 'item5', 'item6', 'item7'],
    )
    .assign(user_id=lambda frame: list(frame.index))
)
df

Unnamed: 0,item1,item2,item3,item4,item5,item6,item7,user_id
0,5.0,,4.0,,1.0,,3.0,0
1,4.0,4.0,4.0,,,,1.0,1
2,5.0,4.0,,1.0,2.0,,3.0,2
3,1.0,2.0,1.0,4.0,3.0,5.0,2.0,3
4,,1.0,,3.0,5.0,5.0,,4
5,,2.0,,,4.0,4.0,2.0,5
6,5.0,,,1.0,,,2.0,6


In [8]:
# Reshape into the long format the library expects: one row per observed
# rating with columns (user_id, variable, rating). Unrated pairs are dropped
# and `variable` is reduced from 'itemN' to the string 'N'.
df = (
    df.melt(id_vars='user_id', value_name='rating')
      .dropna()
      .assign(variable=lambda long_df: long_df['variable'].str.replace('item', ''))
)
df

Unnamed: 0,user_id,variable,rating
0,0,1,5.0
1,1,1,4.0
2,2,1,5.0
3,3,1,1.0
6,6,1,5.0
8,1,2,4.0
9,2,2,4.0
10,3,2,2.0
11,4,2,1.0
12,5,2,2.0


In [9]:
# transform data into appropriate form for library (cont'd)
# Cast with the builtin int/float types: the np.int / np.float aliases were
# deprecated in NumPy 1.20 and removed in 1.24, where they raise AttributeError.
# `variable` holds the item number as a string at this point, so astype(int)
# converts it to a numeric item id.
ratings_dict = {'itemID': df['variable'].astype(int),
                'userID': df['user_id'].astype(int),
                'rating': df['rating'].astype(float)}
df = pd.DataFrame(ratings_dict)

# Ratings in this example lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))

# Columns are passed in the order user, item, rating.
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

In [16]:
# build trainset
# build_full_trainset() is called on the whole dataset; no ratings are split
# off for a hold-out set in this cell.
train_set = data.build_full_trainset()

In [17]:
# learn KNN-based algorithm model (user-based similarities)
# Only the similarity measure is configured here; no 'user_based' key is given,
# so user-based vs item-based is left at the library default.
sim_options = {'name': 'pearson'}
# other similarity option: 'msd', 'cosine'

algo = KNNBasic(sim_options=sim_options)

# fit() is the last expression of the cell, so the fitted estimator is displayed.
algo.fit(train_set)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x22fa12b1610>

In [11]:
# Predict ratings for items that user 6 has not rated, using the fitted model.
target_u = [6]
target_i = [2, 3, 5, 6]

# One prediction per (user, item) pair, iterating users first, then items.
predictions = [algo.predict(user, item) for user in target_u for item in target_i]
predictions

[Prediction(uid=6, iid=2, r_ui=None, est=3.5391656658658657, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid=6, iid=3, r_ui=None, est=3.4166666666666665, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid=6, iid=5, r_ui=None, est=1.0291616626626618, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid=6, iid=6, r_ui=None, est=2.6666666666666665, details={'actual_k': 0, 'was_impossible': False})]

In [18]:
# build testset
# The testset is derived from train_set itself, so it contains the same
# (user, item, rating) triples the model was trained on.
test_set = train_set.build_testset()
test_set

[(0, 1, 5.0),
 (0, 3, 4.0),
 (0, 5, 1.0),
 (0, 7, 3.0),
 (1, 1, 4.0),
 (1, 2, 4.0),
 (1, 3, 4.0),
 (1, 7, 1.0),
 (2, 1, 5.0),
 (2, 2, 4.0),
 (2, 4, 1.0),
 (2, 5, 2.0),
 (2, 7, 3.0),
 (3, 1, 1.0),
 (3, 2, 2.0),
 (3, 3, 1.0),
 (3, 4, 4.0),
 (3, 5, 3.0),
 (3, 6, 5.0),
 (3, 7, 2.0),
 (6, 1, 5.0),
 (6, 4, 1.0),
 (6, 7, 2.0),
 (4, 2, 1.0),
 (4, 4, 3.0),
 (4, 5, 5.0),
 (4, 6, 5.0),
 (5, 2, 2.0),
 (5, 5, 4.0),
 (5, 6, 4.0),
 (5, 7, 2.0)]

In [20]:
# build anti-testset: the (user, item) pairs that have NO rating in train_set
# The third field is a fill value (3.0 in the output below), not an observed rating.
anti_test_set = train_set.build_anti_testset()
anti_test_set

[(0, 2, 3.0),
 (0, 4, 3.0),
 (0, 6, 3.0),
 (1, 4, 3.0),
 (1, 5, 3.0),
 (1, 6, 3.0),
 (2, 3, 3.0),
 (2, 6, 3.0),
 (6, 2, 3.0),
 (6, 3, 3.0),
 (6, 5, 3.0),
 (6, 6, 3.0),
 (4, 1, 3.0),
 (4, 3, 3.0),
 (4, 7, 3.0),
 (5, 1, 3.0),
 (5, 3, 3.0),
 (5, 4, 3.0)]

In [25]:
# Predict every triple in test_set. Because test_set was built from train_set,
# r_ui vs est here shows fit on already-seen ratings, not held-out accuracy.
algo.test(test_set)

[Prediction(uid=0, iid=1, r_ui=5.0, est=4.77494176878252, details={'actual_k': 4, 'was_impossible': False}),
 Prediction(uid=0, iid=3, r_ui=4.0, est=4.0, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid=0, iid=5, r_ui=1.0, est=1.49545416973504, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid=0, iid=7, r_ui=3.0, est=2.2900086768115364, details={'actual_k': 4, 'was_impossible': False}),
 Prediction(uid=1, iid=1, r_ui=4.0, est=4.732050807568878, details={'actual_k': 4, 'was_impossible': False}),
 Prediction(uid=1, iid=2, r_ui=4.0, est=4.0, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid=1, iid=3, r_ui=4.0, est=4.0, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid=1, iid=7, r_ui=1.0, est=2.196152422706632, details={'actual_k': 4, 'was_impossible': False}),
 Prediction(uid=2, iid=1, r_ui=5.0, est=4.772623627851539, details={'actual_k': 4, 'was_impossible': False}),
 Prediction(uid=2, iid=2, r_ui=4.0, est=4.0, details=

In [26]:
# Predict the unrated (user, item) pairs. r_ui in these results is the
# anti-testset fill value, so only `est` is meaningful.
algo.test(anti_test_set)

[Prediction(uid=0, iid=2, r_ui=3.0, est=4.0, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid=0, iid=4, r_ui=3.0, est=1, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid=0, iid=6, r_ui=3.0, est=3.0, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=1, iid=4, r_ui=3.0, est=1, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid=1, iid=5, r_ui=3.0, est=1.5000000000000002, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid=1, iid=6, r_ui=3.0, est=3.0, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=2, iid=3, r_ui=3.0, est=4.0, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid=2, iid=6, r_ui=3.0, est=3.0, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=6, iid=2, r_ui=3.0, est=4.0, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid=6, iid=3, r_ui=3.0, est=4.0, details={'actual_k': 2,

In [28]:
# Second KNN variant, configured with the same Pearson similarity.
# Only instantiated here; it still needs fit(train_set) before predicting.
algo2 = KNNWithMeans(sim_options={'name': 'pearson'})

In [29]:
# Matrix-factorisation model with 3 latent factors.
# SVD starts from randomly initialised factors; fixing random_state makes the
# fitted model (and its predictions) repeatable across runs.
# Only instantiated here; it still needs fit(train_set) before predicting.
algo3 = SVD(n_factors=3, random_state=0)

In [32]:
# Non-negative matrix factorisation model.
# NMF starts from randomly initialised factors; fixing random_state makes the
# fitted model (and its predictions) repeatable across runs.
# NOTE(review): n_factors=100 is far larger than the 7x7 example matrix above;
# confirm this is intended (e.g. for the larger practice dataset).
algo4 = NMF(n_factors=100, random_state=0)

# Practice

### Open 'movielens-1m.dat'. Refer to the file 'read_dataset2(ref)' to open this file.

### Practice the data processing steps from this file on that dataset, then try the 4 different algorithms (KNNBasic, KNNWithMeans, SVD, NMF).