In [1]:
!pip install scikit-surprise
# runtime reset required!

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [24]:
from surprise import SVD, Dataset, accuracy
from surprise.model_selection import train_test_split
import pandas as pd
import numpy as np

RANDOM_STATE = 918

## Builtin Dataset (MovieLens)

In [3]:
data = Dataset.load_builtin('ml-100k'); 


Dataset ml-100k could not be found. Do you want to download it? [Y/n] y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k


In [6]:
trainset, testset = train_test_split(data, test_size = 0.2, random_state = 918)
print(trainset)


<surprise.trainset.Trainset object at 0x7f897ce3b410>


In [8]:
df = pd.DataFrame(data.__dict__['raw_ratings'], columns=['user_id','item_id','rating','timestamp'])
df

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596
...,...,...,...,...
99995,880,476,3.0,880175444
99996,716,204,5.0,879795543
99997,276,1090,1.0,874795795
99998,13,225,2.0,882399156


In [9]:
svd = SVD(random_state = 918)
svd.fit(trainset)

preds = svd.test(testset)
print('prediction type : ', type(preds), 'size : ', len(preds))
preds[:5]

prediction type :  <class 'list'> size :  20000


[Prediction(uid='847', iid='180', r_ui=2.0, est=3.0587762422369478, details={'was_impossible': False}),
 Prediction(uid='64', iid='101', r_ui=2.0, est=3.22391747590186, details={'was_impossible': False}),
 Prediction(uid='750', iid='294', r_ui=4.0, est=2.8050087796584218, details={'was_impossible': False}),
 Prediction(uid='264', iid='792', r_ui=5.0, est=4.212364630147739, details={'was_impossible': False}),
 Prediction(uid='875', iid='527', r_ui=4.0, est=4.411060278028524, details={'was_impossible': False})]

In [11]:
[ (pred.uid, pred.iid, pred.est) for pred in preds[:5]]

[('847', '180', 3.0587762422369478),
 ('64', '101', 3.22391747590186),
 ('750', '294', 2.8050087796584218),
 ('264', '792', 4.212364630147739),
 ('875', '527', 4.411060278028524)]

In [13]:
uid = '196' # must be str
iid = '302'
pred = svd.predict(uid, iid)
print(pred)

user: 196        item: 302        r_ui = None   est = 4.10   {'was_impossible': False}


In [14]:
accuracy.rmse(preds)

RMSE: 0.9350


0.9349545229437336

## General DataSet     

https://surprise.readthedocs.io/en/stable/dataset.html

$$ Dataset.load\_from\_file(file\_path, reader)  $$
$$ Dataset.load\_from\_df(df, reader)   $$
( reader -> file format )

+ [uid, iid, rating] format only (surplus cut)




In [19]:
import os
import pandas as pd
import numpy as np
from math import sqrt
from tqdm import tqdm_notebook as tqdm

from surprise import Reader, Datase 

In [28]:
PATH = '/content/drive/MyDrive/study/Recsys/data/movielens'

ratings_df = pd.read_csv(os.path.join(PATH, 'ratings.csv'), encoding='utf-8')
movies_df = pd.read_csv(os.path.join(PATH, 'movies.csv'), encoding='utf-8')


In [22]:
reader = Reader(rating_scale = (0.5, 5.0))
ratings = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)
tempdf = pd.DataFrame(ratings.__dict__['raw_ratings'], columns=['user_id','item_id','rating', 'timestamp'])

print(ratings_df.shape)
print(ratings_df.head())

print( "\n\n After transformed \n\n")

print(ratings)
print(tempdf)

(100836, 4)
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


 After transformed 


<surprise.dataset.DatasetAutoFolds object at 0x7f8975db9c10>
        user_id  item_id  rating timestamp
0             1        1     4.0      None
1             1        3     4.0      None
2             1        6     4.0      None
3             1       47     5.0      None
4             1       50     5.0      None
...         ...      ...     ...       ...
100831      610   166534     4.0      None
100832      610   168248     5.0      None
100833      610   168250     5.0      None
100834      610   168252     5.0      None
100835      610   170875     3.0      None

[100836 rows x 4 columns]


In [25]:
trainset, testset = train_test_split(ratings, test_size = 0.2, random_state = RANDOM_STATE)

svd = SVD(n_factors = 50, random_state= RANDOM_STATE)
svd.fit(trainset)
preds = svd.test(testset)
accuracy.rmse(preds)


RMSE: 0.8621


0.8621409576630282

In [26]:
from surprise.model_selection import cross_validate

cross_validate(svd, ratings, measures = ['RMSE', 'MAE'], cv = 5, verbose = True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8702  0.8662  0.8715  0.8815  0.8685  0.8716  0.0053  
MAE (testset)     0.6683  0.6643  0.6708  0.6751  0.6680  0.6693  0.0036  
Fit time          3.46    3.43    3.48    3.52    3.54    3.48    0.04    
Test time         0.29    0.19    0.32    0.19    0.36    0.27    0.07    


{'test_rmse': array([0.87017819, 0.86623389, 0.87150811, 0.88146792, 0.86845929]),
 'test_mae': array([0.66826464, 0.6643076 , 0.6707677 , 0.67506718, 0.66801947]),
 'fit_time': (3.457099676132202,
  3.4284398555755615,
  3.4756920337677,
  3.518197536468506,
  3.541332483291626),
 'test_time': (0.28554630279541016,
  0.18962430953979492,
  0.32198071479797363,
  0.1920156478881836,
  0.355499267578125)}

In [40]:
from surprise.dataset import DatasetAutoFolds

reader = Reader(line_format = 'user item rating timestamp', sep = ',', rating_scale = (0.5, 5), skip_lines=1)
data_folds = DatasetAutoFolds(ratings_file = os.path.join(PATH, 'ratings.csv'), reader = reader )

trainset = data_folds.build_full_trainset()

In [43]:
svd = SVD(n_epochs = 20, n_factors = 50, random_state = RANDOM_STATE)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f897655c3d0>

In [41]:
def get_unseen(ratings, movies, userId):
    seen_movies = ratings[ratings['userId'] == userId]['movieId'].tolist()

    total_movies = movies['movieId'].tolist()

    unseen_movies = [movie for movie in total_movies if movie not in seen_movies]

    return unseen_movies

def recommend_movie(model, userId, unseen_movies, top_n = 10):

    preds = [model.predict(str(userId), str(movieId)) for movieId in unseen_movies]

    # preds = [Prediction(uid = '9', iid='1', est=0.39), .... ]
    # sort by est
    comp = lambda x : x.est
    preds.sort(key = comp, reverse = True)

    top_preds = preds[:top_n]

    top_movie_ids = [ int(pred.iid) for pred in top_preds]
    top_movie_rating = [ pred.est for pred in top_preds]
    top_movie_titles = movies_df[movies_df.movieId.isin(top_movie_ids)]['title']
    top_movie_preds = [(id, title, rating) for id, title, rating in \
                       zip(top_movie_ids, top_movie_titles, top_movie_rating)]

    return top_movie_preds

In [48]:
target_userId = 9

unseen_movies = get_unseen(ratings_df, movies_df, target_userId)
top_movie_preds = recommend_movie(svd, 9, unseen_movies)

print( '------------------ TOP 10 RECOMMEND LIST -------------------')
for top_movie in top_movie_preds:
    print(f'{top_movie[1]:>80} : {top_movie[2]:<30}')




------------------ TOP 10 RECOMMEND LIST -------------------
                                                Shawshank Redemption, The (1994) : 4.383037523491732             
     Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964) : 4.195005938641898             
                                     Wallace & Gromit: The Wrong Trousers (1993) : 4.175809995047691             
                                          One Flew Over the Cuckoo's Nest (1975) : 4.16040146282391              
                                                       Lawrence of Arabia (1962) : 4.159456340057369             
                                                               Goodfellas (1990) : 4.15428013197644              
                                                           Cool Hand Luke (1967) : 4.142171490851776             
                                                         Double Indemnity (1944) : 4.1311515835432395            
                           