In [None]:
# Reference https://www.jiristodulka.com/post/recsys_cf/

In [1]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 4.3 MB/s eta 0:00:01
[?25hCollecting joblib>=0.11
  Downloading joblib-1.1.0-py2.py3-none-any.whl (306 kB)
[K     |████████████████████████████████| 306 kB 54.8 MB/s eta 0:00:01
[?25hCollecting numpy>=1.11.2
  Using cached numpy-1.22.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
Collecting scipy>=1.0.0
  Downloading scipy-1.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (41.6 MB)
[K     |████████████████████████████████| 41.6 MB 64 kB/s s eta 0:00:01     |█████████████████▎              | 22.5 MB 55.7 MB/s eta 0:00:01
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25ldone
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp38-cp38-linux_x86_64.whl size=2324463 sha256=61a08fea7fd703c70209d8773b111fa62ebe302d803a641bbd0d4cd01112e891

In [24]:
from collections import defaultdict
from surprise import SVD, NMF, Dataset, Reader
from surprise.model_selection import GridSearchCV
import pandas as pd
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
import pickle

In [4]:
r = urlopen("http://files.grouplens.org/datasets/movielens/ml-latest-small.zip")
zipfile = ZipFile(BytesIO(r.read()))

#print the content of zipfile
zipfile.namelist()

# tidy df ratings (movieId,)
ratings_df = pd.read_csv(zipfile.open('ml-latest-small/ratings.csv'))
print('Columns of ratings_df: {0}'.format(ratings_df.columns))

#movies df (tidy data)
movies_df = pd.read_csv(zipfile.open('ml-latest-small/movies.csv'))
print('Columns of movies_df: {0}'.format(movies_df.columns))

Columns of ratings_df: Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')
Columns of movies_df: Index(['movieId', 'title', 'genres'], dtype='object')


In [5]:
#ratings
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
print(ratings_df.info())
print(ratings_df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None
              userId        movieId         rating     timestamp
count  100836.000000  100836.000000  100836.000000  1.008360e+05
mean      326.127564   19435.295718       3.501557  1.205946e+09
std       182.618491   35530.987199       1.042529  2.162610e+08
min         1.000000       1.000000       0.500000  8.281246e+08
25%       177.000000    1199.000000       3.000000  1.019124e+09
50%       325.000000    2991.000000       3.500000  1.186087e+09
75%       477.000000    8122.000000       4.000000  1.435994e+09
max       610.000000  193609.000000       5.000000  1.537799e+09


In [7]:
print("Count of distinct users: ", len(pd.unique(ratings_df["userId"])))
print("Count of distinct movies: ", len(pd.unique(ratings_df["movieId"])))
print("Count of ratings: ", len(ratings_df))

Count of distinct users:  610
Count of distinct movies:  9724
Count of ratings:  100836


In [8]:
#movies
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
# thresholds for removing noise and outliers
min_movie_ratings = 2 # a movie was rated at least this high
min_user_ratings =  5 # a user rated at least this many movies


ratings_flrd_df = ratings_df.groupby("movieId").filter(lambda x: x['movieId'].count() >= min_movie_ratings)
ratings_flrd_df = ratings_flrd_df.groupby("userId").filter(lambda x: x['userId'].count() >= min_user_ratings)



"{0} movies deleted; all movies are now rated at least: {1} times. Old dimensions: {2}; New dimensions: {3}"\
.format(len(ratings_df.movieId.value_counts()) - len(ratings_flrd_df.movieId.value_counts())\
        ,min_movie_ratings,ratings_df.shape, ratings_flrd_df.shape )

'3446 movies deleted; all movies are now rated at least: 2 times. Old dimensions: (100836, 4); New dimensions: (97390, 4)'

In [10]:
print("Count of distinct users: ", len(pd.unique(ratings_flrd_df["userId"])))
print("Count of distinct movies: ", len(pd.unique(ratings_flrd_df["movieId"])))
print("Count of ratings: ", len(ratings_flrd_df))

Count of distinct users:  610
Count of distinct movies:  6278
Count of ratings:  97390


In [11]:
reader = Reader(rating_scale=(0.5, 5)) # line_format by default order of the fields
data = Dataset.load_from_df(ratings_flrd_df[["userId",	"movieId",	"rating"]], reader=reader)

trainset = data.build_full_trainset()

testset = trainset.build_anti_testset()

In [12]:
# Grid Search Cross validation to find the best number of latent factors (n_factors) 
param_grid = {'n_factors': [4,6,9,11,14,18,29]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5)
gs.fit(data)


# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

0.8630628191579026
{'n_factors': 14}


In [15]:
algo_SVD = SVD(n_factors = 11)
algo_SVD.fit(trainset)

# Predict ratings for all pairs (i,j) that are NOT in the training set.
testset = trainset.build_anti_testset()

predictions = algo_SVD.test(testset)

# subset of the list  predictions
predictions[0:2]

[Prediction(uid=1, iid=318, r_ui=3.5110432282575212, est=5, details={'was_impossible': False}),
 Prediction(uid=1, iid=1704, r_ui=3.5110432282575212, est=4.868860289986284, details={'was_impossible': False})]

In [22]:
def get_top_n(predictions, userId, movies_df, ratings_df, n = 10):
    # Return the recommended top N movieId for a user

    #Part I.: Surprise docomuntation
    
    #1. First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    #2. Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key = lambda x: x[1], reverse = True)
        top_n[uid] = user_ratings[:n]

    return top_n 

In [23]:
top_n = get_top_n(predictions, movies_df = movies_df, userId = 124, ratings_df = ratings_df)

In [18]:
len(top_n)

610

In [21]:
top_n[610]

[(1204, 4.588546546867592),
 (2324, 4.521837893492778),
 (3275, 4.499612319878602),
 (2019, 4.474092247962394),
 (1266, 4.469898965526995),
 (898, 4.464954843739895),
 (1223, 4.458690280592507),
 (1233, 4.453402506050611),
 (1217, 4.446188045971707),
 (1104, 4.445237616576371)]

In [39]:
def extract_iid(iids_ests):
    return [str(iid) for iid, est in iids_ests]

In [40]:
top_n_movie_ids = {str(k): extract_iid(v) for k, v in top_n.items()}

In [41]:
with open("top_n_movie_ids.pkl", "wb") as fp:
    pickle.dump(top_n_movie_ids, fp)