In [1]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/772.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/772.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━[0m [32m522.2/772.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3162999 sha256=95530d99ca338f60ee30bbd44a427de17e25230d053d30f6e2a0a93307ab9bae
  Stored in directory: /root/.cache/pip

In [2]:
import pandas as pd
import numpy as np
import surprise

In [3]:
ratings = pd.read_csv("ratings.txt",sep=' ',names = ['uid','iid','rating'])
ratings.head()

Unnamed: 0,uid,iid,rating
0,1,1,2.0
1,1,2,4.0
2,1,3,3.5
3,1,4,3.0
4,1,5,4.0


In [4]:
lowest_rating = ratings['rating'].min()
highest_rating = ratings['rating'].max()
print("Ratings range between {0} and {1}".format(lowest_rating,highest_rating))
reader = surprise.Reader(rating_scale = (lowest_rating,highest_rating))

Ratings range between 0.5 and 4.0


Converting the data into **surprise** format:

In [5]:
data = surprise.Dataset.load_from_df(ratings,reader)
type(data)

Default Parameters

In [6]:
algo = surprise.SVD(random_state=24)
output = algo.fit(data.build_full_trainset())

The above `.fit()` calculates expected rating for all the users

Say we want expected rating of user=50 for item 217

In [7]:
pred = algo.predict(uid='50',iid='217')
score = pred.est
print(score)

3.0028030537791928


Consider ratings given by user=50

In [9]:
user_no=50

In [10]:
rec_u = ratings[ratings['uid'] == user_no ]
iids_u = rec_u['iid']
print("List of iid that uid={0} has rated:".format(user_no))
print(iids_u.values)

List of iid that uid=50 has rated:
[  8 211   3   2 219 234  12 254 250 207  11 253 236  84  10   7 233  13
   1   5   6 252 241 216 257 206   4 217   9 215 213  17 255 220 121 245
 239 251 235]


In [11]:
iids_to_predict = np.setdiff1d(iids,iids_u)
print("List of iid which uid={0} did not rate(in all {1}) :".format(user_no,len(iids_to_predict)))
print(iids_to_predict)

List of iid which uid=50 did not rate(in all 2032) :
[  14   15   16 ... 2069 2070 2071]


In [12]:
len(iids_to_predict)

2032

Create a testset for getting the expected rating

In [13]:
testset = [[user_no,iid,0.] for iid in iids_to_predict]
testset

[[50, 14, 0.0],
 [50, 15, 0.0],
 [50, 16, 0.0],
 [50, 18, 0.0],
 [50, 19, 0.0],
 [50, 20, 0.0],
 [50, 21, 0.0],
 [50, 22, 0.0],
 [50, 23, 0.0],
 [50, 24, 0.0],
 [50, 25, 0.0],
 [50, 26, 0.0],
 [50, 27, 0.0],
 [50, 28, 0.0],
 [50, 29, 0.0],
 [50, 30, 0.0],
 [50, 31, 0.0],
 [50, 32, 0.0],
 [50, 33, 0.0],
 [50, 34, 0.0],
 [50, 35, 0.0],
 [50, 36, 0.0],
 [50, 37, 0.0],
 [50, 38, 0.0],
 [50, 39, 0.0],
 [50, 40, 0.0],
 [50, 41, 0.0],
 [50, 42, 0.0],
 [50, 43, 0.0],
 [50, 44, 0.0],
 [50, 45, 0.0],
 [50, 46, 0.0],
 [50, 47, 0.0],
 [50, 48, 0.0],
 [50, 49, 0.0],
 [50, 50, 0.0],
 [50, 51, 0.0],
 [50, 52, 0.0],
 [50, 53, 0.0],
 [50, 54, 0.0],
 [50, 55, 0.0],
 [50, 56, 0.0],
 [50, 57, 0.0],
 [50, 58, 0.0],
 [50, 59, 0.0],
 [50, 60, 0.0],
 [50, 61, 0.0],
 [50, 62, 0.0],
 [50, 63, 0.0],
 [50, 64, 0.0],
 [50, 65, 0.0],
 [50, 66, 0.0],
 [50, 67, 0.0],
 [50, 68, 0.0],
 [50, 69, 0.0],
 [50, 70, 0.0],
 [50, 71, 0.0],
 [50, 72, 0.0],
 [50, 73, 0.0],
 [50, 74, 0.0],
 [50, 75, 0.0],
 [50, 76, 0.0],
 [50, 77

Generate predictions on testset

In [14]:
predictions = algo.test(testset)
predictions[1000]

Prediction(uid=50, iid=1040, r_ui=0.0, est=3.5474665031610937, details={'was_impossible': False})

All the users

In [21]:
iids = ratings['iid'].unique()
iids

array([   1,    2,    3, ..., 2069, 2070, 2071])

In [15]:
type(predictions)

list

In [16]:
pred_ratings = np.array([pred.est for pred in predictions])
pred_ratings

array([3.40477316, 3.34743409, 3.70305795, ..., 3.60518767, 3.52211979,
       3.50623449])

In [17]:
iids_to_predict

array([  14,   15,   16, ..., 2069, 2070, 2071])

In [18]:
df_exp_ratings = pd.DataFrame({'iid':iids_to_predict, 'exp_rating':pred_ratings})
df_exp_ratings.head()

Unnamed: 0,iid,exp_rating
0,14,3.404773
1,15,3.347434
2,16,3.703058
3,18,3.543939
4,19,3.405314


In [19]:
df_exp_ratings.sort_values('exp_rating', ascending=False)

Unnamed: 0,iid,exp_rating
295,335,4.000000
267,307,4.000000
618,658,4.000000
235,275,4.000000
246,286,4.000000
...,...,...
792,832,2.831209
660,700,2.785498
694,734,2.771489
413,453,2.765603


Top 10 Items' Expected ratings

In [None]:
df_exp_ratings.sort_values('exp_rating', ascending=False).head(10)

Tuning with different hyper-parameters

In [22]:
from surprise.model_selection import GridSearchCV
from surprise.model_selection.split import KFold

param_grid = {'n_epochs': np.arange(5,50,10),
              'lr_all':np.linspace(0.001,1,5),
              'reg_all': np.linspace(0.01,0.8,5),
              'n_factors':[50,100]}
kfold = KFold(n_splits=5, random_state=2022, shuffle=True)
gs = GridSearchCV(surprise.SVD, param_grid, measures=['rmse', 'mae'], n_jobs=-1,
                  cv=kfold, joblib_verbose=3)

Running the Grid Search CV

In [None]:
gs.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:   21.1s


**Best Score**

In [None]:
print(gs.best_score['rmse'])

**Best Param**

In [None]:
print(gs.best_params['rmse'])