In [1]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3162988 sha256=2884f55e42124d967bc6ecf79db3f6aa343222548d65ea8747435fe2e94b4863
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [3]:
import pandas as pd
import numpy as np
import surprise

In [4]:
ratings = pd.read_csv("ratings.txt",sep=' ',names = ['uid','iid','rating'])
ratings.head()

Unnamed: 0,uid,iid,rating
0,1,1,2.0
1,1,2,4.0
2,1,3,3.5
3,1,4,3.0
4,1,5,4.0


In [5]:
lowest_rating = ratings['rating'].min()
highest_rating = ratings['rating'].max()
print("Ratings range between {0} and {1}".format(lowest_rating,highest_rating))
reader = surprise.Reader(rating_scale = (lowest_rating,highest_rating))

Ratings range between 0.5 and 4.0


Converting the data into **surprise** format:

In [6]:
data = surprise.Dataset.load_from_df(ratings,reader)
type(data)

**Similarity options:**

In case of Item based filtering, `user_based` value will be `False`

In [7]:
similarity_options = {'name': 'cosine', 'user_based': True}

Default K = 40

In [8]:
algo = surprise.KNNBasic(sim_options = similarity_options)
output = algo.fit(data.build_full_trainset())

Computing the cosine similarity matrix...
Done computing similarity matrix.


The above `.fit()` calculates expected rating for all the users

Say we want expected rating of user=50 for item 217

In [9]:
pred = algo.predict(uid='50',iid='217')
pred

Prediction(uid='50', iid='217', r_ui=None, est=3.0028030537791928, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})

In [10]:
score = pred.est
print(score)


3.0028030537791928


All the users

In [11]:
iids = ratings['iid'].unique()
iids

array([   1,    2,    3, ..., 2069, 2070, 2071])

In [12]:
len(iids)

2071

Consider ratings given by user=50

In [13]:
user_no=50

In [14]:
rec_u = ratings[ratings['uid'] == user_no ]
iids_u = rec_u['iid']
print("List of iid that uid={0} has rated:".format(user_no))
print(iids_u.values)

List of iid that uid=50 has rated:
[  8 211   3   2 219 234  12 254 250 207  11 253 236  84  10   7 233  13
   1   5   6 252 241 216 257 206   4 217   9 215 213  17 255 220 121 245
 239 251 235]


In [17]:
iids_to_predict = np.setdiff1d(iids,iids_u)
print("List of iid which uid={0} did not rate(in all {1}) :".format(user_no,len(iids_to_predict)))
print(iids_to_predict)

List of iid which uid=50 did not rate(in all 2032) :
[  14   15   16 ... 2069 2070 2071]


In [16]:
len(iids_to_predict)

2032

Create a testset for getting the expected rating

In [18]:
testset = [[user_no,iid,0.] for iid in iids_to_predict]
testset

[[50, 14, 0.0],
 [50, 15, 0.0],
 [50, 16, 0.0],
 [50, 18, 0.0],
 [50, 19, 0.0],
 [50, 20, 0.0],
 [50, 21, 0.0],
 [50, 22, 0.0],
 [50, 23, 0.0],
 [50, 24, 0.0],
 [50, 25, 0.0],
 [50, 26, 0.0],
 [50, 27, 0.0],
 [50, 28, 0.0],
 [50, 29, 0.0],
 [50, 30, 0.0],
 [50, 31, 0.0],
 [50, 32, 0.0],
 [50, 33, 0.0],
 [50, 34, 0.0],
 [50, 35, 0.0],
 [50, 36, 0.0],
 [50, 37, 0.0],
 [50, 38, 0.0],
 [50, 39, 0.0],
 [50, 40, 0.0],
 [50, 41, 0.0],
 [50, 42, 0.0],
 [50, 43, 0.0],
 [50, 44, 0.0],
 [50, 45, 0.0],
 [50, 46, 0.0],
 [50, 47, 0.0],
 [50, 48, 0.0],
 [50, 49, 0.0],
 [50, 50, 0.0],
 [50, 51, 0.0],
 [50, 52, 0.0],
 [50, 53, 0.0],
 [50, 54, 0.0],
 [50, 55, 0.0],
 [50, 56, 0.0],
 [50, 57, 0.0],
 [50, 58, 0.0],
 [50, 59, 0.0],
 [50, 60, 0.0],
 [50, 61, 0.0],
 [50, 62, 0.0],
 [50, 63, 0.0],
 [50, 64, 0.0],
 [50, 65, 0.0],
 [50, 66, 0.0],
 [50, 67, 0.0],
 [50, 68, 0.0],
 [50, 69, 0.0],
 [50, 70, 0.0],
 [50, 71, 0.0],
 [50, 72, 0.0],
 [50, 73, 0.0],
 [50, 74, 0.0],
 [50, 75, 0.0],
 [50, 76, 0.0],
 [50, 77

Generate predictions on testset

In [20]:
predictions = algo.test(testset)
predictions[1000]

Prediction(uid=50, iid=1040, r_ui=0.0, est=3.5, details={'actual_k': 2, 'was_impossible': False})

In [21]:
type(predictions)

list

In [25]:
pred_ratings = np.array([pred.est for pred in predictions])
pred_ratings

array([1.02491128, 2.3010819 , 3.36565625, ..., 2.5       , 3.        ,
       3.        ])

In [26]:
iids_to_predict

array([  14,   15,   16, ..., 2069, 2070, 2071])

In [27]:
df_exp_ratings = pd.DataFrame({'iid':iids_to_predict, 'exp_rating':pred_ratings})
df_exp_ratings.head()

Unnamed: 0,iid,exp_rating
0,14,1.024911
1,15,2.301082
2,16,3.365656
3,18,3.475089
4,19,2.950177


In [30]:
df_exp_ratings.sort_values('exp_rating', ascending=False)

Unnamed: 0,iid,exp_rating
562,602,4.0
1750,1790,4.0
563,603,4.0
561,601,4.0
1753,1793,4.0
...,...,...
1480,1520,0.5
1487,1527,0.5
895,935,0.5
1514,1554,0.5


Top 10 Items' Expected ratings

In [31]:
df_exp_ratings.sort_values('exp_rating', ascending=False).head(10)

Unnamed: 0,iid,exp_rating
562,602,4.0
1750,1790,4.0
563,603,4.0
561,601,4.0
1753,1793,4.0
1756,1796,4.0
540,580,4.0
533,573,4.0
531,571,4.0
1767,1807,4.0


Tuning with different K

In [32]:
np.arange(30,110,20)

array([30, 50, 70, 90])

In [33]:
from surprise.model_selection import GridSearchCV
from surprise.model_selection.split import KFold

param_grid = {'k': np.arange(30,110,20)}
kfold = KFold(n_splits=5, random_state=24, shuffle=True)
gs = GridSearchCV(surprise.KNNBasic, param_grid, measures=['rmse', 'mae'], cv=kfold)

Running the Grid Search CV

In [None]:
gs.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


**Best Score**

In [None]:
print(gs.best_score['rmse'])

**Best Param**

In [None]:
print(gs.best_params['rmse'])