In [1]:
!pip install scikit-surprise



In [2]:
import surprise
import pandas as pd
import numpy as np
import os

In [3]:
ratings =pd.read_csv('u.data',sep='\t',names = ['uid','iid','rating','timestamp'])
ratings.drop(columns=['timestamp'],axis=1,inplace = True)

In [4]:
ratings

Unnamed: 0,uid,iid,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
...,...,...,...
99995,880,476,3
99996,716,204,5
99997,276,1090,1
99998,13,225,2


In [5]:
lowest_rating = ratings['rating'].min()
highest_rating = ratings['rating'].max()
print("rating ranges between {0} and {1}" .format(lowest_rating, highest_rating))

rating ranges between 1 and 5


In [6]:
reader = surprise.Reader(rating_scale= (lowest_rating,highest_rating))
data = surprise.Dataset.load_from_df(ratings,reader)
type(data)

surprise.dataset.DatasetAutoFolds

In [7]:
similarity_options = {'name':'cosine','user_based': True}
#Default k=40
algo = surprise.KNNBasic(sim_options=similarity_options)
output = algo.fit(data.build_full_trainset())

Computing the cosine similarity matrix...
Done computing similarity matrix.


# Expected rating for user 50 for item 217

In [8]:
pred = algo.predict(uid='100', iid='217')
print(pred.est)

3.52986


In [9]:
pred

Prediction(uid='100', iid='217', r_ui=None, est=3.52986, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})

In [10]:
#Total items
iids =ratings['iid'].unique()
print(iids)

[ 242  302  377 ... 1637 1630 1641]


In [11]:
# items rated by UID 50
u_iid =ratings[ratings['uid']==100]['iid'].unique()
print(u_iid)

[ 344  354  268  321  355  750  266  288  302  340  689  905  289  691
  316 1236  342  990  333  752  323  348  313  292 1238  879  300  328
 1235 1237  678  286  908  690  874  880  349  310  347 1234  270 1233
  326  269  258  900  886  294  272  881  895  892  887  885  346  751
  271  898  315]


In [12]:
iids_to_predict  = np.setdiff1d(iids, u_iid)    #Not rated by UID 50
print(iids_to_predict)

[   1    2    3 ... 1680 1681 1682]


In [13]:
len(iids_to_predict)

1623

In [14]:
#Extracting the estimated ratings from iids_to_predict
testset = [[100,iid,0.] for iid in iids_to_predict]
testset

[[100, 1, 0.0],
 [100, 2, 0.0],
 [100, 3, 0.0],
 [100, 4, 0.0],
 [100, 5, 0.0],
 [100, 6, 0.0],
 [100, 7, 0.0],
 [100, 8, 0.0],
 [100, 9, 0.0],
 [100, 10, 0.0],
 [100, 11, 0.0],
 [100, 12, 0.0],
 [100, 13, 0.0],
 [100, 14, 0.0],
 [100, 15, 0.0],
 [100, 16, 0.0],
 [100, 17, 0.0],
 [100, 18, 0.0],
 [100, 19, 0.0],
 [100, 20, 0.0],
 [100, 21, 0.0],
 [100, 22, 0.0],
 [100, 23, 0.0],
 [100, 24, 0.0],
 [100, 25, 0.0],
 [100, 26, 0.0],
 [100, 27, 0.0],
 [100, 28, 0.0],
 [100, 29, 0.0],
 [100, 30, 0.0],
 [100, 31, 0.0],
 [100, 32, 0.0],
 [100, 33, 0.0],
 [100, 34, 0.0],
 [100, 35, 0.0],
 [100, 36, 0.0],
 [100, 37, 0.0],
 [100, 38, 0.0],
 [100, 39, 0.0],
 [100, 40, 0.0],
 [100, 41, 0.0],
 [100, 42, 0.0],
 [100, 43, 0.0],
 [100, 44, 0.0],
 [100, 45, 0.0],
 [100, 46, 0.0],
 [100, 47, 0.0],
 [100, 48, 0.0],
 [100, 49, 0.0],
 [100, 50, 0.0],
 [100, 51, 0.0],
 [100, 52, 0.0],
 [100, 53, 0.0],
 [100, 54, 0.0],
 [100, 55, 0.0],
 [100, 56, 0.0],
 [100, 57, 0.0],
 [100, 58, 0.0],
 [100, 59, 0.0],
 [100,

In [15]:
testset = [[100,iid,0.] for iid in iids_to_predict]
predictions = algo.test(testset)
predictions[:5]

[Prediction(uid=100, iid=1, r_ui=0.0, est=4.4499638991027775, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=100, iid=2, r_ui=0.0, est=3.1746676342628377, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=100, iid=3, r_ui=0.0, est=3.223974938384731, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=100, iid=4, r_ui=0.0, est=3.5747654705675784, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=100, iid=5, r_ui=0.0, est=3.4745866651729056, details={'actual_k': 40, 'was_impossible': False})]

In [16]:
len(predictions)

1623

In [17]:
(predictions[4].iid,predictions[4].est)

(5, 3.4745866651729056)

In [18]:
# for i in len(predictions):
#     (predictions[i].iid,predictions[i].est)

In [19]:
pred_ratings=[(predictions[i].iid,predictions[i].est) for i in range(0,len(predictions))]
predicted_rating=pd.DataFrame(pred_ratings,columns=['iid','est_rating'])
exp_ratings=predicted_rating.sort_values(by='est_rating',ascending=False)

In [20]:
sorted_exp=exp_ratings.sort_values(by=['est_rating','iid'],ascending=[False,True])
sorted_exp.head(10)

Unnamed: 0,iid,est_rating
774,814,5.0
1068,1122,5.0
1135,1189,5.0
1147,1201,5.0
1233,1293,5.0
1407,1467,5.0
1440,1500,5.0
1476,1536,5.0
1539,1599,5.0
1593,1653,5.0


Tuning for best k

In [21]:
from surprise.model_selection import GridSearchCV
from surprise.model_selection.split import KFold

In [22]:
param_grid={'k':[20,30,40,50,60,70],'user_based':[True,False]}
param_grid

{'k': [20, 30, 40, 50, 60, 70], 'user_based': [True, False]}

In [23]:
kfold=KFold(n_splits=5,random_state=23,shuffle=True)
gs=GridSearchCV(surprise.KNNBasic,param_grid,measures=['rmse','mae'],cv=kfold)

In [24]:
gs.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [25]:
gs.best_score['rmse']

0.9765768260490179

In [26]:
gs.best_score['mae']

0.7702810060041623

In [27]:
print(gs.best_params['rmse'])

{'k': 20, 'user_based': True}


In [28]:
print(gs.best_params['mae'])

{'k': 20, 'user_based': True}


In [29]:
gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f47550a0950>

In [30]:
gs.best_estimator['mae']
algo.fit(data.build_full_trainset())

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f47550a0950>

# ITEM BASED


In [31]:
param_grid={'k':[20,30,40,50,60,70],'sim_options':{'name':['cosine'],'user_based':[False]}}
param_grid

kfold=KFold(n_splits=5,random_state=23,shuffle=True)
gs=GridSearchCV(surprise.KNNBasic,param_grid,measures=['rmse','mae'],cv=kfold)

In [32]:
gs.fit(data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing th

In [33]:
gs.best_score['rmse']

1.0240756703702425

In [34]:
print(gs.best_params['rmse'])

{'k': 60, 'sim_options': {'name': 'cosine', 'user_based': False}}


In [35]:
gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f47550a0950>

In [39]:
movies =pd.read_csv('u.item',sep='|',encoding='latin-1',header=None)
movies=movies.iloc[:,:3]
movies.columns=['iid','movie','release_date']
movies.head()

Unnamed: 0,iid,movie,release_date
0,1,Toy Story (1995),01-Jan-1995
1,2,GoldenEye (1995),01-Jan-1995
2,3,Four Rooms (1995),01-Jan-1995
3,4,Get Shorty (1995),01-Jan-1995
4,5,Copycat (1995),01-Jan-1995


In [40]:
sorted_exp=sorted_exp.merge(movies,on='iid')
sorted_exp.head(10)

Unnamed: 0,iid,est_rating,movie,release_date
0,814,5.0,"Great Day in Harlem, A (1994)",01-Jan-1994
1,1122,5.0,They Made Me a Criminal (1939),01-Jan-1939
2,1189,5.0,Prefontaine (1997),24-Jan-1997
3,1201,5.0,Marlene Dietrich: Shadow and Light (1996),02-Apr-1996
4,1293,5.0,Star Kid (1997),16-Jan-1998
5,1467,5.0,"Saint of Fort Washington, The (1993)",01-Jan-1993
6,1500,5.0,Santa with Muscles (1996),08-Nov-1996
7,1536,5.0,Aiqing wansui (1994),22-Jul-1996
8,1599,5.0,Someone Else's America (1995),10-May-1996
9,1653,5.0,Entertaining Angels: The Dorothy Day Story (1996),27-Sep-1996
