In [15]:
import pandas as pd
import numpy as np
from surprise.model_selection import GridSearchCV
from surprise.model_selection.split import KFold
import surprise

In [16]:
ratings_df=pd.read_csv('u.data',sep='\t',names=['uid','iid','rating','timestamp'])
ratings_df.head()

Unnamed: 0,uid,iid,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [17]:
lowest_rating=ratings_df['rating'].min()
highest_rating=ratings_df['rating'].max()
print('Rating range between {0} and {1}'.format(lowest_rating,highest_rating))

Rating range between 1 and 5


In [19]:
reader=surprise.Reader(rating_scale=(lowest_rating,highest_rating))
data=surprise.Dataset.load_from_df(ratings_df.drop('timestamp',axis=1),reader)
type(data)

surprise.dataset.DatasetAutoFolds

In [20]:
similarity_options={'name':'cosine','user_based':True}
algo = surprise.KNNBasic(sim_options=similarity_options)
output=algo.fit(data.build_full_trainset())

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [23]:
pred=algo.predict(uid='50',iid='242')
print(pred.est)

3.52986


In [24]:
iids = ratings_df['iid'].unique()
print(iids)

[ 242  302  377 ... 1637 1630 1641]


In [25]:
u_iid = ratings_df[ratings_df['uid']==22]['iid'].unique()
print(u_iid)

[ 377  376  128   80  241  258  510   79  791  511  227  433  173  687
  456  399  186   96  294  780   29  117  683  403  999  502  435  121
  176  222  550  948  393  110  648  118  692  386  226  238  554  407
  878  455  358  208  172  230  997 1001  153  154  194  195  411  168
  204  871  651  665  187   53  515   85  996  451  840  932  167  862
  105  431  231  367  449  202  216  988  184 1003  211   94  175  636
   24  684  229  546  163  109  523   21 1002  201  568  181   17 1000
  566  144    2  161  688  712  384  290  430  210   89  228  732    4
  127  926   62  250  405   50  209  526  174  998  731  385  265  233
  792   68]


In [26]:
iids_to_predict = np.setdiff1d(iids,u_iid)
print(iids_to_predict)

[   1    3    5 ... 1680 1681 1682]


In [28]:
testset = [[22,iid,0.] for iid in iids_to_predict]
testset

[[22, 1, 0.0],
 [22, 3, 0.0],
 [22, 5, 0.0],
 [22, 6, 0.0],
 [22, 7, 0.0],
 [22, 8, 0.0],
 [22, 9, 0.0],
 [22, 10, 0.0],
 [22, 11, 0.0],
 [22, 12, 0.0],
 [22, 13, 0.0],
 [22, 14, 0.0],
 [22, 15, 0.0],
 [22, 16, 0.0],
 [22, 18, 0.0],
 [22, 19, 0.0],
 [22, 20, 0.0],
 [22, 22, 0.0],
 [22, 23, 0.0],
 [22, 25, 0.0],
 [22, 26, 0.0],
 [22, 27, 0.0],
 [22, 28, 0.0],
 [22, 30, 0.0],
 [22, 31, 0.0],
 [22, 32, 0.0],
 [22, 33, 0.0],
 [22, 34, 0.0],
 [22, 35, 0.0],
 [22, 36, 0.0],
 [22, 37, 0.0],
 [22, 38, 0.0],
 [22, 39, 0.0],
 [22, 40, 0.0],
 [22, 41, 0.0],
 [22, 42, 0.0],
 [22, 43, 0.0],
 [22, 44, 0.0],
 [22, 45, 0.0],
 [22, 46, 0.0],
 [22, 47, 0.0],
 [22, 48, 0.0],
 [22, 49, 0.0],
 [22, 51, 0.0],
 [22, 52, 0.0],
 [22, 54, 0.0],
 [22, 55, 0.0],
 [22, 56, 0.0],
 [22, 57, 0.0],
 [22, 58, 0.0],
 [22, 59, 0.0],
 [22, 60, 0.0],
 [22, 61, 0.0],
 [22, 63, 0.0],
 [22, 64, 0.0],
 [22, 65, 0.0],
 [22, 66, 0.0],
 [22, 67, 0.0],
 [22, 69, 0.0],
 [22, 70, 0.0],
 [22, 71, 0.0],
 [22, 72, 0.0],
 [22, 73, 0.0],

In [29]:
predictions = algo.test(testset)

In [30]:
predictions[:5]

[Prediction(uid=22, iid=1, r_ui=0.0, est=3.9263049321165795, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=22, iid=3, r_ui=0.0, est=2.978607120004787, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=22, iid=5, r_ui=0.0, est=3.3493124009469555, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=22, iid=6, r_ui=0.0, est=3.5939200895622343, details={'actual_k': 26, 'was_impossible': False}),
 Prediction(uid=22, iid=7, r_ui=0.0, est=3.948329869569515, details={'actual_k': 40, 'was_impossible': False})]

# Tuning for best k

In [32]:
pararm_grid = {'k': np.arange(20,71,10)}
pararm_grid

{'k': array([20, 30, 40, 50, 60, 70])}

In [33]:
kfold=KFold(n_splits=5,random_state=23,shuffle=True)
gs=GridSearchCV(surprise.KNNBasic, param_grid=pararm_grid, measures=['rmse','mae'],cv=kfold)

In [34]:
gs.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [35]:
gs.best_score['rmse']

0.9765768260490179

In [38]:
gs.best_params

{'rmse': {'k': 20}, 'mae': {'k': 20}}