In [1]:
import pandas as pd
import numpy as np

from surprise import Dataset
from surprise import Reader
from surprise.dataset import BUILTIN_DATASETS #с помощью данного объекта мы можем использовать встроенные датасеты
from surprise import model_selection
from surprise import SVD, KNNBasic, accuracy

In [2]:
data = Dataset.load_from_file(
    "data/u.data.txt",
    reader=Reader(line_format="user item rating timestamp", sep="\t"),
)

In [3]:
df = pd.DataFrame(data.raw_ratings, columns=['userId', 'movieId', 'rating', 'timestamp'])
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [4]:
df.movieId.nunique()

1682

In [5]:
df.userId.nunique()

943

In [6]:
df.rating.value_counts()

4.0    34174
3.0    27145
5.0    21201
2.0    11370
1.0     6110
Name: rating, dtype: int64

In [7]:
trainset, testset = model_selection.train_test_split(data, test_size=0.25, random_state=13)

In [8]:
len(testset)

25000

In [9]:
sim_options = {
    'name': 'cosine',
    'user_based': False
}
 
knn = KNNBasic(sim_options=sim_options)

In [10]:
knn.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x11f6441c0>

In [11]:
predictions = knn.test(testset)
predictions

[Prediction(uid='7', iid='633', r_ui=5.0, est=4.199452349030111, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='422', iid='287', r_ui=3.0, est=3.4703437660463736, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='804', iid='163', r_ui=3.0, est=3.5716736533692854, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='189', iid='480', r_ui=5.0, est=4.222825780855538, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='238', iid='546', r_ui=3.0, est=3.473417286928204, details={'actual_k': 17, 'was_impossible': False}),
 Prediction(uid='804', iid='216', r_ui=4.0, est=3.922551907749182, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='350', iid='204', r_ui=4.0, est=4.345238219480267, details={'actual_k': 38, 'was_impossible': False}),
 Prediction(uid='708', iid='993', r_ui=4.0, est=3.4458505791534115, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='193', iid='1078', r_ui=4.0, es

In [12]:
df_pred = pd.DataFrame(predictions)
df_pred.head()

Unnamed: 0,uid,iid,r_ui,est,details
0,7,633,5.0,4.199452,"{'actual_k': 40, 'was_impossible': False}"
1,422,287,3.0,3.470344,"{'actual_k': 40, 'was_impossible': False}"
2,804,163,3.0,3.571674,"{'actual_k': 40, 'was_impossible': False}"
3,189,480,5.0,4.222826,"{'actual_k': 40, 'was_impossible': False}"
4,238,546,3.0,3.473417,"{'actual_k': 17, 'was_impossible': False}"


In [13]:
df_pred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   uid      25000 non-null  object 
 1   iid      25000 non-null  object 
 2   r_ui     25000 non-null  float64
 3   est      25000 non-null  float64
 4   details  25000 non-null  object 
dtypes: float64(2), object(3)
memory usage: 976.7+ KB


In [14]:
df_pred[(df_pred['uid'] == '500') & (df_pred['iid'] == '699')]

Unnamed: 0,uid,iid,r_ui,est,details
946,500,699,3.0,3.47479,"{'actual_k': 40, 'was_impossible': False}"


In [15]:
accuracy.rmse(predictions)

RMSE: 1.0272


1.0271678039029761

In [16]:
pred = pd.DataFrame(predictions)
pred.sort_values(by=['est'],inplace=True,ascending = False)

In [17]:
recom = pred[pred.uid =='849']['iid'].to_list()

In [18]:
recom

['234', '427', '568', '174']

In [19]:
sim_options1 = {
    'name': 'cosine',
    'user_based': True
}
 
knn1 = KNNBasic(sim_options=sim_options1)

In [20]:
knn1.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x120ac3ac0>

In [21]:
predictions1 = knn1.test(testset)
predictions1

[Prediction(uid='7', iid='633', r_ui=5.0, est=4.150997362033697, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='422', iid='287', r_ui=3.0, est=3.754016750770759, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='804', iid='163', r_ui=3.0, est=3.70096140390557, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='189', iid='480', r_ui=5.0, est=4.524549229206855, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='238', iid='546', r_ui=3.0, est=3.1991812561559425, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='804', iid='216', r_ui=4.0, est=3.9500160918588327, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='350', iid='204', r_ui=4.0, est=4.1983705425936275, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='708', iid='993', r_ui=4.0, est=3.6795202365264075, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='193', iid='1078', r_ui=4.0, es

In [22]:
accuracy.rmse(predictions1)

RMSE: 1.0175


1.0174852296380237

In [23]:
svd_mod = SVD()
svd_mod.fit(trainset)
predictions2 = svd_mod.test(testset)
accuracy.rmse(predictions2)

RMSE: 0.9410


0.9410154142852298