## Collaborative Based Filtering

### Load the Data

In [6]:
import pandas as pd
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')[['userId', 'movieId', 'rating']]

In [7]:
ratings

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0
...,...,...,...
99999,671,6268,2.5
100000,671,6269,4.0
100001,671,6365,4.0
100002,671,6385,2.5


## Create the dataset

In [15]:
from surprise import Dataset, Reader

# Specifying the range of ratings in our dataset via the Reader class
reader = Reader(rating_scale=(1,5))

dataset = Dataset.load_from_df(ratings, reader)
dataset

<surprise.dataset.DatasetAutoFolds at 0x7f124fa36550>

## Building training set

In [16]:
trainset =  dataset.build_full_trainset()

In [17]:
list(trainset.all_ratings())

[(0, 0, 2.5),
 (0, 1, 3.0),
 (0, 2, 3.0),
 (0, 3, 2.0),
 (0, 4, 4.0),
 (0, 5, 2.0),
 (0, 6, 2.0),
 (0, 7, 2.0),
 (0, 8, 3.5),
 (0, 9, 2.0),
 (0, 10, 2.5),
 (0, 11, 1.0),
 (0, 12, 4.0),
 (0, 13, 4.0),
 (0, 14, 3.0),
 (0, 15, 2.0),
 (0, 16, 2.0),
 (0, 17, 2.5),
 (0, 18, 1.0),
 (0, 19, 3.0),
 (1, 20, 4.0),
 (1, 21, 5.0),
 (1, 22, 5.0),
 (1, 23, 4.0),
 (1, 24, 4.0),
 (1, 25, 3.0),
 (1, 26, 3.0),
 (1, 27, 4.0),
 (1, 28, 3.0),
 (1, 29, 5.0),
 (1, 30, 4.0),
 (1, 31, 3.0),
 (1, 32, 3.0),
 (1, 33, 3.0),
 (1, 34, 3.0),
 (1, 35, 3.0),
 (1, 36, 3.0),
 (1, 37, 5.0),
 (1, 38, 1.0),
 (1, 39, 3.0),
 (1, 40, 3.0),
 (1, 41, 3.0),
 (1, 42, 4.0),
 (1, 43, 4.0),
 (1, 44, 5.0),
 (1, 45, 5.0),
 (1, 46, 3.0),
 (1, 47, 4.0),
 (1, 48, 3.0),
 (1, 49, 4.0),
 (1, 50, 3.0),
 (1, 51, 4.0),
 (1, 52, 2.0),
 (1, 53, 1.0),
 (1, 54, 3.0),
 (1, 55, 4.0),
 (1, 56, 4.0),
 (1, 57, 3.0),
 (1, 58, 3.0),
 (1, 59, 3.0),
 (1, 60, 3.0),
 (1, 61, 2.0),
 (1, 62, 3.0),
 (1, 63, 3.0),
 (1, 64, 3.0),
 (1, 65, 3.0),
 (1, 66, 2.0),
 (1, 

## Training the Model

In [18]:
from surprise import SVD

svd = SVD()

In [19]:
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f1271d318d0>

In [53]:
ratings[ratings['userId']==15]

Unnamed: 0,userId,movieId,rating
962,15,1,2.0
963,15,2,2.0
964,15,5,4.5
965,15,6,4.0
966,15,10,3.0
...,...,...,...
2657,15,160271,2.5
2658,15,160563,1.0
2659,15,160565,2.0
2660,15,160567,4.0


In [52]:
# Testing model for (userId, movieId) from ratings dataset
 
test_rating = ratings.loc[(ratings['userId']==15) & (ratings['movieId']==1956), 'rating'].item()
ratings[test_user & test_movie]

Unnamed: 0,userId,movieId,rating
1379,15,1956,4.0


In [50]:
score = svd.predict(15, 1956).est
print(f"Model Guess: {score}")
print(f"Guess Accuracy %: {(score / test_rating) * 100}")

Model Guess: 3.4148728119192553
Guess Accuracy %: 85.37182029798139


In [55]:
# Predicting rating given by user 15 for random movie id
random_rating = svd.predict(15, 3).est
print(f"Model Guess: {random_rating}")

Model Guess: 2.084440534223795


## Validating Model

In [23]:
from surprise import model_selection

# Checking model performance
model_selection.cross_validate(svd, dataset, measures=['RMSE', 'MAE'])

{'test_rmse': array([0.89228817, 0.88671952, 0.90685887, 0.8981757 , 0.90391414]),
 'test_mae': array([0.68624521, 0.68126913, 0.69926971, 0.6889442 , 0.69704241]),
 'fit_time': (0.8427402973175049,
  0.837303876876831,
  0.8253676891326904,
  0.8283286094665527,
  0.8428876399993896),
 'test_time': (0.0701601505279541,
  0.07283735275268555,
  0.20593714714050293,
  0.07574295997619629,
  0.07784366607666016)}

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=2b4c37fb-98c9-4796-92e9-b15a63b179e7' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>