# Collaborative Filtering on a Movie Ratings Dataset


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the tab-separated u.data ratings file into a DataFrame.
# Column labels are supplied via `names`, so the first line of the file is read as data.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

ratings = pd.read_csv(
    'rec-sys-data-req/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

# Preview the first rows as a quick sanity check of the parse
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


The dataset contains 100,000 ratings.

In [3]:
# Show the (rows, columns) dimensions of the loaded ratings frame
ratings.shape

(100000, 4)

In [4]:
# Drop the timestamp column; none of the visible cells below read it.
# errors='ignore' keeps this cell idempotent: `ratings` is rebound to the result,
# so re-running the cell once the column is gone would otherwise raise KeyError.
ratings = ratings.drop(columns='timestamp', errors='ignore')

In [5]:
# Preview the first rows again to confirm the timestamp column is gone
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [6]:
# Re-check the dimensions after the column drop (row count should be unchanged)
ratings.shape

(100000, 3)

In [7]:
# List the column labels that remain in the frame
ratings.columns

Index(['user_id', 'movie_id', 'rating'], dtype='object')

In [8]:
# Count the distinct user ids present in the ratings
ratings['user_id'].nunique()

943

In [9]:
# Count the distinct movie ids present in the ratings
ratings['movie_id'].nunique()

1682

In [10]:
# Dependency: the `surprise` package must be importable before the next cell runs.
# Uncomment the line below to install it from inside the notebook.
# NOTE(review): the library's official docs install it as `scikit-surprise` —
# confirm which package name your environment needs.
# !pip install surprise

In [11]:
#Import the required classes and methods from the surprise library
from surprise import Reader, Dataset, KNNBasic

In [12]:
# Display the full frame; pandas truncates the middle rows (the `...` row in the
# output), so this shows the first and last few ratings.
ratings

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
...,...,...,...
99995,880,476,3
99996,716,204,5
99997,276,1090,1
99998,13,225,2


In [13]:
# Define a Reader object.
# The Reader tells Surprise how to interpret the ratings; the scale is spelled out
# explicitly here (1 to 5 is also the library default) so it is visible to readers.
reader = Reader(rating_scale=(1, 5))

# Create the dataset to be used for building the filter.
# load_from_df reads columns by position (user id, item id, rating, in that order),
# so select them explicitly instead of relying on the frame's current layout.
# This also keeps the cell working if the timestamp column has not been dropped.
data = Dataset.load_from_df(ratings[['user_id', 'movie_id', 'rating']], reader)

In [14]:
# Echo the dataset object; the output shows only its default repr (type and address)
data

<surprise.dataset.DatasetAutoFolds at 0x1e0fa97ee20>

In [15]:
from surprise.model_selection import cross_validate

In [16]:
from surprise import SVD, accuracy
# SVD model constructed with the library defaults (no arguments passed)
algo = SVD()

In [17]:
# 5-fold cross-validation of the SVD model, scored by RMSE.
# verbose=True prints the per-fold table; the returned dict is echoed as cell output.
# NOTE(review): no random seed is set in the visible cells, so the exact scores
# presumably differ between runs — confirm, and seed if reproducibility matters.
cross_validate(
    algo,
    data,
    measures=['RMSE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9398  0.9305  0.9351  0.9328  0.9424  0.9361  0.0044  
Fit time          3.13    3.30    3.14    3.13    3.13    3.17    0.07    
Test time         0.08    0.12    0.09    0.12    0.13    0.11    0.02    


{'test_rmse': array([0.9397532 , 0.93047249, 0.93506181, 0.93275274, 0.94241927]),
 'fit_time': (3.134251594543457,
  3.303744077682495,
  3.137932538986206,
  3.1266586780548096,
  3.1265640258789062),
 'test_time': (0.08404850959777832,
  0.1213982105255127,
  0.08556985855102539,
  0.12496519088745117,
  0.12685775756835938)}

In [18]:
# Neighbourhood-based model: KNNBasic with its default settings
knn = KNNBasic()

# Cross-validated RMSE for the kNN model.
# cv=5 spells out the split count that cross_validate uses when cv is omitted,
# matching the five folds used for the SVD run.
cross_validate(
    knn,
    data,
    measures=['RMSE'],
    cv=5,
)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([0.97494503, 0.97851819, 0.97959047, 0.98067482, 0.98020485]),
 'fit_time': (0.31931376457214355,
  0.33152294158935547,
  0.36102962493896484,
  0.3479187488555908,
  0.3349435329437256),
 'test_time': (2.0846707820892334,
  2.1070923805236816,
  2.0984959602355957,
  2.1406140327453613,
  2.1309735774993896)}