In [2]:
import pandas as pd

In [3]:
rating_data = pd.read_csv('ratings.csv')

rating_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
min_ratings = 5         # minimum number of ratings each movieId receives
min_user_ratings = 5    # minimum number of movieIds given a rating by the user

In [5]:
# Look for movieIds whose total ratings are > min_ratings
cond_movie = rating_data['movieId'].value_counts() > min_ratings

# Output the filtered movie_id
filtered_movie_id = (
     cond_movie[cond_movie] # 1. filter movieId that meets the conditions above
     .index                 # 2. extract its movieId (its index)
     .tolist()              # 3. then save it in a list
)

In [6]:
cond_user = rating_data['userId'].value_counts() > min_user_ratings

# Output the filtered movie_id
filtered_user_id = (
     cond_user[cond_user] # 1. filter userIds that meet the conditions above
     .index               # 2. extract its userId (its index)
     .tolist()            # 3. then save it in a list
)

In [7]:
# Create the final dataset based on the conditions above
rating_data_final = (
     rating_data[                                         # Filter rating data complies
         (rating_data['movieId'].isin(filtered_movie_id)) # cond 1: corresponding movie Id filtered_movie_id
         &                                                # AND
         (rating_data['userId'].isin(filtered_user_id))   # cond 2: corresponding user Id filtered_user_id
     ]
)

rating_data_final

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100830,610,166528,4.0,1493879365
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047


In [59]:
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD, KNNBasic, CoClustering, SlopeOne, KNNBaseline
from sklearn.metrics import average_precision_score

In [10]:
# Create a reader object
reader = Reader(rating_scale=(1, 5)) # Define the rating scale. We use a scale of 1-5

# Create a data object
# This object was created to enable cross-validation of the model
data = Dataset.load_from_df(df = rating_data_final[['userId', 'movieId', 'rating']], # Insert data
                             reader = reader)    

In [60]:
# Initialize algorithms
algorithms = [
    SVD(),
    KNNBasic(),
    KNNBaseline(),
    CoClustering(),
    SlopeOne()
]

In [61]:
# Define a list to store results
results_list = []

In [62]:
# Define a cross-validation procedure for each number of folds
for cv_value in [5, 10]:
    for algo in algorithms:
        results = cross_validate(algo, data, measures=['RMSE', 'MAE'],
                                 cv=cv_value, verbose=True, n_jobs=-1, return_train_measures=True)

        # Store results in list
        results_list.append({
            'Algorithm': algo.__class__.__name__,
            'cv': cv_value,
            'RMSE': sum(results['test_rmse']) / len(results['test_rmse']),
            'Fit Time': sum(results['fit_time']) / len(results['fit_time']),
            'Train Time': sum(results['test_time']) / len(results['test_time']),
            'Test Time': sum(results['test_time']) / len(results['test_time']),
            'Predict Time': sum(results['test_time']) / len(results['test_time']),
            'Precision': sum(results['test_rmse']) / len(results['test_rmse']),
            'Recall': sum(results['test_mae']) / len(results['test_mae'])
        })

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8548  0.8613  0.8613  0.8529  0.8562  0.8573  0.0034  
MAE (testset)     0.6564  0.6601  0.6574  0.6555  0.6576  0.6574  0.0015  
RMSE (trainset)   0.6338  0.6328  0.6339  0.6345  0.6347  0.6339  0.0007  
MAE (trainset)    0.4912  0.4896  0.4916  0.4911  0.4918  0.4911  0.0008  
Fit time          0.53    0.53    0.51    0.52    0.53    0.52    0.01    
Test time         0.16    0.15    0.12    0.10    0.10    0.13    0.02    
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9184  0.9137  0.9194  0.9036  0.9097  0.9130  0.0058  
MAE (testset)     0.7044  0.7034  0.7050  0.6949  0.7003  0.7016  0.0037  
RMSE (trainset)   0.7453  0.7469  0.7464  0.7477  0.7462  0.7465  0.0008  
MAE (trainset)    0.5714  0.5725  0.5717  0.5732  0.5721  0.57

In [63]:
results_df = pd.DataFrame(results_list)

In [64]:
results_df

Unnamed: 0,Algorithm,cv,RMSE,Fit Time,Train Time,Test Time,Predict Time,Precision,Recall
0,SVD,5,0.857321,0.524266,0.12613,0.12613,0.12613,0.857321,0.65738
1,KNNBasic,5,0.912959,0.149047,0.992916,0.992916,0.992916,0.912959,0.701611
2,KNNBaseline,5,0.846976,0.231621,1.947362,1.947362,1.947362,0.846976,0.647977
3,CoClustering,5,0.910348,1.30728,0.116544,0.116544,0.116544,0.910348,0.70984
4,SlopeOne,5,0.863492,1.500722,3.179255,3.179255,3.179255,0.863492,0.661019
5,SVD,10,0.850185,1.092703,0.088182,0.088182,0.088182,0.850185,0.65133
6,KNNBasic,10,0.907249,0.24241,0.715679,0.715679,0.715679,0.907249,0.697047
7,KNNBaseline,10,0.843373,0.37888,1.183878,1.183878,1.183878,0.843373,0.644632
8,CoClustering,10,0.905142,1.649803,0.082097,0.082097,0.082097,0.905142,0.704658
9,SlopeOne,10,0.859503,2.675148,2.436526,2.436526,2.436526,0.859503,0.657234


In [65]:
sorted_results_df = results_df.sort_values(by='RMSE', ascending=True)
sorted_results_df

Unnamed: 0,Algorithm,cv,RMSE,Fit Time,Train Time,Test Time,Predict Time,Precision,Recall
7,KNNBaseline,10,0.843373,0.37888,1.183878,1.183878,1.183878,0.843373,0.644632
2,KNNBaseline,5,0.846976,0.231621,1.947362,1.947362,1.947362,0.846976,0.647977
5,SVD,10,0.850185,1.092703,0.088182,0.088182,0.088182,0.850185,0.65133
0,SVD,5,0.857321,0.524266,0.12613,0.12613,0.12613,0.857321,0.65738
9,SlopeOne,10,0.859503,2.675148,2.436526,2.436526,2.436526,0.859503,0.657234
4,SlopeOne,5,0.863492,1.500722,3.179255,3.179255,3.179255,0.863492,0.661019
8,CoClustering,10,0.905142,1.649803,0.082097,0.082097,0.082097,0.905142,0.704658
6,KNNBasic,10,0.907249,0.24241,0.715679,0.715679,0.715679,0.907249,0.697047
3,CoClustering,5,0.910348,1.30728,0.116544,0.116544,0.116544,0.910348,0.70984
1,KNNBasic,5,0.912959,0.149047,0.992916,0.992916,0.992916,0.912959,0.701611
