In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from recommenders import ClusteringBasedRecommender, Data
from sklearn.cluster import KMeans, DBSCAN

In [14]:
import random 
def test_error(epochs, recommender,data:Data, seed=196):
    random.seed(seed)
    rand_seeds = [random.randint(0, 1000) for _ in range(epochs)]
    avg_error = 0
    for i in tqdm(range(epochs)):
        test_data = data.test_data.sample(n=1000, random_state=rand_seeds[i])
        test_data['predicted_rating'] = test_data.apply(
            lambda row: recommender.predict(int(row['userId']), int(row['movieId'])), axis=1
            )
        avg_error += np.mean(np.abs(test_data['rating'] - test_data['predicted_rating']))
    return avg_error / epochs

In [3]:
# Hyper-parameter grids. The i-th entries of the three lists are evaluated
# together in one loop iteration (KMeans uses `clusters`, DBSCAN uses
# `epsilons` and `min_samples`).
clusters = [5, 20, 40, 80]
epsilons = [0.5, 0.3, 0.1, 0.7]
min_samples = [5, 10, 20, 3]
seed = 124

data = Data('ml-latest-small')

# One error value per hyper-parameter setting, in grid order.
errors_kmeans, errors_dbscan = [], []

for n_cluster, eps, min_samp in zip(clusters, epsilons, min_samples):
    # --- KMeans-backed recommender ---
    kmeans_params = {'n_clusters': n_cluster, "random_state": seed}
    kmeans = ClusteringBasedRecommender(
        data=data.train_data_table_for_clustering,
        movie_genres=data.movie_genres,
        Clusterer=KMeans,
        clusterer_params=kmeans_params,
    )
    kmeans.train()
    kmeans_error = test_error(5, kmeans, data, seed)
    errors_kmeans.append(kmeans_error)

    # --- DBSCAN-backed recommender ---
    dbscan_params = {'eps': eps, 'min_samples': min_samp}
    dbscan = ClusteringBasedRecommender(
        data=data.train_data_table_for_clustering,
        movie_genres=data.movie_genres,
        Clusterer=DBSCAN,
        clusterer_params=dbscan_params,
    )
    dbscan.train()
    dbscan_error = test_error(5, dbscan, data, seed)
    errors_dbscan.append(dbscan_error)

print("kmeans:", errors_kmeans)
print("dbscan:", errors_dbscan)

100%|██████████| 5/5 [00:03<00:00,  1.31it/s]
100%|██████████| 5/5 [00:03<00:00,  1.39it/s]
100%|██████████| 5/5 [00:03<00:00,  1.51it/s]
100%|██████████| 5/5 [00:03<00:00,  1.49it/s]
100%|██████████| 5/5 [00:03<00:00,  1.54it/s]
100%|██████████| 5/5 [00:03<00:00,  1.44it/s]
100%|██████████| 5/5 [00:03<00:00,  1.51it/s]
100%|██████████| 5/5 [00:03<00:00,  1.39it/s]

kmeans: [0.7529760401586489, 0.7354300230780361, 0.729899393236475, 0.727501673133021]
dbscan: [0.8331100383769316, 0.8331100383769316, 0.8331100383769316, 0.8285061293098144]



