In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from recommenders import ClusteringBasedRecommender,ClusteringAndAprioriBasedRecommender, Data, Apriori
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, SpectralClustering, OPTICS, Birch
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer, KNNImputer

In [4]:
import random 
def test_error(epochs, recommender,data:Data, seed=196, sample_size=1000):
    random.seed(seed)
    rand_seeds = [random.randint(0, 1000) for _ in range(epochs)]
    avg_error = 0
    for i in range(epochs):
        test_data = data.test_data.sample(n=sample_size, random_state=rand_seeds[i])
        test_data['predicted_rating'] = test_data.apply(
            lambda row: recommender.predict(int(row['userId']), int(row['movieId'])), axis=1
            )
        avg_error += np.mean(np.abs(test_data['rating'] - test_data['predicted_rating']))
    return avg_error / epochs

In [3]:
'''testing the clustering based reccomender on different combinations'''
imputers = [SimpleImputer(strategy='mean'),
            SimpleImputer(strategy='median'),
            KNNImputer(n_neighbors=4, weights='uniform')]
preprocessors = [[], [MinMaxScaler()],
                 [StandardScaler()], [Normalizer()],
                 [StandardScaler()],
                 [MinMaxScaler(), Normalizer()]]
clusterers = [KMeans(n_clusters=20), 
              DBSCAN(eps=0.5, min_samples=5),
              AgglomerativeClustering(n_clusters=20),
              SpectralClustering(n_clusters=20),
              OPTICS(min_samples=5),
              Birch(n_clusters=10)]
errors = {}

for i,imputer in enumerate(tqdm(imputers)):
    for preprocessor in preprocessors:
        for clusterer in clusterers:
            data = Data('ml-latest-small', 
                        imputer=imputer,
                        preprocessors=preprocessor)

            recommender = ClusteringBasedRecommender(
                data=data.train_data_table_for_clustering_normalized,
                data_unnormalized=data.train_data_table_for_clustering,
                movie_genres=data.movie_genres,
                clusterer=clusterer
                )
            recommender.train()
            errors[(imputer, tuple(preprocessor), clusterer)] = test_error(2, recommender,data,2, sample_size=100)


  0%|          | 0/3 [00:00<?, ?it/s]

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
100%|█████

In [4]:
print("average error when predicting ratings:")
for key, value in sorted(errors.items(), key=lambda x: x[1]):
    print(value, key)

average error when predicting ratings:
0.6525000000000001 (SimpleImputer(strategy='median'), (), AgglomerativeClustering(n_clusters=20))
0.6575 (SimpleImputer(), (), Birch(n_clusters=10))
0.6575 (SimpleImputer(), (StandardScaler(),), KMeans(n_clusters=20))
0.6575 (KNNImputer(n_neighbors=4), (MinMaxScaler(),), AgglomerativeClustering(n_clusters=20))
0.66 (SimpleImputer(strategy='median'), (StandardScaler(),), KMeans(n_clusters=20))
0.665 (KNNImputer(n_neighbors=4), (), KMeans(n_clusters=20))
0.6675 (SimpleImputer(), (), KMeans(n_clusters=20))
0.6699999999999999 (SimpleImputer(strategy='median'), (MinMaxScaler(),), AgglomerativeClustering(n_clusters=20))
0.67 (KNNImputer(n_neighbors=4), (), Birch(n_clusters=10))
0.6725000000000001 (SimpleImputer(strategy='median'), (), KMeans(n_clusters=20))
0.675 (SimpleImputer(strategy='median'), (), Birch(n_clusters=10))
0.6775 (SimpleImputer(), (MinMaxScaler(),), KMeans(n_clusters=20))
0.6775 (KNNImputer(n_neighbors=4), (), AgglomerativeClustering(n_

In [5]:
'''one of twhe best clustering options tried with assosciation rules apriori, tried with 
different confidences and supports to check which is th best '''
imputer = SimpleImputer(strategy='median')
preprocessor = []
clusterer = AgglomerativeClustering(n_clusters=20)
#clusterer =KMeans(n_clusters=20)
data = Data('ml-latest-small', 
                        imputer= imputer,
                        preprocessors=preprocessor)
errors = {}
supports = [0.0007,0.0008,0.0009,0.001,0.002]
confidences = [0.7,0.8,0.9]
for supp in tqdm(supports):
    for  conf in confidences:
        recommender = ClusteringAndAprioriBasedRecommender(
        data=data.train_data_table_for_clustering_normalized,
        data_unnormalized=data.train_data_table_for_clustering,
        movie_genres=data.movie_genres,
        clusterer=clusterer,min_support = supp,metric="confidence",min_threshold=conf)
        recommender.train()
        errors[(imputer, tuple(preprocessor), clusterer,"support{}".format(supp),"confidence{}".format(conf))] = test_error(2, recommender,data,2, sample_size=100)
       
for key, value in sorted(errors.items(), key=lambda x: x[1]):
    print("average error when predicting ratings:")
    print(value, key)

100%|██████████| 20/20 [10:47<00:00, 32.40s/it]
100%|██████████| 20/20 [10:11<00:00, 30.57s/it]
100%|██████████| 20/20 [10:09<00:00, 30.47s/it]
100%|██████████| 20/20 [07:40<00:00, 23.01s/it]t]
100%|██████████| 20/20 [07:31<00:00, 22.59s/it]
100%|██████████| 20/20 [08:16<00:00, 24.82s/it]
100%|██████████| 20/20 [06:05<00:00, 18.25s/it]t]
100%|██████████| 20/20 [06:04<00:00, 18.22s/it]
100%|██████████| 20/20 [06:05<00:00, 18.28s/it]
100%|██████████| 20/20 [04:39<00:00, 13.98s/it]t]
100%|██████████| 20/20 [04:43<00:00, 14.19s/it]
100%|██████████| 20/20 [04:43<00:00, 14.18s/it]
100%|██████████| 20/20 [02:31<00:00,  7.58s/it]t]
100%|██████████| 20/20 [02:48<00:00,  8.42s/it]
100%|██████████| 20/20 [02:43<00:00,  8.18s/it]
100%|██████████| 5/5 [1:35:23<00:00, 1144.63s/it]

average error when predicting ratings:
0.6375 (SimpleImputer(strategy='median'), (), AgglomerativeClustering(n_clusters=20), 'support0.0007', 'confidence0.7')
average error when predicting ratings:
0.6375 (SimpleImputer(strategy='median'), (), AgglomerativeClustering(n_clusters=20), 'support0.0008', 'confidence0.7')
average error when predicting ratings:
0.6375 (SimpleImputer(strategy='median'), (), AgglomerativeClustering(n_clusters=20), 'support0.0009', 'confidence0.7')
average error when predicting ratings:
0.6375 (SimpleImputer(strategy='median'), (), AgglomerativeClustering(n_clusters=20), 'support0.001', 'confidence0.7')
average error when predicting ratings:
0.6375 (SimpleImputer(strategy='median'), (), AgglomerativeClustering(n_clusters=20), 'support0.002', 'confidence0.7')
average error when predicting ratings:
0.64 (SimpleImputer(strategy='median'), (), AgglomerativeClustering(n_clusters=20), 'support0.0007', 'confidence0.8')
average error when predicting ratings:
0.64 (Simpl




In [6]:
'''another of the best clustering combinations tried with assosciation rules apriori, 
tried with different confidences and supports to see which is the best '''
imputer = SimpleImputer()
preprocessor = []
clusterer = KMeans(n_clusters=20)
data = Data('ml-latest-small', 
                        imputer= imputer,
                        preprocessors=preprocessor)
errors = {}
supports = [0.0007,0.0008,0.0009,0.001,0.002]
confidences = [0.7,0.8,0.9]
for supp in tqdm(supports):
    for  conf in confidences:
        recommender = ClusteringAndAprioriBasedRecommender(
        data=data.train_data_table_for_clustering_normalized,
        data_unnormalized=data.train_data_table_for_clustering,
        movie_genres=data.movie_genres,
        clusterer=clusterer,min_support = supp,metric="confidence",min_threshold=conf)
        recommender.train()
        errors[(imputer, tuple(preprocessor), clusterer,"support{}".format(supp),"confidence{}".format(conf))] = test_error(2, recommender,data,2, sample_size=100)
        
for key, value in sorted(errors.items(), key=lambda x: x[1]):
    print("average error when predicting ratings:")
    print(value, key)

  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 20/20 [11:06<00:00, 33.31s/it]


average error when predicting ratings:


  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 20/20 [10:35<00:00, 31.76s/it]


average error when predicting ratings:


  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 20/20 [10:30<00:00, 31.53s/it]
 20%|██        | 1/5 [32:19<2:09:19, 1939.79s/it]

average error when predicting ratings:


  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 20/20 [08:12<00:00, 24.62s/it]


average error when predicting ratings:


  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 20/20 [07:40<00:00, 23.04s/it]


average error when predicting ratings:


  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 20/20 [07:40<00:00, 23.00s/it]
 40%|████      | 2/5 [55:59<1:21:42, 1634.01s/it]

average error when predicting ratings:


  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 20/20 [05:31<00:00, 16.57s/it]


average error when predicting ratings:


  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 20/20 [05:44<00:00, 17.23s/it]


average error when predicting ratings:


  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 20/20 [05:24<00:00, 16.21s/it]
 60%|██████    | 3/5 [1:12:47<44:55, 1347.88s/it]

average error when predicting ratings:


  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 20/20 [04:31<00:00, 13.57s/it]


average error when predicting ratings:


  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 20/20 [04:55<00:00, 14.79s/it]


average error when predicting ratings:


  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 20/20 [04:59<00:00, 14.99s/it]
 80%|████████  | 4/5 [1:27:20<19:20, 1160.57s/it]

average error when predicting ratings:


  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 20/20 [01:40<00:00,  5.01s/it]


average error when predicting ratings:


  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 20/20 [01:50<00:00,  5.54s/it]


average error when predicting ratings:


  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 20/20 [01:49<00:00,  5.46s/it]
100%|██████████| 5/5 [1:32:47<00:00, 1113.42s/it]

average error when predicting ratings:
0.6325000000000001 (SimpleImputer(), (), KMeans(n_clusters=20), 'support0.001', 'confidence0.9')
0.6375 (SimpleImputer(), (), KMeans(n_clusters=20), 'support0.001', 'confidence0.7')
0.6375 (SimpleImputer(), (), KMeans(n_clusters=20), 'support0.001', 'confidence0.8')
0.645 (SimpleImputer(), (), KMeans(n_clusters=20), 'support0.0007', 'confidence0.7')
0.645 (SimpleImputer(), (), KMeans(n_clusters=20), 'support0.0007', 'confidence0.8')
0.6475 (SimpleImputer(), (), KMeans(n_clusters=20), 'support0.0008', 'confidence0.7')
0.6475 (SimpleImputer(), (), KMeans(n_clusters=20), 'support0.0009', 'confidence0.9')
0.6525000000000001 (SimpleImputer(), (), KMeans(n_clusters=20), 'support0.0009', 'confidence0.7')
0.6525000000000001 (SimpleImputer(), (), KMeans(n_clusters=20), 'support0.002', 'confidence0.7')
0.655 (SimpleImputer(), (), KMeans(n_clusters=20), 'support0.0008', 'confidence0.8')
0.655 (SimpleImputer(), (), KMeans(n_clusters=20), 'support0.0009', 'con




In [3]:
'''determining the optimal number of clusters when we decided to use Kmeans() with no prcprocessor and SipleImputer after the tests 
in the previous two cells'''
imputer = SimpleImputer()
preprocessor = []
data = Data('ml-latest-small', 
                        imputer=imputer,
                        preprocessors=preprocessor)
errors = {}
for i in range(2,100,2):
            clusterer = KMeans(n_clusters=i)
            recommender = ClusteringBasedRecommender(
                data=data.train_data_table_for_clustering_normalized,
                data_unnormalized=data.train_data_table_for_clustering,
                movie_genres=data.movie_genres,
                clusterer=clusterer
                )
            recommender.train()
            errors[(imputer, tuple(preprocessor), clusterer)] = test_error(2, recommender,data,2, sample_size=100)
for key, value in sorted(errors.items(), key=lambda x: x[1]):
    print(value, key)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super().

0.6325000000000001 (SimpleImputer(), (), KMeans(n_clusters=28))
0.635 (SimpleImputer(), (), KMeans(n_clusters=34))
0.635 (SimpleImputer(), (), KMeans(n_clusters=60))
0.6375 (SimpleImputer(), (), KMeans(n_clusters=32))
0.6375 (SimpleImputer(), (), KMeans(n_clusters=56))
0.64 (SimpleImputer(), (), KMeans(n_clusters=36))
0.645 (SimpleImputer(), (), KMeans(n_clusters=18))
0.645 (SimpleImputer(), (), KMeans(n_clusters=24))
0.645 (SimpleImputer(), (), KMeans(n_clusters=30))
0.645 (SimpleImputer(), (), KMeans(n_clusters=62))
0.6475 (SimpleImputer(), (), KMeans(n_clusters=20))
0.6475 (SimpleImputer(), (), KMeans(n_clusters=22))
0.6475 (SimpleImputer(), (), KMeans(n_clusters=44))
0.6475 (SimpleImputer(), (), KMeans(n_clusters=48))
0.6475 (SimpleImputer(), (), KMeans(n_clusters=58))
0.65 (SimpleImputer(), (), KMeans(n_clusters=86))
0.6525 (SimpleImputer(), (), KMeans(n_clusters=50))
0.6525000000000001 (SimpleImputer(), (), KMeans(n_clusters=92))
0.655 (SimpleImputer(), (), KMeans(n_clusters=40))

In [4]:
'''The best clustering option that we decided upon is now combined with an optimal number of clusters for it, and the Apriori algorithm'''
imputer = SimpleImputer()
preprocessor = []
clusterer = KMeans(n_clusters=28)
data = Data('ml-latest-small', 
                        imputer= imputer,
                        preprocessors=preprocessor)
errors = {}
supports = [0.0007,0.0008,0.0009,0.001,0.002]
confidences = [0.7,0.8,0.9]
for supp in tqdm(supports):
    for  conf in confidences:
        recommender = ClusteringAndAprioriBasedRecommender(
        data=data.train_data_table_for_clustering_normalized,
        data_unnormalized=data.train_data_table_for_clustering,
        movie_genres=data.movie_genres,
        clusterer=clusterer,min_support = supp,metric="confidence",min_threshold=conf)
        recommender.train()
        errors[(imputer, tuple(preprocessor), clusterer,"support{}".format(supp),"confidence{}".format(conf))] = test_error(2, recommender,data,2, sample_size=100)
        
for key, value in sorted(errors.items(), key=lambda x: x[1]):
    print("average error when predicting ratings:")
    print(value, key)

  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 28/28 [08:19<00:00, 17.86s/it]
  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 28/28 [08:49<00:00, 18.92s/it]
  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 28/28 [09:23<00:00, 20.11s/it]
  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 28/28 [06:07<00:00, 13.11s/it]
  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 28/28 [06:24<00:00, 13.73s/it]
  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 28/28 [06:00<00:00, 12.86s/it]
  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 28/28 [04:14<00:00,  9.09s/it]
  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 28/28 [04:39<00:00,  9.97s/it]
  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 28/28 [04:31<00:00,  9.71s/it]
  super()._check_params_vs_input(X, default_n_init=10)
100%|██████████| 2

average error when predicting ratings:
0.595 (SimpleImputer(), (), KMeans(n_clusters=28), 'support0.001', 'confidence0.7')
average error when predicting ratings:
0.6174999999999999 (SimpleImputer(), (), KMeans(n_clusters=28), 'support0.0009', 'confidence0.9')
average error when predicting ratings:
0.6275 (SimpleImputer(), (), KMeans(n_clusters=28), 'support0.002', 'confidence0.7')
average error when predicting ratings:
0.63 (SimpleImputer(), (), KMeans(n_clusters=28), 'support0.0009', 'confidence0.8')
average error when predicting ratings:
0.63 (SimpleImputer(), (), KMeans(n_clusters=28), 'support0.001', 'confidence0.8')
average error when predicting ratings:
0.6325000000000001 (SimpleImputer(), (), KMeans(n_clusters=28), 'support0.0008', 'confidence0.7')
average error when predicting ratings:
0.6325000000000001 (SimpleImputer(), (), KMeans(n_clusters=28), 'support0.0008', 'confidence0.8')
average error when predicting ratings:
0.6325000000000001 (SimpleImputer(), (), KMeans(n_clusters




In [4]:
'''one of three best clustering options tried with assosciation rules apriori, tried with 
different confidences and supports to check which is th best '''
imputer = SimpleImputer()
preprocessor = []
clusterer = Birch(n_clusters=10)
#clusterer =KMeans(n_clusters=20)
data = Data('ml-latest-small', 
                        imputer= imputer,
                        preprocessors=preprocessor)
errors = {}
supports = [0.0009,0.001,0.002]
confidences = [0.7,0.8,0.9]
for supp in tqdm(supports):
    for  conf in confidences:
        recommender = ClusteringAndAprioriBasedRecommender(
        data=data.train_data_table_for_clustering_normalized,
        data_unnormalized=data.train_data_table_for_clustering,
        movie_genres=data.movie_genres,
        clusterer=clusterer,min_support = supp,metric="confidence",min_threshold=conf)
        recommender.train()
        errors[(imputer, tuple(preprocessor), clusterer,"support{}".format(supp),"confidence{}".format(conf))] = test_error(2, recommender,data,2, sample_size=100)
       
for key, value in sorted(errors.items(), key=lambda x: x[1]):
    print("average error when predicting ratings:")
    print(value, key)

100%|██████████| 10/10 [08:12<00:00, 49.22s/it]
100%|██████████| 10/10 [08:14<00:00, 49.42s/it]
100%|██████████| 10/10 [07:55<00:00, 47.56s/it]
100%|██████████| 10/10 [05:59<00:00, 35.99s/it]
100%|██████████| 10/10 [06:01<00:00, 36.18s/it]
100%|██████████| 10/10 [06:05<00:00, 36.53s/it]
100%|██████████| 10/10 [01:45<00:00, 10.51s/it]
100%|██████████| 10/10 [01:50<00:00, 11.09s/it]
100%|██████████| 10/10 [01:50<00:00, 11.08s/it]
100%|██████████| 3/3 [48:06<00:00, 962.09s/it] 

average error when predicting ratings:
0.65 (SimpleImputer(), (), Birch(n_clusters=10), 'support0.0009', 'confidence0.7')
average error when predicting ratings:
0.65 (SimpleImputer(), (), Birch(n_clusters=10), 'support0.001', 'confidence0.7')
average error when predicting ratings:
0.6525000000000001 (SimpleImputer(), (), Birch(n_clusters=10), 'support0.0009', 'confidence0.8')
average error when predicting ratings:
0.6525000000000001 (SimpleImputer(), (), Birch(n_clusters=10), 'support0.0009', 'confidence0.9')
average error when predicting ratings:
0.6525000000000001 (SimpleImputer(), (), Birch(n_clusters=10), 'support0.001', 'confidence0.8')
average error when predicting ratings:
0.6525000000000001 (SimpleImputer(), (), Birch(n_clusters=10), 'support0.001', 'confidence0.9')
average error when predicting ratings:
0.655 (SimpleImputer(), (), Birch(n_clusters=10), 'support0.002', 'confidence0.7')
average error when predicting ratings:
0.6575 (SimpleImputer(), (), Birch(n_clusters=10), 'su


