**Задание по теме «Коллаборативная фильтрация»**

**Преподаватели:** Наталья Баданина, Юлия Пономарева, Егор Шишковец \
Пакет SURPRISE:


*   используйте данные MovieLens 1M,
можно использовать любые модели из пакета,
*   получите RMSE на тестовом сете 0,87 и ниже.

Комментарий преподавателя: \
В домашнем задании на датасет 1М может не хватить RAM. Можно сделать на 100K. Качество RMSE предлагаю считать на основе Cross-validation (5 фолдов), а не на отложенном датасете.

In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm

from scipy.spatial.distance import cityblock, cosine, euclidean, hamming, jaccard

In [2]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [3]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [4]:
movies_with_ratings = movies.merge(ratings, on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [5]:
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [6]:
num_users = movies_with_ratings.userId.unique().shape[0]
num_users

610

In [7]:
# Поскольку фильмов гораздо больше, чем пользователей, лучше взять user-based алгоритм

In [None]:
!pip install surprise

In [9]:
from surprise import KNNWithMeans, KNNBaseline
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV

In [10]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

dataset.head()

Unnamed: 0,uid,iid,rating
0,1,Toy Story (1995),4.0
1,5,Toy Story (1995),4.0
2,7,Toy Story (1995),4.5
3,15,Toy Story (1995),2.5
4,17,Toy Story (1995),4.5


In [11]:
ratings.rating.min(), ratings.rating.max()

(0.5, 5.0)

In [12]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [13]:
# KNN centered

In [None]:
param_grid = {'k': [10, 30, 50],
              'sim_options'[0]: ['cityblock', 'cosine', 'jaccard',
                                 'pearson_baseline'],
              'sim_options'[1]: [False, True]
              }
gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse'], cv=5)
gs.fit(data)

KNNcentered_score = gs.best_score['rmse']
KNNcentered_params = gs.best_params['rmse']

print(KNNcentered_score)
print(KNNcentered_params)

In [19]:
# удивительно, что в итоге лучшим оказался item-based

In [15]:
# KNN Baseline

In [None]:
param_grid = {'k': [20, 40, 70],
              'min_k': [0,1],
              'sim_options'[1]: [False],
              'bsl_options'[0]: ['als'],
              'bsl_options'[1]: [5, 10, 20],
              'bsl_options'[2]: [10, 12, 15],
              'bsl_options'[3]: [5, 10]
              }

# 'bsl_options': {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}

gs = GridSearchCV(KNNBaseline, param_grid, measures=['rmse'], cv=5)
gs.fit(data)

KNNBaseline_score = gs.best_score['rmse']
KNNBaseline_params = gs.best_params['rmse']

print(KNNBaseline_score)
print(KNNBaseline_params)

In [32]:
# Co-Clustering

In [17]:
from surprise.prediction_algorithms.co_clustering import CoClustering

param_grid = {'n_cltr_u': range(1,4),
              'n_cltr_i': range(1,4),
              'n_epochs': [5, 10, 20, 30],
              'random_state': [42],
              }
gs = GridSearchCV(CoClustering, param_grid, measures=['rmse'], cv=5,
                  joblib_verbose=2)
gs.fit(data)

CoClustering_score = gs.best_score['rmse']
CoClustering_params = gs.best_params['rmse']

print(CoClustering_score)
print(CoClustering_params)

[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:  1.5min
[Parallel(n_jobs=1)]: Done 161 tasks      | elapsed:  6.2min


0.9361866344150256
{'n_cltr_u': 1, 'n_cltr_i': 1, 'n_epochs': 5, 'random_state': 42}


In [18]:
# SVD

In [20]:
from surprise.prediction_algorithms.matrix_factorization import SVD

param_grid = {'n_factors': [100, 200, 300, 400],
              'n_epochs': [10, 30, 40, 60],
              'lr_all': [0.001, 0.005, 0.01, 0.05],
              'reg_all': [0.02],
              'random_state': [42]
              }
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5)
gs.fit(data)

SVD_score = gs.best_score['rmse']
SVD_params = gs.best_params['rmse']

print(SVD_score)
print(SVD_params)

0.8584774552672272
{'n_factors': 400, 'n_epochs': 30, 'lr_all': 0.05, 'reg_all': 0.02, 'random_state': 42}


In [31]:
# SVDpp

In [21]:
from surprise.prediction_algorithms.matrix_factorization import SVDpp

param_grid = {'n_factors': [10, 20, 50],
              'n_epochs': [10, 20],
              'cache_ratings': [True],
              'random_state': [42]
              }

gs = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=5)
gs.fit(data)

SVDpp_score = gs.best_score['rmse']
SVDpp_params = gs.best_params['rmse']

print(SVDpp_score)
print(SVDpp_params)

0.8609152434461812
{'n_factors': 10, 'n_epochs': 20, 'cache_ratings': True, 'random_state': 42}


In [None]:
# можно было бы попробовать бОльшее количество параметров и скор был бы получше,
# но время выполнения этого алгоритма очень большое

In [33]:
# NMF

In [23]:
from surprise.prediction_algorithms.matrix_factorization import NMF

param_grid = {'n_factors': [10, 15, 50],
              'n_epochs': [30, 50, 70],
              'random_state': [42]
              }

gs = GridSearchCV(NMF, param_grid, measures=['rmse'], cv=5)
gs.fit(data)

NMF_score = gs.best_score['rmse']
NMF_params = gs.best_params['rmse']

print(NMF_score)
print(NMF_params)

0.9033039878948891
{'n_factors': 50, 'n_epochs': 70, 'random_state': 42}


In [24]:
data_compare = {'method': ['KNNcentered', 'KNNBaseline','CoClustering', 'SVD',
                           'SVDpp', 'NMF'],
                'score': [round(KNNcentered_score, 3), round(KNNBaseline_score,3),
                          round(CoClustering_score, 3), round(SVD_score, 3),
                          round(SVDpp_score, 3), round(NMF_score, 3)],
                'params': [KNNcentered_params, KNNBaseline_params,
                           CoClustering_params, SVDpp_params, SVD_params, NMF_params]
                }

compare_methods = pd.DataFrame(data_compare)
compare_methods

Unnamed: 0,method,score,params
0,KNNcentered,0.897,"{'k': 50, 's': 'cityblock', 'i': False}"
1,KNNBaseline,0.875,"{'k': 40, 'min_k': 0, 'i': False, 'b': 'als', ..."
2,CoClustering,0.936,"{'n_cltr_u': 1, 'n_cltr_i': 1, 'n_epochs': 5, ..."
3,SVD,0.858,"{'n_factors': 10, 'n_epochs': 20, 'cache_ratin..."
4,SVDpp,0.861,"{'n_factors': 400, 'n_epochs': 30, 'lr_all': 0..."
5,NMF,0.903,"{'n_factors': 50, 'n_epochs': 70, 'random_stat..."


In [30]:
# лучший скор у SVD.

In [25]:
# создадим рекомендации на основе наиболее оптимального алгоритма

In [26]:
algo = SVD(n_factors= 400, n_epochs= 30, lr_all= 0.05, reg_all= 0.02,
           random_state = 42)

cv = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=False)
cv['test_rmse'].mean()

0.8592612746062949

In [27]:
def generate_recommendation(uid, model, dataset, thresh=4, amount=5):
    all_titles = list(dataset['iid'].values)
    users_seen_titles = dataset[dataset['uid'] == uid]['iid']
    titles = np.array(list(set(all_titles) - set(users_seen_titles)))

    np.random.shuffle(titles)

    rec_list = []
    for title in titles:
        review_prediction = model.predict(uid=uid, iid=title)
        rating = review_prediction.est

        if rating >= thresh:
            rec_list.append((title, round(rating, 2)))

            if len(rec_list) == amount:
                return rec_list

In [29]:
generate_recommendation(5, algo, dataset, thresh=4.5, amount = 10)

[('Man Who Shot Liberty Valance, The (1962)', 4.52),
 ('Paths of Glory (1957)', 4.51),
 ('Godfather, The (1972)', 4.62),
 ('Citizen Kane (1941)', 4.59),
 ('Clockwork Orange, A (1971)', 4.61),
 ('Amadeus (1984)', 4.53),
 ('Hoop Dreams (1994)', 4.74),
 ('Casablanca (1942)', 4.55),
 ('Spider-Man 2 (2004)', 4.54),
 ('Psycho (1960)', 4.73)]

Статья с таблицей с метриками качества и временем выполнения каждого алгоритма: \
https://laptopprocessors.ru/slope-one-python-surprise/?ysclid=ln92se2jy5103278678