### Hybrid recommender systems

#### Получаем данные

In [1]:
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

import pandas as pd
import numpy as np

In [2]:
!pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from surprise import SVD, BaselineOnly, NMF, SlopeOne, KNNWithMeans, NormalPredictor, KNNBaseline, KNNWithZScore, KNNBasic
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split, KFold

In [4]:
!wget 'https://drive.google.com/uc?id=1m0rwReR09achL0xTM6QPoN4tykz5bOMx' -O MovieLens.zip
!unzip MovieLens.zip

--2023-05-28 20:50:04--  https://drive.google.com/uc?id=1m0rwReR09achL0xTM6QPoN4tykz5bOMx
Resolving drive.google.com (drive.google.com)... 173.194.216.139, 173.194.216.138, 173.194.216.100, ...
Connecting to drive.google.com (drive.google.com)|173.194.216.139|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-0g-84-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/elc7j0rqmbv9n5cmv6r8qii4m7qqfntd/1685307000000/02611596255248067438/*/1m0rwReR09achL0xTM6QPoN4tykz5bOMx?uuid=aaa6b78a-c189-4eae-92f1-f7c74a0e1f2d [following]
--2023-05-28 20:50:05--  https://doc-0g-84-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/elc7j0rqmbv9n5cmv6r8qii4m7qqfntd/1685307000000/02611596255248067438/*/1m0rwReR09achL0xTM6QPoN4tykz5bOMx?uuid=aaa6b78a-c189-4eae-92f1-f7c74a0e1f2d
Resolving doc-0g-84-docs.googleusercontent.com (doc-0g-84-docs.googleusercontent.com)... 142.250.97.132, 2607:f8b0:400c:c18::84
Connecting to do

In [5]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [6]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [7]:
movies_with_ratings

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,9.649827e+08
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,8.474350e+08
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1.106636e+09
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1.510578e+09
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1.305696e+09
...,...,...,...,...,...,...
100849,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184.0,4.0,1.537109e+09
100850,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184.0,3.5,1.537110e+09
100851,193585,Flint (2017),Drama,184.0,3.5,1.537110e+09
100852,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184.0,3.5,1.537110e+09


In [8]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [9]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [10]:
trainset, testset = train_test_split(data, test_size=.2, random_state=42)

#### Подготавливаем разные алгоритмы

In [11]:
SVD_algo = SVD(n_epochs=50, reg_all=0.05)
SVD_algo.fit(trainset)
test_pred = SVD_algo.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8599


0.8598987521246207

In [12]:
BaselineOnly_algo = BaselineOnly(bsl_options = {"method": "als", "n_epochs": 100, "reg_u": 12, "reg_i": 2})
BaselineOnly_algo.fit(trainset)
test_pred = BaselineOnly_algo.test(testset)
accuracy.rmse(test_pred, verbose=True)

Estimating biases using als...
RMSE: 0.8618


0.861799679683496

In [13]:
KNNBaseline_algo = KNNBaseline(min_k=10, bsl_options = {"method": "als", "n_epochs": 50, "reg_u": 12, "reg_i": 2})
KNNBaseline_algo.fit(trainset)
test_pred = KNNBaseline_algo.test(testset)
accuracy.rmse(test_pred, verbose=True)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8568


0.8567623802139614

In [14]:
SlopeOne_algo = SlopeOne()
SlopeOne_algo.fit(trainset)
test_pred = SlopeOne_algo.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8946


0.8945783911352443

#### Комбинирование гибридная рекомендательная система

In [15]:
# Комбинирование
def recommend_for_user(user_id, first_algo, second_algo):
  user_non_watched_movies = movies_with_ratings[movies_with_ratings.userId != user_id]
  dataset = pd.DataFrame({
    'uid': user_non_watched_movies.userId,
    'iid': user_non_watched_movies.title,
    'rating': user_non_watched_movies.rating
  })
  data = Dataset.load_from_df(dataset, Reader(rating_scale=(0.5, 5.0))).build_full_trainset()
  data = [(x.uid, x.iid, x.rating) for x in list(dataset.itertuples(index=False))]
  first_algo_scores, second_algo_scores = [], []

  first_algo_predictions = first_algo.test(data)

  for prediction in first_algo_predictions:
    first_algo_scores.append((prediction.uid, prediction.iid, prediction.est))
  
  first_algo_scores.sort(key=lambda elem: elem[2], reverse=True)

  # btw it's less than 100, got only unique titles
  top_100_by_first_algo = []
  for score in first_algo_scores[:100]:
    if score[1] not in [x[1] for x in top_100_by_first_algo]:
      top_100_by_first_algo.append(score)
  
  for uid, iid, _ in top_100_by_first_algo:
    second_algo_scores.append((uid, iid, second_algo.predict(uid=uid, iid=iid).est))
  
  second_algo_scores.sort(key=lambda elem: elem[2], reverse=True)
  top_10_by_second_algo = second_algo_scores[:10] 

  return [x for x in top_10_by_second_algo]

In [16]:
recommend_for_user(2.0, SVD_algo, KNNBaseline_algo)

[(122.0, 'Usual Suspects, The (1995)', 5.0),
 (93.0, 'Braveheart (1995)', 5.0),
 (53.0, 'Immortal Beloved (1994)', 5.0),
 (122.0, 'Pulp Fiction (1994)', 5.0),
 (30.0, 'Shawshank Redemption, The (1994)', 5.0),
 (251.0, 'Forrest Gump (1994)', 5.0),
 (93.0, "Schindler's List (1993)", 5.0),
 (122.0, 'Fargo (1996)', 5.0),
 (171.0, 'Wallace & Gromit: The Best of Aardman Animation (1996)', 5.0),
 (53.0, 'Roman Holiday (1953)', 5.0)]

In [17]:
recommend_for_user(2.0, KNNBaseline_algo, SVD_algo)

[(53.0, 'Immortal Beloved (1994)', 5.0),
 (30.0, 'Star Wars: Episode IV - A New Hope (1977)', 5.0),
 (30.0, 'Shawshank Redemption, The (1994)', 5.0),
 (171.0, 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)', 4.911756612888624),
 (122.0, 'Taxi Driver (1976)', 4.9104818914806225),
 (43.0,
  'City of Lost Children, The (Cité des enfants perdus, La) (1995)',
  4.855755257357367),
 (35.0, 'Usual Suspects, The (1995)', 4.855494664621993),
 (171.0, 'Seven (a.k.a. Se7en) (1995)', 4.8449825159286775),
 (523.0,
  'Léon: The Professional (a.k.a. The Professional) (Léon) (1994)',
  4.840556198051412),
 (43.0, 'Toy Story (1995)', 4.836428551999437)]

In [18]:
recommend_for_user(2.0, SVD_algo, BaselineOnly_algo)

[(122.0, 'Usual Suspects, The (1995)', 5.0),
 (122.0, 'Pulp Fiction (1994)', 5.0),
 (53.0, 'Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)', 5.0),
 (122.0, 'Reservoir Dogs (1992)', 5.0),
 (171.0, 'Streetcar Named Desire, A (1951)', 5.0),
 (122.0, 'Monty Python and the Holy Grail (1975)', 5.0),
 (122.0, 'Goodfellas (1990)', 5.0),
 (452.0, 'Psycho (1960)', 5.0),
 (452.0, 'Shining, The (1980)', 5.0),
 (122.0, 'Cool Hand Luke (1967)', 5.0)]

In [19]:
recommend_for_user(2.0, KNNBaseline_algo, BaselineOnly_algo)

[(43.0,
  'City of Lost Children, The (Cité des enfants perdus, La) (1995)',
  5.0),
 (43.0, 'Toy Story (1995)', 4.926523158995917),
 (30.0, 'Shawshank Redemption, The (1994)', 4.925146249265773),
 (122.0, 'Taxi Driver (1976)', 4.886456931835486),
 (523.0,
  'Léon: The Professional (a.k.a. The Professional) (Léon) (1994)',
  4.8199198384941955),
 (53.0, 'Immortal Beloved (1994)', 4.79590851835073),
 (171.0, 'Seven (a.k.a. Se7en) (1995)', 4.775368184459195),
 (30.0, 'Star Wars: Episode IV - A New Hope (1977)', 4.760759136308771),
 (171.0, 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)', 4.753316435635924),
 (35.0, 'Usual Suspects, The (1995)', 4.652094067462009)]

In [20]:
recommend_for_user(2.0, KNNBaseline_algo, SlopeOne_algo)

[(43.0, 'Toy Story (1995)', 5.0),
 (43.0,
  'City of Lost Children, The (Cité des enfants perdus, La) (1995)',
  5.0),
 (53.0, 'Immortal Beloved (1994)', 5.0),
 (30.0, 'Star Wars: Episode IV - A New Hope (1977)', 5.0),
 (523.0,
  'Léon: The Professional (a.k.a. The Professional) (Léon) (1994)',
  5.0),
 (30.0, 'Shawshank Redemption, The (1994)', 5.0),
 (171.0, 'Seven (a.k.a. Se7en) (1995)', 4.973670287958682),
 (171.0, 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)', 4.931574304065654),
 (122.0, 'Taxi Driver (1976)', 4.923592725948297),
 (35.0, 'Usual Suspects, The (1995)', 4.790296860723791)]

In [21]:
recommend_for_user(2.0, SlopeOne_algo, BaselineOnly_algo)

[(43.0,
  'City of Lost Children, The (Cité des enfants perdus, La) (1995)',
  5.0),
 (43.0, 'Seven (a.k.a. Se7en) (1995)', 5.0),
 (43.0, 'Braveheart (1995)', 5.0),
 (43.0, 'Forrest Gump (1994)', 5.0),
 (43.0, 'Fugitive, The (1993)', 5.0),
 (1.0, 'Usual Suspects, The (1995)', 4.9983716679259835),
 (43.0, 'Lion King, The (1994)', 4.965598417556597),
 (1.0, 'Pulp Fiction (1994)', 4.958474918810048),
 (43.0, 'Toy Story (1995)', 4.926523158995917),
 (30.0, 'Shawshank Redemption, The (1994)', 4.925146249265773)]

#### Смешанная гибридная рекомендательная система

In [22]:
# Смешивание
def recommend_for_user_v2(user_id, first_algo, second_algo):
  user_non_watched_movies = movies_with_ratings[movies_with_ratings.userId != user_id]
  dataset = pd.DataFrame({
    'uid': user_non_watched_movies.userId,
    'iid': user_non_watched_movies.title,
    'rating': user_non_watched_movies.rating
  })
  data = Dataset.load_from_df(dataset, Reader(rating_scale=(0.5, 5.0))).build_full_trainset()
  data = [(x.uid, x.iid, x.rating) for x in list(dataset.itertuples(index=False))]
  first_algo_scores, second_algo_scores = [], []

  for prediction in first_algo.test(data):
    first_algo_scores.append((prediction.uid, prediction.iid, prediction.est))
  
  first_algo_scores.sort(key=lambda elem: elem[2], reverse=True)

  for prediction in second_algo.test(data):
    second_algo_scores.append((prediction.uid, prediction.iid, prediction.est))
  
  second_algo_scores.sort(key=lambda elem: elem[2], reverse=True)

  avg_score = []
  i, j = 0, 0
  while(len(avg_score) < 10):
    if (first_algo_scores[i][2] > second_algo_scores[j][2]):
      i += 1
      if (first_algo_scores[i][1] not in [x[1] for x in avg_score]):
        avg_score.append(first_algo_scores[i])
    else:
      j += 1
      if (second_algo_scores[j][1] not in [x[1] for x in avg_score]):
        avg_score.append(second_algo_scores[j])

  return avg_score

In [23]:
recommend_for_user_v2(2.0, KNNBaseline_algo, BaselineOnly_algo)

[(43.0, 'Seven (a.k.a. Se7en) (1995)', 5.0),
 (122.0, 'Usual Suspects, The (1995)', 5.0),
 (43.0, 'Braveheart (1995)', 5.0),
 (122.0, 'Star Wars: Episode IV - A New Hope (1977)', 5.0),
 (43.0, 'Pulp Fiction (1994)', 5.0),
 (43.0, 'Shawshank Redemption, The (1994)', 5.0),
 (43.0, 'Forrest Gump (1994)', 5.0),
 (43.0, 'Fugitive, The (1993)', 5.0),
 (452.0, "Schindler's List (1993)", 5.0),
 (452.0, 'Blade Runner (1982)', 5.0)]

In [24]:
recommend_for_user_v2(2.0, SVD_algo, KNNBaseline_algo)

[(43.0,
  'City of Lost Children, The (Cité des enfants perdus, La) (1995)',
  5.0),
 (171.0, 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)', 5.0),
 (171.0, 'Seven (a.k.a. Se7en) (1995)', 5.0),
 (35.0, 'Usual Suspects, The (1995)', 5.0),
 (30.0, 'Braveheart (1995)', 5.0),
 (122.0, 'Taxi Driver (1976)', 5.0),
 (53.0, 'Immortal Beloved (1994)', 5.0),
 (30.0, 'Star Wars: Episode IV - A New Hope (1977)', 5.0),
 (523.0,
  'Léon: The Professional (a.k.a. The Professional) (Léon) (1994)',
  5.0),
 (37.0, 'Pulp Fiction (1994)', 5.0)]

In [25]:
recommend_for_user_v2(2.0, SlopeOne_algo, BaselineOnly_algo)

[(43.0, 'Seven (a.k.a. Se7en) (1995)', 5.0),
 (122.0, 'Usual Suspects, The (1995)', 5.0),
 (43.0, 'Braveheart (1995)', 5.0),
 (122.0, 'Star Wars: Episode IV - A New Hope (1977)', 5.0),
 (43.0, 'Pulp Fiction (1994)', 5.0),
 (43.0, 'Shawshank Redemption, The (1994)', 5.0),
 (43.0, 'Forrest Gump (1994)', 5.0),
 (43.0, 'Fugitive, The (1993)', 5.0),
 (452.0, "Schindler's List (1993)", 5.0),
 (452.0, 'Blade Runner (1982)', 5.0)]

### Выводы

Попробовал на практике применение гибридных рекомендательных систем.

В ходе работы обучил несколько алгоритмов (SVD, KNNBaseline, BaselineOnly, SlopeOne) и применил их в двух видах гибридных систем. Смешаная и комбинированная.

Мне кажется что комбинированная гибридная система выглядит наиболее надежной т.к. некоторые алгоритмы могут по ошибке предсказать у всех оценку 5 и тогда в алгоритме смешивании у нас будут все фильмы из этого алгоритма, хотя это может быть ошибочно.