**1 Подгрузим данные и необходимые библиотеки**

In [None]:
# библиотека для скачивания данных с гугл.диска
!pip install --upgrade --no-cache-dir gdown



In [None]:
# библиотека для векторизации текста, обновим до актуальной версии
!pip install -U gensim



In [None]:
import gdown
import pandas as pd
import numpy as np
import random
import string
from tqdm import tqdm_notebook as tqdm
import re
from scipy import spatial
import heapq
import timeit
from ast import literal_eval

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from gensim.models import Word2Vec

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
pd.set_option('display.max_columns', None)

Файлы были взяты из этой задачки: kaggle.com/datasets/rounakbanik/the-movies-dataset

In [None]:
!gdown 1Co7iQWJhJdqNFTlChd-xT1b88VAzuX4f
!gdown 14OAYTUIw9BoCCSTFIiYzgPq-1XyP0dhQ
!gdown 1hrC11y_r0veFdZqkwH1MSUxSv_CwJtyC

Downloading...
From: https://drive.google.com/uc?id=1Co7iQWJhJdqNFTlChd-xT1b88VAzuX4f
To: /content/movies_metadata.csv
100% 34.4M/34.4M [00:00<00:00, 106MB/s] 
Downloading...
From: https://drive.google.com/uc?id=14OAYTUIw9BoCCSTFIiYzgPq-1XyP0dhQ
To: /content/ratings.csv
100% 710M/710M [00:09<00:00, 72.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1hrC11y_r0veFdZqkwH1MSUxSv_CwJtyC
To: /content/credits.csv
100% 190M/190M [00:03<00:00, 48.7MB/s]


**2 Проведем первичную обработку данных**

В первом датасете, которым мы воспользуемся, содержится вся основная информация о фильмах, например: бюджет, жанровая принадлежность, язык оригинала, название, описание, дата создания и тд.

In [None]:
# low_memory=False makes pandas parse each column in one pass, so a column gets
# a single consistent dtype and the mixed-type DtypeWarning is not emitted
movies_metadata = pd.read_csv('movies_metadata.csv', low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
movies_metadata.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [None]:
# drop malformed rows: in these records the `id` field holds a date-like string
# instead of a numeric id, which would break the later cast of the column to int
bad_ids = ['1997-08-20', '2012-09-29', '2014-01-01']
movies_metadata = movies_metadata[~movies_metadata['id'].isin(bad_ids)]

В этом наборе мы сосредоточимся на текстовых данных и будем работать со столбцами title (название фильма), overview (официальное описание фильма), genres (жанры) и production_companies (студии); столбец id нужен только для соединения с другими таблицами

In [None]:
# оставим только колонки, которые мы собираемся использовать
movies_metadata = movies_metadata[['id', 'title', 'overview', 'genres', 'production_companies']]

In [None]:
# заполняем пустые значения
movies_metadata = movies_metadata.fillna('')

In [None]:
# удалим дубли, потому что они есть и очень мешаются
movies_metadata = movies_metadata.drop_duplicates().reset_index(drop=True)

In [None]:
movies_metadata = movies_metadata.rename(columns={'id':'movieId'})

In [None]:
# меняем тип данных колонки, чтобы проще было джойниться с другой таблицей
movies_metadata['movieId'] = movies_metadata['movieId'].astype(int)

Во втором наборе содержится информация о съёмочной команде; нас будет интересовать имя режиссёра

In [None]:
credits = pd.read_csv('credits.csv')[['crew', 'id']]

In [None]:
credits = credits.rename(columns={'id':'movieId'})

In [None]:
credits = credits.drop_duplicates().reset_index(drop=True)

In [None]:
credits.head(5)

Unnamed: 0,crew,movieId
0,"[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


Теперь поработаем с еще одним набором - пользовательскими оценками фильмов

In [None]:
ratings = pd.read_csv('ratings.csv')

# выберем только достаточно высоко оцененные фильмы
ratings = ratings[ratings['rating'] > 3]

После подгрузки всех необходимых данных найдем их пересечение по айди фильмов

Дело в том, что не для всех фильмов есть данные о просмотрах и оценках, а для некоторых оценок нет данных о самом фильме

In [None]:
common_ids = set(movies_metadata.movieId).intersection(set(ratings.movieId))

ratings = ratings[ratings.movieId.isin(common_ids)].reset_index(drop=True)
movies_metadata = movies_metadata[movies_metadata.movieId.isin(common_ids)].reset_index(drop=True)

In [None]:
movies_metadata.shape[0]

6835

In [None]:
movies_metadata.groupby('movieId').count().sort_values(by='genres', ascending=False).head(4)

Unnamed: 0_level_0,title,overview,genres,production_companies
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,1,1,1,1
65300,1,1,1,1
65216,1,1,1,1
65188,1,1,1,1


In [None]:
credits = credits[credits.movieId.isin(common_ids)].reset_index(drop=True)

In [None]:
# здесь мы видим, что размерность не совпадает - есть подозрительные записи
credits.shape[0]

6837

In [None]:
# ищем задвоенные, но не идентичные записи
credits.groupby('movieId').count().sort_values(by='crew', ascending=False).head(4)

Unnamed: 0_level_0,crew
movieId,Unnamed: 1_level_1
159849,2
4912,2
64650,1
65216,1


In [None]:
credits[credits['movieId'] == 159849]

Unnamed: 0,crew,movieId
3181,"[{'credit_id': '52fe4c229251416c910f10d5', 'de...",159849
6056,"[{'credit_id': '52fe4c229251416c910f10d5', 'de...",159849


In [None]:
# Two rows are listed for movieId 159849 (index labels 3181 and 6056); remove the one labelled 3181.
# NOTE(review): the label is hard-coded from the output above and is only valid while the
# preceding filtering / reset_index steps produce the same row order — re-check after any data change.
credits.drop(credits[credits.index == 3181].index, inplace=True)

In [None]:
credits[credits['movieId'] == 4912]

Unnamed: 0,crew,movieId
1525,"[{'credit_id': '52fe43e2c3a36847f80760b5', 'de...",4912
5856,"[{'credit_id': '52fe43e2c3a36847f80760a9', 'de...",4912


In [None]:
# Two rows are listed for movieId 4912 (index labels 1525 and 5856); remove the one labelled 5856.
# NOTE(review): here the later row is removed, while for movieId 159849 the earlier row was removed;
# the label is hard-coded from the output above — re-check after any data change.
credits.drop(credits[credits.index == 5856].index, inplace=True)

In [None]:
# убеждаемся, что получили нужный размер
credits.shape[0]

6835

In [None]:
# соединяем с основным набором
movies_metadata = pd.merge(movies_metadata, credits, on='movieId')

In [None]:
movies_metadata.head(4) 

Unnamed: 0,movieId,title,overview,genres,production_companies,crew
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,949,Heat,"Obsessive master thief, Neil McCauley leads a ...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...","[{'name': 'Regency Enterprises', 'id': 508}, {...","[{'credit_id': '52fe4292c3a36847f802916d', 'de..."
3,710,GoldenEye,James Bond must unmask the mysterious head of ...,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...","[{'name': 'United Artists', 'id': 60}, {'name'...","[{'credit_id': '52fe426ec3a36847f801e14b', 'de..."


Cтолбцы genres, production_companies и crew имеют сложную структуру, но по факту их тип - строка, надо это исправить, чтобы достать из них необходимую нам информацию

In [None]:
type(movies_metadata['genres'][0])

str

In [None]:
# literal_eval позволяет восстановить исходную структуру признака из строки
movies_metadata['crew'] = movies_metadata['crew'].apply(literal_eval)
movies_metadata['genres'] = movies_metadata['genres'].apply(literal_eval)
movies_metadata['production_companies'] = movies_metadata['production_companies'].apply(literal_eval)

In [None]:
# получилось
type(movies_metadata['genres'][0])

list

Напишем и применим функции, достающие имена режиссёров, названия студий и жанры фильмов (для студии и жанра берётся только первое значение из списка)

In [None]:
def get_director(x):
    """Return the name of the first crew record whose job is 'Director'.

    x is an iterable of crew records (mappings with 'job' and 'name' keys).
    np.nan is returned when no record has the 'Director' job.
    """
    # lazy generator: records after the first director are never inspected
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    return next(director_names, np.nan)

In [None]:
def get_genres_company(x):
    """Return the 'name' of the first record in x, or np.nan when x is empty.

    Only the first record is used; any further records are ignored.
    """
    nothing = object()  # sentinel, so that any real first element is distinguishable from "empty"
    first = next(iter(x), nothing)
    if first is nothing:
        return np.nan
    return first['name']

In [None]:
movies_metadata['director'] = movies_metadata['crew'].apply(get_director)
movies_metadata['genres'] = movies_metadata['genres'].apply(get_genres_company)
movies_metadata['production_company'] = movies_metadata['production_companies'].apply(get_genres_company)

In [None]:
movies_metadata = movies_metadata[['movieId', 'title', 'overview', 'director', 'genres', 'production_company']]

In [None]:
movies_metadata.head(4)

Unnamed: 0,movieId,title,overview,director,genres,production_company
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",John Lasseter,Animation,Pixar Animation Studios
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Joe Johnston,Adventure,TriStar Pictures
2,949,Heat,"Obsessive master thief, Neil McCauley leads a ...",Michael Mann,Action,Regency Enterprises
3,710,GoldenEye,James Bond must unmask the mysterious head of ...,Martin Campbell,Adventure,United Artists


**3 С помощью обработки текста получим матрицу эмбедингов для каждого фильма**

Нам нужно перевести текстовые данные из названия и описания фильмов в формат, которым может оперировать компьютер - вектора эмбедингов. Перед векторизацией нужно предобработать данные и токенизировать их

После этого мы сможем выполнять операции над этими данными, например сравнивать между собой и находить расстояние между двумя фильмами

In [None]:
def preprocessing(data, lemmatizer = WordNetLemmatizer(),
                  stop_words = stopwords.words('english')):

    '''
       Clean and lemmatize every text of a column.

       input:
       - data - Series (one DataFrame column) with the raw texts
       - lemmatizer - object with a `lemmatize(word)` method
       - stop_words - iterable of words that are removed from the texts

       output:
       - list of cleaned texts (one string per input value, words separated
         by single spaces); a missing value (None / NaN) yields ''
    '''

    # set membership is O(1); the default is a plain list
    stop_words = set(stop_words)
    cleaned_texts = []

    for text in tqdm(data.values.tolist()):
        # a missing value must stay empty: str(nan) would otherwise inject
        # the literal token "nan" into the corpus
        if text is None or (isinstance(text, float) and text != text):
            text = ''

        # lower-case and replace every non-letter with a space
        letters_only = re.sub('[^a-zA-Z]', ' ', str(text).lower())

        # split() collapses any run of whitespace, then stop words are dropped
        tokens = [word for word in letters_only.split() if word not in stop_words]

        cleaned_texts.append(' '.join(lemmatizer.lemmatize(word) for word in tokens))

    return cleaned_texts

In [None]:
# NOTE: the stopwords corpus is already fetched by nltk.download('stopwords') in the
# import cell, so it is not downloaded a second time here

# text columns that are cleaned and fed to word2vec, in the order they are concatenated
text_columns = ["overview", "title", "director", "genres", "production_company"]

# clean every column from the raw data first, then write the cleaned text back
cleaned = {column: preprocessing(movies_metadata[column]) for column in text_columns}
movies_metadata = movies_metadata.assign(**cleaned)

# one shared corpus for word2vec: all cleaned texts of all columns, split into words
text_prepr = [doc.split() for column in text_columns for doc in cleaned[column]]

# per-column token lists (one list of words per film)
text_overview = [doc.split() for doc in cleaned["overview"]]
text_title = [doc.split() for doc in cleaned["title"]]
text_dir = [doc.split() for doc in cleaned["director"]]
text_genres = [doc.split() for doc in cleaned["genres"]]
text_company = [doc.split() for doc in cleaned["production_company"]]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/6835 [00:00<?, ?it/s]

  0%|          | 0/6835 [00:00<?, ?it/s]

  0%|          | 0/6835 [00:00<?, ?it/s]

  0%|          | 0/6835 [00:00<?, ?it/s]

  0%|          | 0/6835 [00:00<?, ?it/s]

In [None]:
# train word2vec on the shared corpus; passing the corpus to the constructor
# already builds the vocabulary and trains the model, so no separate
# build_vocab() call is made afterwards
model = Word2Vec(text_prepr,
                 vector_size=100,  # explicit: the embedding matrices below are 100-dimensional
                 min_count=2)

words = model.wv.index_to_key
vocab_size = len(words)

print("Vocab size =", vocab_size)

Vocab size = 16575


После обработки всех текстовых данных перейдем к созданию матрицы эмбедингов для каждого фильма

In [None]:
def embedding_matrix(text, model):

    """
       Build one embedding row per document.

       Input:
       - text  - list of tokenized documents (each document is a list of words)
       - model - word-vector lookup supporting `word in model` and `model[word]`
                 (the callers pass the `wv` attribute of the word2vec model)

       Output: array of shape (len(text), vector size); row i is the mean of
       the vectors of the known words of document i, or all zeros when none
       of its words is known to the model
    """

    # take the dimensionality from the lookup itself; fall back to 100
    vector_size = getattr(model, 'vector_size', 100)
    doc_vectors = np.zeros((len(text), vector_size))

    # enumerate gives the row directly: looking the document up with
    # text.index() is quadratic and maps every duplicate document to the row
    # of its first occurrence, leaving the other rows empty
    for row, sentence in enumerate(text):
        known = [model[word] for word in sentence if word in model]
        if known:
            # average all known words instead of keeping only the last one
            doc_vectors[row] = np.mean(known, axis=0)

    return doc_vectors

In [None]:
# number of distinct films
id_film = movies_metadata['movieId'].nunique()

# build an embedding matrix for every text feature
matrix_emb_over = embedding_matrix(text_overview, model.wv)
matrix_emb_title = embedding_matrix(text_title, model.wv)
matrix_emb_dir = embedding_matrix(text_dir, model.wv)
matrix_emb_genres = embedding_matrix(text_genres, model.wv)
matrix_emb_company = embedding_matrix(text_company, model.wv)

# join movieId with the overview, title and genres embeddings.
# The three blocks are stacked with numpy first so that the feature columns get
# unique consecutive labels; concatenating three DataFrames side by side would
# repeat the same labels three times.
# NOTE(review): matrix_emb_dir and matrix_emb_company are computed but are not
# part of matrix_emb — confirm this is intended.
features = np.hstack([matrix_emb_over, matrix_emb_title, matrix_emb_genres])
matrix_emb = pd.concat([movies_metadata[['movieId']].reset_index(drop=True),
                        pd.DataFrame(features)], axis=1)

In [None]:
matrix_emb.shape

(6835, 301)

In [None]:
matrix_emb.head(4)

Unnamed: 0,movieId,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,0.1,1.1,2.1,3.1,4.1,5.1,6.1,7.1,8.1,9.1,10.1,11.1,12.1,13.1,14.1,15.1,16.1,17.1,18.1,19.1,20.1,21.1,22.1,23.1,24.1,25.1,26.1,27.1,28.1,29.1,30.1,31.1,32.1,33.1,34.1,35.1,36.1,37.1,38.1,39.1,40.1,41.1,42.1,43.1,44.1,45.1,46.1,47.1,48.1,49.1,50.1,51.1,52.1,53.1,54.1,55.1,56.1,57.1,58.1,59.1,60.1,61.1,62.1,63.1,64.1,65.1,66.1,67.1,68.1,69.1,70.1,71.1,72.1,73.1,74.1,75.1,76.1,77.1,78.1,79.1,80.1,81.1,82.1,83.1,84.1,85.1,86.1,87.1,88.1,89.1,90.1,91.1,92.1,93.1,94.1,95.1,96.1,97.1,98.1,99.1,0.2,1.2,2.2,3.2,4.2,5.2,6.2,7.2,8.2,9.2,10.2,11.2,12.2,13.2,14.2,15.2,16.2,17.2,18.2,19.2,20.2,21.2,22.2,23.2,24.2,25.2,26.2,27.2,28.2,29.2,30.2,31.2,32.2,33.2,34.2,35.2,36.2,37.2,38.2,39.2,40.2,41.2,42.2,43.2,44.2,45.2,46.2,47.2,48.2,49.2,50.2,51.2,52.2,53.2,54.2,55.2,56.2,57.2,58.2,59.2,60.2,61.2,62.2,63.2,64.2,65.2,66.2,67.2,68.2,69.2,70.2,71.2,72.2,73.2,74.2,75.2,76.2,77.2,78.2,79.2,80.2,81.2,82.2,83.2,84.2,85.2,86.2,87.2,88.2,89.2,90.2,91.2,92.2,93.2,94.2,95.2,96.2,97.2,98.2,99.2
0,862,0.495806,1.448255,0.891447,-0.287562,-0.406398,-1.275137,-0.362007,1.161994,-0.403363,-0.079192,-0.419418,-0.609252,-0.084679,-0.299261,-0.014496,-0.274076,1.485301,-0.380626,0.35677,-1.860791,0.680544,1.337196,0.136532,-0.687902,-0.449632,0.494139,-1.582103,-0.201983,0.082339,0.350844,0.366524,0.36322,0.126465,-1.239939,0.093559,1.892473,-0.353858,-0.94166,-0.442541,-2.148511,0.093138,-0.909997,0.749275,0.73551,-0.193813,-1.187146,-0.691451,-0.925867,-0.655156,0.40344,-0.083754,-0.891207,-0.395296,-1.040046,-0.16124,-0.594956,-0.409569,-0.316077,-1.089585,0.505115,0.077344,-0.002151,0.009679,1.125123,-0.081597,0.864145,0.709746,-0.430825,-1.555257,0.013394,-2.220542,-0.524707,0.550566,-0.209737,1.13554,-0.020349,-0.451137,-0.061292,-0.062551,0.293149,-0.482371,1.143046,-0.597678,1.474816,-0.835758,0.400068,0.773842,1.595554,0.859841,0.108987,0.560503,-0.93442,0.812958,-0.476148,0.897804,-0.933257,-0.883437,0.083045,-0.126825,0.489892,0.010273,0.817648,0.634139,0.064504,-0.139594,-1.374062,0.19167,1.151687,-0.186589,-0.471425,-0.087771,-1.131396,-0.106744,-0.003237,0.401782,-0.438085,0.408385,-1.165699,0.4575,-1.060441,0.624667,0.688522,0.64195,-0.27972,-0.245806,0.365756,-0.892511,-0.160167,-0.457349,0.196876,0.340872,0.40838,-0.094786,-0.472401,-0.118263,0.888826,-0.254706,-0.638613,-0.689231,-1.192017,0.102563,-0.546408,0.138354,0.137194,0.170622,-0.456468,-0.654755,-0.071106,0.15252,0.608446,0.198214,-0.485435,-0.08813,-0.41319,-0.45308,0.329939,0.223227,0.28752,-0.346825,-0.083561,-0.129311,0.34639,-0.202848,0.105761,-0.601579,0.487874,-0.243105,0.179237,-0.892699,0.495319,-0.863223,0.446231,0.806208,-0.367043,0.736964,0.318612,0.058537,-0.27103,-0.659621,0.402388,-0.674195,0.251463,-0.322102,1.177188,-0.211923,0.072287,0.019434,0.903923,0.99113,0.475097,0.76171,-0.253769,0.050682,-0.018734,0.784636,0.114753,0.303759,-0.534547,0.138582,0.190755,0.00658,0.050562,0.042159,0.002902,-0.006681,-0.082381,0.00052,0.070367,-0.009425,-0.021532,-0.013027,-0.053693,
0.000774,-0.007648,0.019815,-0.026922,0.019735,-0.066364,0.028586,-0.065547,0.035299,0.045478,0.040643,-0.027544,-0.017282,0.010032,-0.046986,0.003749,-0.019555,0.017475,0.019257,0.026409,-0.009899,-0.023559,-0.008478,0.06459,-0.014694,-0.042117,-0.040501,-0.060161,0.00042,-0.022482,0.003914,0.023426,0.015593,-0.023917,-0.032871,-0.017032,-0.00267,0.033919,0.02197,-0.027753,-0.015201,-0.029768,-0.034677,0.00785,0.00321,0.023363,-0.016003,0.003117,0.004139,0.01602,-0.024963,0.005069,-0.036666,0.037161,-0.011343,0.007207,-0.063944,0.032758,-0.065748,0.01684,0.05482,-0.010451,0.043978,0.021965,0.004482,-0.009569,-0.028647,0.030098,-0.03869,0.019087,-0.017193,0.080019,-0.01532,0.009922,-0.006024,0.047654,0.058172,0.032042,0.043557,-0.020263,0.005618,0.001167,0.046387,0.005253,0.004573,-0.03701,0.013595,0.018958
1,8844,-0.397078,1.018687,0.421171,0.370865,-0.321981,-1.651462,0.351163,1.77978,-0.170813,-0.962608,-0.103198,-1.730743,0.264531,0.335708,0.126408,-0.713016,0.396403,-1.411154,0.226196,-0.94825,0.486397,0.393592,0.871127,-0.233695,-0.438025,0.46718,-1.345135,-0.230717,-0.91692,0.124284,0.047088,0.0151,0.010721,-0.250371,-0.209399,0.961506,0.093159,-0.767663,-0.675481,-1.203307,-0.178985,-0.444842,-0.089397,-0.314265,0.449891,-0.264172,-0.651874,0.28401,0.262969,0.781978,0.148049,-0.305723,-0.172441,-0.794107,-0.755511,0.306092,0.65347,0.116148,-0.685608,0.028084,-0.204768,0.468061,-0.167325,0.043729,-0.705691,0.408482,-0.222136,0.349235,-1.010589,0.764775,-0.950322,0.446663,1.10832,-0.741156,1.111223,0.303402,-0.007641,-0.354665,-0.902534,0.366648,-0.708714,0.188589,0.097446,1.334727,-0.238,-0.102143,0.096789,1.00195,0.798981,0.718699,1.154938,0.329313,0.30252,0.349432,1.315677,0.478088,0.495919,-0.562408,-0.16682,0.323755,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.006774,0.341107,0.245614,0.03256,-0.05539,-0.535898,0.067674,0.439353,-0.095824,-0.181125,-0.04154,-0.432236,-0.045736,-0.002815,0.166083,-0.18359,0.161901,-0.466457,0.174504,-0.411152,0.251831,0.284175,0.257736,-0.100288,-0.086513,0.150012,-0.341959,-0.049884,-0.174268,0.082588,0.149077,0.166288,-0.019666,-0.185902,-0.043903,0.347622,-0.107558,-0.244161,-0.273271,-0.452345,0.035667,-0.196916,0.046795,0.073504,0.084056,-0.169736,-0.263698,-0.039837,0.050923,0.24061,0.086148,-0.182331,-0.047751,-0.167853,-0.173641,0.130291,0.088243,0.11897,-0.149479,-0.02832,-0.048348,0.126405,-0.087078,0.06078,-0.23713,0.188412,-0.096911,0.073451,-0.344705,0.19359,-0.
34929,0.174748,0.303299,-0.123108,0.288137,0.117767,0.035661,-0.1069,-0.242971,0.156374,-0.274107,0.102294,-0.139407,0.46813,-0.081213,0.026636,0.017636,0.339455,0.392892,0.180363,0.291666,-0.096068,0.012384,-0.012453,0.312412,0.049888,0.111094,-0.219513,0.067427,0.091513
2,949,0.052602,0.968764,0.743834,0.042009,-0.156976,-1.599201,0.164784,1.273759,-0.248381,-0.522725,-0.082148,-1.270497,-0.180233,-0.030896,0.562302,-0.498499,0.473651,-1.396618,0.538185,-1.285656,0.785951,0.81736,0.748729,-0.343553,-0.27311,0.416989,-0.985768,-0.156565,-0.487289,0.23259,0.398597,0.521634,-0.087663,-0.545863,-0.166739,1.073672,-0.350832,-0.765168,-0.841111,-1.40817,0.140489,-0.620907,0.177218,0.200787,0.218749,-0.567838,-0.764355,-0.12405,0.211226,0.708363,0.257911,-0.562109,-0.093471,-0.462171,-0.52269,0.388474,0.199649,0.377662,-0.382506,-0.088534,-0.116424,0.423464,-0.295968,0.141997,-0.670809,0.589364,-0.307548,0.235345,-1.019459,0.602707,-1.076743,0.525208,0.957878,-0.335939,0.853379,0.350432,0.12536,-0.309252,-0.694995,0.490283,-0.777541,0.314931,-0.477347,1.395665,-0.23496,0.078482,0.030937,1.000527,1.225457,0.510584,0.913515,-0.323787,0.03981,-0.051025,0.916512,0.093187,0.342238,-0.668847,0.200821,0.217365,0.004265,0.038662,0.028493,-0.004112,-0.01345,-0.054563,0.00236,0.053096,-0.011929,-0.025299,-0.000697,-0.049986,0.002722,0.002374,0.01813,-0.009175,0.014973,-0.049028,0.006259,-0.031576,0.019791,0.02957,0.034859,-0.012872,-0.006816,0.02266,-0.037387,-0.014634,-0.021872,0.016141,0.013832,0.003835,0.006664,-0.020331,-0.004987,0.028455,-0.001662,-0.021276,-0.017918,-0.048245,0.010092,-0.01627,0.010682,0.009595,0.017037,-0.016787,-0.022372,0.000179,0.001341,0.030404,0.013417,-0.021803,0.001932,-0.010464,-0.0255,0.009666,0.010655,0.004062,-0.022652,-0.005362,-0.009351,0.010838,-0.014435,0.012327,-0.026571,0.024444,0.001403,0.015391,-0.025815,0.022149,-0.027369,0.011909,0.02372,-0.014814,0.031327,0.011829,0.006775,-0.013792,-0.027293,0.022214,-0.022762,0.001877,-0.016081,0.041598,-0.006773,0.001293,-0.000709,0.034238,0.045284,0.01775,0.031219,-0.005937,0.010788,-0.008323,0.036019,0.0061,0.009258,-0.023653,0.013241,0.017825,-0.001551,0.534304,0.388279,0.050776,-0.083054,-0.883135,0.11248,0.756176,-0.118708,-0.304898,-0.053018,-0.745069,-0.05106,
0.009642,0.259213,-0.292205,0.24429,-0.760748,0.273652,-0.670209,0.409129,0.428624,0.412669,-0.182751,-0.150672,0.239883,-0.563275,-0.101661,-0.301517,0.115625,0.20817,0.260287,-0.071704,-0.285412,-0.095011,0.580045,-0.149067,-0.428409,-0.437553,-0.761063,0.044037,-0.347468,0.077408,0.061079,0.116266,-0.2842,-0.413884,-0.030242,0.09656,0.39448,0.131649,-0.314363,-0.048931,-0.284353,-0.29597,0.191338,0.152734,0.178917,-0.23709,-0.038112,-0.077388,0.215318,-0.144779,0.070211,-0.391047,0.311623,-0.164217,0.119711,-0.560935,0.337735,-0.545906,0.288476,0.518064,-0.22671,0.483788,0.206937,0.057336,-0.179178,-0.410915,0.271379,-0.431525,0.158758,-0.212641,0.751673,-0.117806,0.026751,0.030345,0.568091,0.642228,0.306425,0.508689,-0.127931,0.04617,-0.004904,0.52217,0.09665,0.220256,-0.350484,0.080222,0.134068
3,710,0.075551,0.779607,0.618968,0.045524,-0.157972,-1.259439,0.111243,1.0447,-0.277452,-0.395732,-0.046745,-1.009941,-0.145521,0.026453,0.452022,-0.355185,0.421031,-1.125742,0.443081,-1.043978,0.65724,0.65263,0.588185,-0.321513,-0.267382,0.32177,-0.796791,-0.103363,-0.331893,0.201794,0.33753,0.414367,-0.067887,-0.438597,-0.093576,0.933517,-0.294185,-0.632659,-0.696421,-1.128894,0.130495,-0.53293,0.156709,0.224977,0.192376,-0.493313,-0.605969,-0.157612,0.108113,0.60264,0.223683,-0.441843,-0.090391,-0.353454,-0.421664,0.305501,0.149738,0.313404,-0.312986,-0.111835,-0.079602,0.327647,-0.26,0.148091,-0.555075,0.489147,-0.235575,0.197561,-0.862412,0.498866,-0.895979,0.397607,0.761343,-0.229341,0.705892,0.283312,0.116512,-0.25234,-0.537283,0.407064,-0.670925,0.262728,-0.419731,1.168258,-0.211764,0.089424,0.01869,0.772493,0.989538,0.413322,0.733047,-0.317057,0.013438,-0.066758,0.707746,-0.005829,0.219767,-0.528112,0.179033,0.19755,0.036258,0.800348,0.594923,0.071239,-0.137877,-1.254762,0.138908,1.028132,-0.212156,-0.40293,-0.05976,-1.03174,-0.133851,-0.016982,0.405678,-0.364027,0.385716,-1.094357,0.406612,-0.991286,0.62512,0.678915,0.595587,-0.278156,-0.225804,0.342471,-0.793851,-0.141393,-0.380992,0.185554,0.337617,0.43714,-0.104947,-0.468374,-0.08714,0.897366,-0.283872,-0.585137,-0.679931,-1.10061,0.130992,-0.502917,0.160687,0.177803,0.170123,-0.443258,-0.592432,-0.09681,0.099318,0.578273,0.228115,-0.46621,-0.065927,-0.369724,-0.424194,0.314777,0.151192,0.317631,-0.314424,-0.079545,-0.079303,0.311984,-0.21169,0.142015,-0.533502,0.463465,-0.224407,0.188362,-0.865422,0.462183,-0.849565,0.398565,0.76589,-0.287836,0.697805,0.298541,0.079765,-0.254545,-0.57275,0.390819,-0.637588,0.237141,-0.357396,1.10404,-0.202079,0.10837,0.034934,0.780403,0.932541,0.428033,0.711553,-0.295861,0.019829,-0.040021,0.696443,0.040584,0.234404,-0.507362,0.145973,0.176103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**4 Напишем аналог функции train_test_split для пользователя и его предпочтений**


Подаем на вход функции юзера и получаем два списка фильмов: на тренировочном будем учить, а фильмы из тестового набора искать в предсказаниях

In [None]:
def train_test_split_films(dataframe, user_list, size=0.7, random_state=None, min_ratings=10):

  '''
     Split the rated films of every user into a train and a test part.

     input:
     - dataframe - ratings frame with 'userId', 'movieId' and 'rating' columns
     - user_list - iterable of user ids to process
     - size - share of a user's ratings that goes to the train part
     - random_state - seed (or random state) passed to DataFrame.sample;
       None keeps the shuffle non-reproducible
     - min_ratings - users with fewer ratings than this are skipped

     output: one dictionary for all processed users, where
     key - user id
     list 1 - ids of the train films
     list 2 - ratings of the train films
     list 3 - ids of the test films
     list 4 - ratings of the test films
  '''

  # materialize once: the ids are needed for filtering and for iteration
  user_list = list(user_list)

  # group the ratings of the requested users once instead of re-scanning the
  # whole frame for every user
  requested = dataframe.loc[dataframe['userId'].isin(user_list),
                            ['userId', 'movieId', 'rating']]
  ratings_by_user = dict(iter(requested.groupby('userId')))

  user_tts_data_dict = {}

  # iterate in the order of user_list so the dictionary keeps that order
  for user in user_list:

    user_ratings = ratings_by_user.get(user)

    # skip users that are absent or have too few ratings
    if user_ratings is None or user_ratings.shape[0] < min_ratings:
      continue

    # shuffle all ratings of the user
    shuffled = user_ratings.sample(frac=1, random_state=random_state)

    # length of the train part for this user
    train_len = int(shuffled.shape[0] * size)
    train_part = shuffled.iloc[:train_len]
    test_part = shuffled.iloc[train_len:]

    user_tts_data_dict[user] = [train_part['movieId'].tolist(),
                                train_part['rating'].tolist(),
                                test_part['movieId'].tolist(),
                                test_part['rating'].tolist()]

  return user_tts_data_dict

In [None]:
# пользователей слишком много, мы (в силу отсутствия мощностей) не сможем 
# построить предсказания по всем, но можно выбрать сэмпл
len(ratings['userId'].unique())

258025

In [None]:
# вторым параметром на вход можно подать любой список id пользователей
splitted_data = train_test_split_films(ratings, ratings['userId'].unique()[:1000], 0.7)

Мы получили словарь, в котором для каждого пользователя, который посмотрел как минимум 10 фильмов, есть тренировочный набор фильмов и оценки к ним + тестовый набор фильмов и оценки к ним

**5 Для каждого пользователя по train-фильмам находим средний вектор предпочтений**

In [None]:
def get_avg_weightened_vec(dict_name, movies_emb_df):

  '''
     Compute a rating-weighted mean embedding for every user.

     input:
     - dict_name - dictionary produced by the train/test split: for every
       user id a list whose element 0 holds the train film ids and element 1
       the matching ratings
     - movies_emb_df - frame with a 'movieId' column followed by the
       embedding columns of each film

     For every user the embeddings of the train films are multiplied by the
     user's ratings, summed and divided by the sum of those ratings.

     output:
     - dictionary: user id -> weighted mean vector (list of floats)
  '''

  user_vec_data_dict = {}

  for user_id, split in dict_name.items():

    # ratings of the train films of this user as a two-column frame
    rated = pd.DataFrame({'movieId': split[0], 'rating': split[1]})

    # inner join keeps exactly the films that have an embedding
    joined = movies_emb_df.merge(rated, on='movieId')

    weights = joined['rating']
    vectors = joined.drop(columns=['movieId', 'rating'])

    # weighted sum of the embedding rows, normalized by the total weight
    weighted_sum = vectors.mul(weights, axis=0).sum()
    user_vec_data_dict[user_id] = list(weighted_sum.mul(1 / weights.sum()))

  return user_vec_data_dict

In [None]:
# получим матрицу пользователей
res_vec_dict = get_avg_weightened_vec(splitted_data, matrix_emb)

In [None]:
# преобразуем к табличному виду
res_vec_df = pd.DataFrame.from_dict(res_vec_dict, orient='index').reset_index().rename(columns={'index':'Id'})

In [None]:
res_vec_df.head(4)

Unnamed: 0,Id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
0,1,-0.018368,0.23834,0.16097,-0.048378,-0.029982,-0.525233,-0.017152,0.290651,-0.040276,-0.131653,-0.011333,-0.340068,-0.078016,-0.090914,0.247114,-0.082978,0.095844,-0.433282,0.118622,-0.471143,0.228818,0.215416,0.290279,-0.065565,-0.076284,0.167153,-0.266771,-0.024868,-0.125681,0.041959,0.069214,0.207843,0.073805,-0.052684,-0.101087,0.274677,-0.133384,-0.252315,-0.272264,-0.443344,-0.004165,-0.174596,-0.026027,0.098561,0.043315,-0.237325,-0.248838,-0.047629,0.133941,0.151715,0.037596,-0.161846,-0.005012,-0.18268,-0.096765,0.092055,-0.006946,0.10429,-0.026857,0.012767,-0.009293,0.106715,-0.10437,-0.005028,-0.17041,0.201428,-0.109255,0.129799,-0.228212,0.233809,-0.338057,0.151667,0.274829,-0.008511,0.208413,0.082469,0.026558,-0.148393,-0.176963,0.140897,-0.180698,0.137688,-0.173312,0.367784,-0.007557,-0.033438,0.00875,0.267498,0.469679,0.083572,0.362368,-0.042343,0.064512,-0.000737,0.318336,0.054719,0.143027,-0.201326,0.111604,0.033874,-0.000222,0.224015,0.165439,0.01725,-0.035996,-0.367892,0.043193,0.309592,-0.057039,-0.127707,-0.022408,-0.304493,-0.026002,-0.002403,0.103887,-0.121574,0.102605,-0.323664,0.11378,-0.289435,0.165107,0.184499,0.176453,-0.071737,-0.061592,0.098309,-0.240653,-0.042627,-0.120824,0.053324,0.08622,0.10442,-0.019012,-0.122805,-0.037583,0.241852,-0.058859,-0.171493,-0.18463,-0.320249,0.021461,-0.144045,0.028968,0.035646,0.049943,-0.120794,-0.175937,-0.016425,0.040301,0.160879,0.055661,-0.133455,-0.026928,-0.12036,-0.128157,0.083705,0.06065,0.081572,-0.099538,-0.015363,-0.031497,0.097406,-0.060932,0.034006,-0.156772,0.132635,-0.062687,0.055493,-0.232643,0.136869,-0.236223,0.118554,0.216362,-0.094638,0.20146,0.085325,0.017303,-0.074208,-0.171397,0.111225,-0.177739,0.069762,-0.082575,0.31937,-0.050524,0.014621,0.01243,0.241501,0.267245,0.126528,0.215602,-0.056507,0.02075,-0.000355,0.221984,0.030653,0.08544,-0.1497,0.031286,0.054642,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,0.003233,0.278149,0.201571,0.023384,-0.046091,-0.449282,0.050347,0.377777,-0.073783,-0.15365,-0.026814,-0.379469,-0.036733,0.000533,0.131921,-0.142637,0.134974,-0.38587,0.144293,-0.354773,0.211509,0.225647,0.212996,-0.08752,-0.08123,0.115008,-0.296731,-0.047063,-0.140017,0.063594,0.108397,0.123986,-0.016446,-0.155984,-0.042347,0.295925,-0.080504,-0.210464,-0.229369,-0.393429,0.02974,-0.174815,0.04528,0.05395,0.065985,-0.156092,-0.214526,-0.033614,0.051409,0.200325,0.070125,-0.161066,-0.031244,-0.147683,-0.152762,0.09539,0.076561,0.095246,-0.112723,-0.019074,-0.034949,0.112374,-0.068913,0.048632,-0.195848,0.165636,-0.071916,0.066964,-0.290156,0.167207,-0.292578,0.142097,0.26189,-0.103637,0.250596,0.100029,0.018817,-0.090127,-0.206111,0.136703,-0.227882,0.088795,-0.112458,0.390648,-0.063952,0.023994,0.011369,0.287859,0.325221,0.146937,0.25841,-0.081312,0.020375,-0.010265,0.268018,0.032597,0.100687,-0.181878,0.046976,0.069489,0.000333,0.169745,0.123698,0.011849,-0.027705,-0.271111,0.033564,0.23459,-0.042545,-0.096476,-0.016521,-0.232017,-0.018915,0.000143,0.076148,-0.092164,0.07997,-0.237893,0.088124,-0.213044,0.126919,0.135967,0.132348,-0.052355,-0.048573,0.073992,-0.179181,-0.029963,-0.087181,0.039439,0.062392,0.074613,-0.010621,-0.093508,-0.025729,0.180424,-0.041559,-0.128755,-0.138466,-0.236366,0.015514,-0.105475,0.024241,0.027657,0.041464,-0.092188,-0.130106,-0.018696,0.027475,0.121505,0.043455,-0.098504,-0.018751,-0.088712,-0.094447,0.0625,0.046022,0.058268,-0.071106,-0.014206,-0.026468,0.068513,-0.043085,0.027253,-0.120094,0.101595,-0.046259,0.040447,-0.176912,0.100308,-0.17837,0.089121,0.159967,-0.069276,0.151929,0.062107,0.012144,-0.056889,-0.126059,0.081962,-0.137685,0.048134,-0.063507,0.237282,-0.041574,0.013739,0.00679,0.176599,0.196132,0.092265,0.156104,-0.046255,0.012865,-0.00632,0.165082,0.023336,0.063637,-0.10882,0.026879,0.037185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7,0.006512,0.301361,0.241182,-0.016396,-0.072161,-0.568357,0.043702,0.415183,-0.079553,-0.199472,-0.002652,-0.425897,-0.025197,-0.042384,0.197116,-0.13486,0.138232,-0.473426,0.166649,-0.459879,0.236069,0.254644,0.273226,-0.107462,-0.099144,0.167435,-0.331835,-0.051875,-0.148175,0.075985,0.111844,0.187139,0.004137,-0.125947,-0.064231,0.356838,-0.102495,-0.26644,-0.284204,-0.482745,0.012139,-0.207426,0.000732,0.072954,0.049194,-0.218586,-0.255711,-0.041032,0.069771,0.212372,0.054534,-0.186806,-0.041585,-0.17572,-0.155949,0.116013,0.034873,0.104271,-0.086307,-0.008326,-0.04031,0.13083,-0.080789,0.020357,-0.244018,0.211283,-0.126077,0.080639,-0.305806,0.207519,-0.330052,0.202278,0.307919,-0.075169,0.276273,0.1325,0.026549,-0.151321,-0.241256,0.171722,-0.212432,0.12208,-0.139448,0.462786,-0.078795,0.013498,0.00156,0.333096,0.43643,0.156728,0.34809,-0.070299,0.036684,0.001122,0.315597,0.048084,0.137521,-0.227986,0.074426,0.047029,0.009535,0.314189,0.23516,0.018238,-0.051962,-0.50192,0.051189,0.417276,-0.085759,-0.16729,-0.024442,-0.421365,-0.045109,0.001806,0.159248,-0.159708,0.150455,-0.437867,0.16654,-0.407732,0.238909,0.257545,0.238627,-0.10864,-0.095821,0.132887,-0.323062,-0.048789,-0.154117,0.07184,0.122163,0.148977,-0.022163,-0.177416,-0.045812,0.336424,-0.089792,-0.24038,-0.25819,-0.439406,0.040496,-0.196353,0.053985,0.064078,0.07078,-0.174474,-0.244552,-0.042344,0.052798,0.222371,0.086908,-0.183702,-0.033268,-0.157436,-0.168582,0.114798,0.075239,0.112376,-0.130992,-0.021898,-0.038526,0.125514,-0.089405,0.058158,-0.221173,0.1851,-0.082762,0.075002,-0.331757,0.190051,-0.336827,0.161083,0.298844,-0.117914,0.273118,0.113206,0.023809,-0.103427,-0.230852,0.151566,-0.250754,0.096185,-0.130398,0.439967,-0.076786,0.021539,0.015894,0.321341,0.374868,0.17717,0.290495,-0.10246,0.018523,-0.01679,0.300805,0.038364,0.109123,-0.205819,0.054601,0.071405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8,0.017075,0.47547,0.358757,0.006549,-0.092142,-0.82155,0.06717,0.645229,-0.127282,-0.279902,-0.025076,-0.646657,-0.044441,-0.034797,0.265748,-0.223783,0.23598,-0.689201,0.252945,-0.665892,0.367122,0.397716,0.394002,-0.16104,-0.151269,0.232332,-0.515481,-0.079245,-0.225794,0.110684,0.172921,0.247165,-0.006404,-0.238358,-0.082668,0.536818,-0.151061,-0.384855,-0.410879,-0.712337,0.033503,-0.306185,0.044903,0.106632,0.086483,-0.303411,-0.380252,-0.066665,0.089913,0.329054,0.096709,-0.283764,-0.060938,-0.266738,-0.245137,0.16668,0.0892,0.154437,-0.170801,-0.014082,-0.06171,0.188466,-0.121646,0.072386,-0.354911,0.311848,-0.153954,0.113066,-0.488289,0.303244,-0.526141,0.263825,0.465995,-0.147794,0.436016,0.181611,0.036757,-0.189807,-0.357478,0.244602,-0.358578,0.175893,-0.20745,0.695432,-0.118342,0.030476,0.016368,0.506022,0.611095,0.248,0.489101,-0.131305,0.050947,-0.011622,0.473025,0.05294,0.181577,-0.316888,0.092745,0.095767,0.001602,0.239775,0.181696,0.020929,-0.037313,-0.385957,0.043705,0.327506,-0.064951,-0.133758,-0.018344,-0.325405,-0.033048,0.002284,0.112735,-0.127331,0.11244,-0.334676,0.126361,-0.302132,0.179327,0.198867,0.179791,-0.085308,-0.071488,0.102761,-0.253281,-0.03956,-0.118777,0.052903,0.091536,0.108314,-0.019182,-0.136574,-0.033405,0.253723,-0.065313,-0.185389,-0.196762,-0.336645,0.029859,-0.147779,0.04317,0.047885,0.05747,-0.12704,-0.184349,-0.031516,0.037868,0.174918,0.068791,-0.143285,-0.030695,-0.124147,-0.134211,0.083412,0.066173,0.085508,-0.101584,-0.014394,-0.033251,0.09851,-0.065761,0.046724,-0.171645,0.13649,-0.061038,0.051769,-0.255972,0.14411,-0.259229,0.119489,0.229172,-0.095365,0.212877,0.083742,0.019552,-0.07573,-0.178676,0.115958,-0.199009,0.072035,-0.095801,0.336766,-0.062065,0.018724,0.010605,0.245059,0.280817,0.135448,0.219106,-0.075035,0.013408,-0.013225,0.232121,0.027662,0.084248,-0.154222,0.040984,0.060438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Теперь у нас для каждого пользователя готов вектор его предпочтений, который можно сравнить с фильмами и порекомендовать наиболее подходящие

**6 Найдем топ близких фильмов, которые пользователь еще не смотрел**

Мы будем использовать косинусную близость (cosine similarity) между вектором предпочтений пользователя и векторами фильмов. Косинусная близость – это мера похожести двух векторов. Скалярное произведение векторов и косинус угла θ между ними связаны следующим соотношением: $a \cdot b = \|a\| \, \|b\| \cos\theta$. Имея два вектора A и B, получаем косинусную близость – $\cos\theta = \frac{A \cdot B}{\|A\| \, \|B\|}$; косинусное расстояние при этом равно $1 - \cos\theta$

In [None]:
def get_closest_films(usr_avg_vec_df, user_id, films_emb_mtrx, user_data_dict, top_n=15):

  '''
     Recommend the unseen films that are closest (by cosine similarity)
     to a user's average preference vector.

     Input:
     - usr_avg_vec_df - DataFrame with an 'Id' column plus the components of
       each user's average train-preference vector
     - user_id - id of the user we want recommendations for
     - films_emb_mtrx - DataFrame with a 'movieId' column plus the components
       of each film embedding
     - user_data_dict - dict keyed by user id with the train/test lists of
       every user; list 0 holds the films already seen in train
     - top_n - number of films to recommend

     Output:
     - nothing is returned; the ids of the top_n most similar unseen films
       (most similar first) are stored as list 4 of user_data_dict[user_id]

     Raises:
     - ValueError if the user's vector length does not match the embedding
       width (this also covers a user_id missing from usr_avg_vec_df)
  '''

  # average preference vector of the requested user
  user_vec = np.asarray(
      usr_avg_vec_df[usr_avg_vec_df['Id'] == user_id].drop('Id', axis=1).values,
      dtype=float,
  ).flatten()

  # pull ids and embeddings out once instead of filtering the frame per film
  film_ids = films_emb_mtrx['movieId'].tolist()
  film_vecs = np.asarray(films_emb_mtrx.drop('movieId', axis=1).values, dtype=float)

  if film_vecs.ndim != 2 or user_vec.shape[0] != film_vecs.shape[1]:
    raise ValueError(
        'user vector length %d does not match film embedding shape %s'
        % (user_vec.shape[0], film_vecs.shape)
    )

  # cosine similarity of the user vector with every film in one shot;
  # zero-norm vectors give 0/0 = NaN, which is filtered out below
  norms = np.linalg.norm(film_vecs, axis=1) * np.linalg.norm(user_vec)
  with np.errstate(divide='ignore', invalid='ignore'):
    similarities = film_vecs.dot(user_vec) / norms
  is_valid = np.isfinite(similarities)

  # drop films the user has already seen in train; a seen film that has no
  # embedding row is simply ignored instead of raising KeyError
  viewed = set(user_data_dict[user_id][0])
  scores = {
      film: score
      for film, score, ok in zip(film_ids, similarities.tolist(), is_valid.tolist())
      if ok and film not in viewed
  }

  # top films in descending order of similarity to the user vector
  recommendations = heapq.nlargest(top_n, scores, key=scores.get)

  # the metrics read the recommendations from list 4 (the fifth list), so a
  # repeated call overwrites that slot instead of stacking a stale extra list
  user_lists = user_data_dict[user_id]
  if len(user_lists) > 4:
    user_lists[4] = recommendations
  else:
    user_lists.append(recommendations)

In [None]:
# extend the dictionary with predictions for every user
total_users = len(splitted_data)
for position, user in enumerate(splitted_data, start=1):
  print('Running:', position, '/', total_users)
  get_closest_films(res_vec_df, user, matrix_emb, splitted_data)

Running: 1 / 544


  dist = 1.0 - uv / np.sqrt(uu * vv)


Running: 2 / 544
Running: 3 / 544
Running: 4 / 544
Running: 5 / 544
Running: 6 / 544
Running: 7 / 544
Running: 8 / 544
Running: 9 / 544
Running: 10 / 544
Running: 11 / 544
Running: 12 / 544
Running: 13 / 544
Running: 14 / 544
Running: 15 / 544
Running: 16 / 544
Running: 17 / 544
Running: 18 / 544
Running: 19 / 544
Running: 20 / 544
Running: 21 / 544
Running: 22 / 544
Running: 23 / 544
Running: 24 / 544
Running: 25 / 544
Running: 26 / 544
Running: 27 / 544
Running: 28 / 544
Running: 29 / 544
Running: 30 / 544
Running: 31 / 544
Running: 32 / 544
Running: 33 / 544
Running: 34 / 544
Running: 35 / 544
Running: 36 / 544
Running: 37 / 544
Running: 38 / 544
Running: 39 / 544
Running: 40 / 544
Running: 41 / 544
Running: 42 / 544
Running: 43 / 544
Running: 44 / 544
Running: 45 / 544
Running: 46 / 544
Running: 47 / 544
Running: 48 / 544
Running: 49 / 544
Running: 50 / 544
Running: 51 / 544
Running: 52 / 544
Running: 53 / 544
Running: 54 / 544
Running: 55 / 544
Running: 56 / 544
Running: 57 / 544


Мы получили топ n предсказаний для каждого пользователя, теперь можно переходить к оценке результатов по основным метрикам

**7 Напишем функции для метрик и оценим алгоритм**

Precision можно интерпретировать как долю релевантных рекомендаций, мы дополнительно считаем среднее по всем пользователям

Precision = TP / (TP + FP), где TP - фильмы из тестовой выборки, совпавшие с рекомендациями, а TP + FP - это все рекомендованные объекты, то есть K - количество предсказанных объектов

In [None]:
def mean_precision(user_data_dict):

  '''
     Mean precision@k over all users.

     Input:
     - user_data_dict - dict with the per-user data
     (we need the list at index 2 - test films, and at index 4 - recommendations)

     k is the number of recommendations that were computed for the user
     (the length of the recommendation list)

     Output:
     mean precision@k over all users; a user with an empty recommendation
     list contributes a precision of 0
  '''

  sum_precision = 0

  # compute the metric for every user
  for user_lists in user_data_dict.values():
    recommended = user_lists[4]
    k = len(recommended)

    # no recommendations -> nothing relevant was recommended, and no 0 / 0
    if k == 0:
      continue

    TP = len(set(recommended).intersection(user_lists[2]))

    # accumulate the per-user precision (it was previously never added)
    sum_precision += TP / k

  # average over the users
  return sum_precision / len(user_data_dict)

Recall интерпретируется как доля релевантных объектов, попавших в рекомендации, мы дополнительно считаем среднее по всем пользователям

Recall = TP / (TP + FN), где TP - фильмы из тестовой выборки, совпавшие с рекомендациями, а TP + FN - это все объекты из тестовой выборки

In [None]:
def mean_recall(user_data_dict):

  '''
     Mean recall@k over all users.

     Input:
     - user_data_dict - dict with the per-user data
     (we need the list at index 2 - test films, and at index 4 - recommendations)

     k is the number of recommendations that were computed for the user
     (the length of the recommendation list)

     Output:
     mean recall@k over all users
  '''

  per_user_recall = []

  for user_lists in user_data_dict.values():
    test_films = user_lists[2]

    # recommended films that really occur in the user's test set
    hits = set(user_lists[4]).intersection(set(test_films))

    # recall = hits / size of the test list
    per_user_recall.append(len(hits) / len(test_films))

  # average over the users
  return sum(per_user_recall) / len(user_data_dict)

Mean Reciprocal Rank - средний обратный ранг первой рекомендации, попавшей в тестовые данные

In [None]:
def MRR(user_data_dict):

  '''
     Mean Reciprocal Rank over all users.

     RR - reciprocal rank: 1 / (1-based position of the first recommendation
     that occurs in the user's test films), or 0 when there is no hit

     Input:
     - user_data_dict - dict with the per-user data
     (we need the list at index 2 - test films, and at index 4 - recommendations)

     k is the number of recommendations that were computed for the user
     (the length of the recommendation list)

     Output:
     MRR@k over all users
  '''

  sum_rr = 0

  # read from the dict passed in, not from a notebook-level global
  for user_lists in user_data_dict.values():
    test_films = set(user_lists[2])

    # reciprocal rank of the first correct recommendation, 0 if none is found
    rr = 0
    for rank, film in enumerate(user_lists[4], start=1):
      if film in test_films:
        rr = 1 / rank
        break

    sum_rr += rr

  # average over the users
  return sum_rr / len(user_data_dict)

Average Precision дополнительно вознаграждает за то, что правильные рекомендации стоят ближе к началу списка: в отличие от Precision, здесь учитывается порядок выдаваемых рекомендаций, а точность считается отдельно на каждой позиции с попаданием. MAP - это среднее AP по всем пользователям

In [None]:
def mean_average_precision_at_k(user_data_dict):

  '''
     Mean Average Precision - average precision averaged over the users

     Input:
     - user_data_dict - dict with the per-user data
     (we need the list at index 2 - test films, and at index 4 - recommendations)

     k is the number of recommendations that were computed for the user
     (the length of the recommendation list)

     Output:
     MAP@k over all users
  '''

  total_ap = 0

  for user_lists in user_data_dict.values():
    test_films = user_lists[2]
    recommended = user_lists[4]

    hits = 0
    precision_sum = 0

    for rank, film in enumerate(recommended, start=1):

      # only recommendations that occur in the test set count as hits
      if film not in test_films:
        continue

      # a film repeated later in the list is counted only at its first position
      if film in recommended[:rank - 1]:
        continue

      hits += 1
      precision_sum += hits / rank

    # normalise by the largest number of hits that was achievable
    total_ap += precision_sum / min(len(test_films), len(recommended))

  # average over the users
  return total_ap / len(user_data_dict)

**Результат:**

In [None]:
print('Mean Precision: ', round(mean_precision(splitted_data), 6))
print('Mean Recall: ', round(mean_recall(splitted_data), 6))
print('Mean Reciprocal Rank:', round(MRR(splitted_data), 6))
print('Mean Average Precision at K:', round(mean_average_precision_at_k(splitted_data), 6))

Mean Precision:  0.0
Mean Recall:  0.002933
Mean Reciprocal Rank: 0.007349
Mean Average Precision at K: 0.000636
