In [70]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [71]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
moviess = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [72]:
movies_join_ratings = ratings.join(movies.set_index('movieId'), on='movieId')
# movies_join_ratings

In [73]:
sum_rating_by_movies = movies_join_ratings.groupby(['movieId'])[['rating']].sum().sort_values('movieId')
sum_rating_by_movies = sum_rating_by_movies.reset_index()
# sum_rating_by_movies

In [74]:
count_rating_by_movies = movies_join_ratings.groupby(['movieId'])[['rating']].count().sort_values('movieId')
count_rating_by_movies = count_rating_by_movies.reset_index()
# count_rating_by_movies

In [75]:
avg_rating_by_movies = movies_join_ratings.groupby(['movieId'])[['rating']].mean().sort_values('movieId')
avg_rating_by_movies = avg_rating_by_movies.reset_index()
# avg_rating_by_movies

In [76]:
# добавляем rating, count, mean
moviess['sum'] = sum_rating_by_movies['rating']
moviess['count'] = count_rating_by_movies['rating']
moviess['mean'] = avg_rating_by_movies['rating']

moviess

Unnamed: 0,movieId,title,genres,sum,count,mean
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,843.0,215.0,3.920930
1,2,Jumanji (1995),Adventure|Children|Fantasy,377.5,110.0,3.431818
2,3,Grumpier Old Men (1995),Comedy|Romance,169.5,52.0,3.259615
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,16.5,7.0,2.357143
4,5,Father of the Bride Part II (1995),Comedy,150.5,49.0,3.071429
5,6,Heat (1995),Action|Crime|Thriller,402.5,102.0,3.946078
6,7,Sabrina (1995),Comedy|Romance,172.0,54.0,3.185185
7,8,Tom and Huck (1995),Adventure|Children,23.0,8.0,2.875000
8,9,Sudden Death (1995),Action,50.0,16.0,3.125000
9,10,GoldenEye (1995),Action|Adventure|Thriller,461.5,132.0,3.496212


In [77]:
moviess_join_ratings = ratings.join(moviess.set_index('movieId'), on='movieId')
# moviess_join_ratings

In [78]:
# выбираем фильмы для пользователя 1
moviess_users1 = moviess_join_ratings[moviess_join_ratings['userId']==1]
moviess_users1

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,sum,count,mean
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,843.0,215.0,3.920930
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,169.5,52.0,3.259615
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,402.5,102.0,3.946078
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,807.0,203.0,3.975369
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,864.5,204.0,4.237745
5,1,70,3.0,964982400,From Dusk Till Dawn (1996),Action|Comedy|Horror|Thriller,193.0,55.0,3.509091
6,1,101,5.0,964980868,Bottle Rocket (1996),Adventure|Comedy|Crime|Romance,87.0,23.0,3.782609
7,1,110,4.0,964982176,Braveheart (1995),Action|Drama|War,955.5,237.0,4.031646
8,1,151,5.0,964984041,Rob Roy (1995),Action|Drama|Romance|War,156.0,44.0,3.545455
9,1,157,5.0,964984100,Canadian Bacon (1995),Comedy|War,31.5,11.0,2.863636


In [79]:
del moviess_users1['userId']
del moviess_users1['timestamp']

In [80]:
moviess_users1

Unnamed: 0,movieId,rating,title,genres,sum,count,mean
0,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,843.0,215.0,3.920930
1,3,4.0,Grumpier Old Men (1995),Comedy|Romance,169.5,52.0,3.259615
2,6,4.0,Heat (1995),Action|Crime|Thriller,402.5,102.0,3.946078
3,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,807.0,203.0,3.975369
4,50,5.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,864.5,204.0,4.237745
5,70,3.0,From Dusk Till Dawn (1996),Action|Comedy|Horror|Thriller,193.0,55.0,3.509091
6,101,5.0,Bottle Rocket (1996),Adventure|Comedy|Crime|Romance,87.0,23.0,3.782609
7,110,4.0,Braveheart (1995),Action|Drama|War,955.5,237.0,4.031646
8,151,5.0,Rob Roy (1995),Action|Drama|Romance|War,156.0,44.0,3.545455
9,157,5.0,Canadian Bacon (1995),Comedy|War,31.5,11.0,2.863636


In [81]:
moviess_users1_join_tags = moviess_users1.join(tags.set_index('movieId'), on='movieId')

In [82]:
# moviess_users1_join_tags
moviess_users1_join_tags.tag.unique().shape

(414,)

In [83]:
tag_strings = []

for movie, group in moviess_users1_join_tags.sort_values('movieId').groupby('movieId'):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))

In [84]:
tag_strings

['pixar pixar fun',
 'moldy old',
 'nan',
 'mystery twistending serialkiller',
 'heist twistending tricky thriller suspense mindfuck',
 'nan',
 'crime offbeatcomedy quirky',
 'Scotland revenge Oscar(BestCinematography) melgibson swordfight inspirational historical epic beautifulscenery Medieval',
 'nan',
 'nan',
 'nan',
 'school AdamSandler stoplookingatmeswan',
 'generationX quirky witty hilarious cynical independentfilm',
 'nan',
 'moviebusiness',
 'StarWars ROBOTSANDANDROIDS space classicscifi scifi StarWars darthvader lukeskywalker spaceopera scifi spaceadventure oldiebutgoodie spaceepic action classicscifi Nerd classic EPIC engrossingadventure classicscifi greatsoundtrack EPIC scifi spaceaction classic spaceaction',
 "nonlinear nonlineartimeline nonlinear philosophical neonoir multiplestorylines nonlinearnarrative noir nonlinearstoryline originalplot notablesoundtrack offensive organisedcrime organizedcrime original multiplestories outoforder Palmed'Or parody nonlineartimeline mot

In [85]:
tfidf_vectorizer_tags = TfidfVectorizer()
X_train_tfidf_tags = tfidf_vectorizer_tags.fit_transform(tag_strings)

In [86]:
# векторы тегов
tfidfs_on_tag = X_train_tfidf_tags.toarray()
tfidfs_on_tag

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [87]:
tfidfs_on_tag.shape[1]

395

даже если брать теги только к фильмам первого юзера получилось 395, а это всё равно больше,чем фильмов первого юзера (их 232)