In [1]:
from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsRegressor

import pandas as pd
import numpy as np

In [2]:
#links = pd.read_csv('links.csv')
movies = pd.read_csv('ml-latest/movies.csv')
ratings = pd.read_csv('ml-latest/ratings.csv')
#tags = pd.read_csv('tags.csv')

In [3]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [4]:
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0,4.0,1113766000.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,5.0,948885800.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14.0,4.5,1442169000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,4.0,1370810000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,22.0,4.0,1237623000.0


In [5]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [6]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [7]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

neigh = NearestNeighbors(n_neighbors=20, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(metric='euclidean', n_jobs=-1, n_neighbors=20)

In [8]:
title_genres = {}

for index, row in tqdm_notebook(movies.iterrows()):
    title_genres[row.title] = row.genres

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for index, row in tqdm_notebook(movies.iterrows()):


0it [00:00, ?it/s]

In [9]:
# Функция рекомендации на основе подобранных фильмов по жанру просмотренного фильма 
# и средних рейтингов этих фильмов, проставленных другими пользователями в порядке убывания

def recommend_for_user(user_id):
    current_user_id = user_id
    # получаем фильмы, которые смотрел юзер
    user_movies = movies_with_ratings[movies_with_ratings.userId == current_user_id].title.unique()
    
    last_user_movie = user_movies[-1]
    
    # получаем жанр последнего просмотренного юзером фильма
    movie_genres = title_genres[last_user_movie]
    movie_genres = change_string(movie_genres)

    predict = count_vect.transform([movie_genres])
    X_tfidf2 = tfidf_transformer.transform(predict)

    res = neigh.kneighbors(X_tfidf2, return_distance=True)
    
    # Получаем список рекомендованных фильмов
    movies_to_score = movies.iloc[res[1][0]].title.values
    
    movies_to_score = set(movies_to_score.tolist()) - set(user_movies.tolist())
            
    # Получаем рейтинги рекомендованных фильмов, проставленные другими пользователями
    rslt_df = movies_with_ratings[movies_with_ratings['title'].isin(movies_to_score)]
    # рассчитываем средный рейтинг
    rslt_df2 = rslt_df.groupby('title').mean().reset_index()
    # выводим средный рейтинг в порядке убывания
    print(rslt_df2[['title', 'rating']].sort_values('rating', ascending=False))

In [10]:
recommend_for_user(3.0)

                                                title    rating
0                                         Amal (2007)  3.916667
4                       Eve and the Fire Horse (2005)  3.875000
18                                     Wet Bum (2014)  3.750000
3   End of Summer, The (Early Autumn) (Kohayagawa-...  3.744444
6                                 Frozen River (2008)  3.651487
14                Silent Light (Stellet licht) (2007)  3.625000
12                                Multi-Facial (1995)  3.500000
15                                Summer Storm (1944)  3.500000
13             Peppermint Candy (Bakha satang) (1999)  3.473684
11                                  Love & Pop (1998)  3.466667
2                         Brideshead Revisited (2008)  3.416230
7                            Godless Girl, The (1929)  3.400000
10                  Lost in Beijing (Ping guo) (2007)  3.380952
1                                 Bottle Shock (2008)  3.366279
17                       Tracey Fragment