## Задание к теме: «Гибридные рекомендательные системы»

### Никифоров Владимир

Что делать?
-  1.Датасет ml-latest
-  2.Вспомнить подходы, которые мы разбирали
-  3.Выбрать понравившийся подход к гибридным системам
-  4.Написать свою

### Постановка задачи

По 2-м наиболее интересным для пользователя жанров (из просмотренных последних 10 фильмов) опрашиваем экспертов (топ-100 по просмотру данных жанров) на предмет общих рекомендаций топ-20 фильмов для каждого жанра, которые дальше ранжируем по средней оценке всех пользователей, с выдачей 2-х лучших фильма на просмотр в ближайшие выходные))

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
import surprise as s
tqdm.pandas()

  from pandas import Panel


In [2]:
df_movies = pd.read_csv("../data/ml-latest-small/movies.csv")
df_ratings = pd.read_csv("../data/ml-latest-small/ratings.csv")

In [3]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
df_movies['genres_spaced'] = df_movies.progress_apply(lambda r: r['genres'].replace('|',' '), axis=1)
df_movies.head()

100%|██████████| 9742/9742 [00:00<00:00, 42345.54it/s]


Unnamed: 0,movieId,title,genres,genres_spaced
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy,Comedy


In [6]:
tfidf = TfidfVectorizer()

In [7]:
features = tfidf.fit_transform(df_movies['genres_spaced'])

In [8]:
columns = [(k, tfidf.vocabulary_[k]) for k in tfidf.vocabulary_]

In [9]:
columns = sorted(columns, key=lambda c: c[1])

In [10]:
columns = [c[0] for c in columns]

In [11]:
features = features.todense()

In [12]:
df_features = pd.DataFrame(features, columns=columns)

In [13]:
df_result = pd.concat((df_movies, df_features), axis=1).drop(['genres', 'genres_spaced'], axis=1)
columns.remove('genres')

In [14]:
df_result.columns

Index(['movieId', 'title', 'action', 'adventure', 'animation', 'children',
       'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'fi', 'film',
       'horror', 'imax', 'listed', 'musical', 'mystery', 'no', 'noir',
       'romance', 'sci', 'thriller', 'war', 'western'],
      dtype='object')

In [15]:
df_result.head()

Unnamed: 0,movieId,title,action,adventure,animation,children,comedy,crime,documentary,drama,...,listed,musical,mystery,no,noir,romance,sci,thriller,war,western
0,1,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
df_joined = df_ratings.merge(df_result, on='movieId')

In [17]:
user_id = 320

In [18]:
def get_last_films_genre(user_id, last_film_count = 10, top_genres_count = 2):
    ''' Get genres from the most recent films '''
    user_films = df_joined[df_joined['userId'] == user_id]
    user_films = user_films.sort_values('timestamp', ascending=False)
    last_ = user_films.head(last_film_count)
    
    # Count films for each genre from this last_film_count for this user_id
    last_ = last_[columns].replace(0,np.nan).count(axis=0).reset_index()
    # Get top (top_genres_count) genres by count
    genres = last_.sort_values(0, ascending=False).head(top_genres_count)['index'].values
    
    return list(genres)

In [19]:
genres = get_last_films_genre(user_id)
genres

['action', 'fi']

In [20]:
def get_expert_recommendations(user_id, genres, expert_value = 100, expert_film_list_count = 20):
    ''' For user_id ask expert who in TOP-100 of user_id who seen films of genres about list of recommendations not from seen by user_id '''
    # Find experts (exclude user_id) in these genres
    experts = df_joined[df_joined['userId']!=user_id]
    # Count films for each user
    experts = experts.replace(0,np.nan).groupby('userId').agg('count').reset_index()
    # Get TOP experts (expert_value)
    experts = experts.sort_values(genres, ascending=False).head(expert_value)['userId'].values
    # Find films seen by experts and not seen by user_id of these genres
    seen_films = df_joined[df_joined['userId'] == user_id]['movieId'].unique()
    not_seen_films_from_experts = df_joined[(df_joined['userId'].isin(experts))][~df_joined['movieId'].isin(seen_films)]
    # Filter not_seen_films_from_expert only with genres
    movieId_same_genres = df_result[df_result['movieId'].isin(not_seen_films_from_experts['movieId'].unique())][['movieId']+genres].replace(0,np.nan).dropna()['movieId'].values
    not_seen_films_from_experts = not_seen_films_from_experts[not_seen_films_from_experts['movieId'].isin(movieId_same_genres)]
    # Find list of recommendations from experts (from films seen by experts but not seen by user_id)
    df_for_surprise = not_seen_films_from_experts[['userId', 'movieId', 'rating']]
    reader = s.reader.Reader(rating_scale=(0.5, 5))
    dataset = s.dataset.Dataset.load_from_df(df_for_surprise, reader)
    dataset, _ = s.model_selection.train_test_split(dataset, test_size=0.01)
    algorithm = s.SVD()
    algorithm.fit(dataset)
    recommendations = pd.DataFrame(movieId_same_genres, columns=['movieId'])
    recommendations['Score'] = recommendations.apply(lambda r: algorithm.predict(user_id, r['movieId']).est, axis=1)
    recommendations = recommendations.sort_values('Score', ascending=False).head(expert_film_list_count)
    
    return recommendations

In [21]:
top_by_experts = get_expert_recommendations(user_id, genres)
top_by_experts

  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,movieId,Score
30,1196,4.414209
73,2571,4.351309
7,260,4.344188
278,79132,4.238984
32,1210,4.184925
17,541,4.163679
348,112852,4.14725
19,589,4.136406
31,1200,4.103443
354,122882,4.00017


In [22]:
def rating_for_user(user_id, top_by_experts, film_count = 2):
    ''' Get only (film_count) films with highest average rating '''
    movieIds = top_by_experts['movieId'].values
    df = df_joined[df_joined['movieId'].isin(movieIds)][['movieId', 'rating']]
    # Calc mean rating for recommended films by experts and get only top (film_count) films
    df = df.groupby('movieId').agg('mean').reset_index().sort_values('rating', ascending = False).head(film_count)
    # Add title to result
    df = df.merge(df_movies, on='movieId')[['movieId', 'title', 'rating']]
    
    return df

In [23]:
# Final list of films for weekend
rating_for_user(user_id, top_by_experts)

Unnamed: 0,movieId,title,rating
0,168252,Logan (2017),4.28
1,260,Star Wars: Episode IV - A New Hope (1977),4.231076
