## Моя гибридная система:

1) Возьмем 10 последних фильмов, которые смотрел пользователь

2) Каждый фильм представим вектором в векторном пространстве жанров

3) К каждому из 10 фильмов найдем по 10 наиболее близких (knn)

4) Удалим уже просмотренные фильмы и дубликаты

5) На отобранных фильмах (до 100 штук) построим SVD

## Загружаем данные

In [35]:
import pandas as pd

In [36]:
import numpy as np

In [37]:
df_movies = pd.read_csv("../lecture-1/movies.csv")
df_ratings = pd.read_csv("../lecture-1/ratings.csv")

In [38]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [39]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## Строим рекомендацию на основе KNN

In [40]:
from tqdm import tqdm

In [41]:
tqdm.pandas()

In [42]:
df = df_movies

In [43]:
df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [44]:
df['genres_splitted'] = df.genres.str.split('|')

In [45]:
df['genres_spaced'] = df.progress_apply(lambda r: ' '.join(r['genres_splitted']), axis=1)

100%|███████████████████████████████████████████████████████████████████████████| 9742/9742 [00:00<00:00, 12692.68it/s]


In [46]:
df.head()

Unnamed: 0,movieId,title,genres,genres_splitted,genres_spaced
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[Adventure, Animation, Children, Comedy, Fantasy]",Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[Adventure, Children, Fantasy]",Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,"[Comedy, Romance]",Comedy Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[Comedy, Drama, Romance]",Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy,[Comedy],Comedy


In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [48]:
tfidf = TfidfVectorizer()

In [49]:
features = tfidf.fit_transform(df['genres_spaced'])

In [50]:
columns = [(k, tfidf.vocabulary_[k]) for k in tfidf.vocabulary_]

In [51]:
columns = sorted(columns, key=lambda c: c[1])

In [52]:
columns = [c[0] for c in columns]

In [53]:
features = features.todense()

In [54]:
df_features = pd.DataFrame(features, columns=columns)

In [55]:
df_result = pd.concat((df, df_features), axis=1)

In [56]:
df_result.columns

Index(['movieId', 'title', 'genres', 'genres_splitted', 'genres_spaced',
       'action', 'adventure', 'animation', 'children', 'comedy', 'crime',
       'documentary', 'drama', 'fantasy', 'fi', 'film', 'genres', 'horror',
       'imax', 'listed', 'musical', 'mystery', 'no', 'noir', 'romance', 'sci',
       'thriller', 'war', 'western'],
      dtype='object')

In [57]:
df_result

Unnamed: 0,movieId,title,genres,genres_splitted,genres_spaced,action,adventure,animation,children,comedy,...,listed,musical,mystery,no,noir,romance,sci,thriller,war,western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[Adventure, Animation, Children, Comedy, Fantasy]",Adventure Animation Children Comedy Fantasy,0.000000,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[Adventure, Children, Fantasy]",Adventure Children Fantasy,0.000000,0.512361,0.000000,0.620525,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,"[Comedy, Romance]",Comedy Romance,0.000000,0.000000,0.000000,0.000000,0.570915,...,0.0,0.0,0.000000,0.0,0.0,0.821009,0.000000,0.000000,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[Comedy, Drama, Romance]",Comedy Drama Romance,0.000000,0.000000,0.000000,0.000000,0.505015,...,0.0,0.0,0.000000,0.0,0.0,0.726241,0.000000,0.000000,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,[Comedy],Comedy,0.000000,0.000000,0.000000,0.000000,1.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
5,6,Heat (1995),Action|Crime|Thriller,"[Action, Crime, Thriller]",Action Crime Thriller,0.549328,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.542042,0.0,0.0
6,7,Sabrina (1995),Comedy|Romance,"[Comedy, Romance]",Comedy Romance,0.000000,0.000000,0.000000,0.000000,0.570915,...,0.0,0.0,0.000000,0.0,0.0,0.821009,0.000000,0.000000,0.0,0.0
7,8,Tom and Huck (1995),Adventure|Children,"[Adventure, Children]",Adventure Children,0.000000,0.636699,0.000000,0.771112,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
8,9,Sudden Death (1995),Action,[Action],Action,1.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
9,10,GoldenEye (1995),Action|Adventure|Thriller,"[Action, Adventure, Thriller]",Action Adventure Thriller,0.553065,0.629522,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.545730,0.0,0.0


In [58]:
df_result = df_result[[
       'movieId', 'title', 'action', 'adventure', 'animation',
       'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'fi',
       'film', 'genres', 'horror', 'imax', 'listed', 'musical', 'mystery',
       'no', 'noir', 'romance', 'sci', 'thriller', 'war', 'western'   
]]

In [59]:
df_result.head()

Unnamed: 0,movieId,title,action,adventure,animation,children,comedy,crime,documentary,drama,...,listed,musical,mystery,no,noir,romance,sci,thriller,war,western
0,1,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
len(df_result)

9742

In [61]:
from sklearn.neighbors import NearestNeighbors

In [62]:
nn = NearestNeighbors(n_neighbors=10)

In [63]:
nn.fit(df_result[['action', 'adventure', 'animation',
       'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'fi',
       'film', 'horror', 'imax', 'listed', 'musical', 'mystery',
       'no', 'noir', 'romance', 'sci', 'thriller', 'war', 'western']])

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                 radius=1.0)

In [64]:
df_result

Unnamed: 0,movieId,title,action,adventure,animation,children,comedy,crime,documentary,drama,...,listed,musical,mystery,no,noir,romance,sci,thriller,war,western
0,1,Toy Story (1995),0.000000,0.416846,0.516225,0.504845,0.267586,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
1,2,Jumanji (1995),0.000000,0.512361,0.000000,0.620525,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
2,3,Grumpier Old Men (1995),0.000000,0.000000,0.000000,0.000000,0.570915,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.821009,0.000000,0.000000,0.0,0.0
3,4,Waiting to Exhale (1995),0.000000,0.000000,0.000000,0.000000,0.505015,0.000000,0.0,0.466405,...,0.0,0.0,0.000000,0.0,0.0,0.726241,0.000000,0.000000,0.0,0.0
4,5,Father of the Bride Part II (1995),0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
5,6,Heat (1995),0.549328,0.000000,0.000000,0.000000,0.000000,0.635947,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.542042,0.0,0.0
6,7,Sabrina (1995),0.000000,0.000000,0.000000,0.000000,0.570915,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.821009,0.000000,0.000000,0.0,0.0
7,8,Tom and Huck (1995),0.000000,0.636699,0.000000,0.771112,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
8,9,Sudden Death (1995),1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
9,10,GoldenEye (1995),0.553065,0.629522,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.545730,0.0,0.0


In [65]:
df_joined = df_ratings.merge(df_result, on='movieId')

In [66]:
user_id = 320

In [116]:
def get_last_ten_films(user_id):
    user_films = df_joined[df_joined['userId'] == user_id]
    user_films = user_films.sort_values('timestamp', ascending=False)
    last_10 = user_films.head(10)
    
    film_names = last_10['title'].values
    film_vectors = last_10[['action', 'adventure', 'animation',
       'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'fi',
       'film', 'horror', 'imax', 'listed', 'musical', 'mystery',
       'no', 'noir', 'romance', 'sci', 'thriller', 'war', 'western']].values
    
    return film_names, film_vectors

In [92]:
def get_user_recomendation_by_knn(intereseted_films):
    res = []
    for film in intereseted_films:
        _, films = nn.kneighbors(film.reshape(1, -1))
        for v in df_movies.iloc[films[0]]['movieId'].values:
            res.append(v)
    return res
    

In [137]:
def del_seen_films(films, user_id):
    res = []
    user_films = df_joined[df_joined['userId'] == user_id]['movieId'].values
    for film in films:
        if not(film in user_films or film in res):
            res.append(film)
    return res

In [117]:
names, vectors = get_last_ten_films(user_id)

In [119]:
films_for_user = get_user_recomendation_by_knn(vectors)

In [120]:
len(films_for_user)

100

In [138]:
new_list = del_seen_films(films_for_user, user_id)

In [139]:
len(new_list)

75

## Делаем SVD

In [99]:
import surprise as s

In [100]:
df_for_surprise = df_ratings[['userId', 'movieId', 'rating']]

In [101]:
df_for_surprise.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [102]:
reader = s.reader.Reader(rating_scale=(0.5, 5))

In [103]:
dataset = s.dataset.Dataset.load_from_df(df_for_surprise, reader)

In [104]:
dataset, _ = s.model_selection.train_test_split(dataset, test_size=0.01)

In [105]:
algorithm = s.SVD()

In [106]:
algorithm.fit(dataset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x15e8113c550>

## Итоговая рекомендация

In [140]:
def recomend(user_id):
    names, vectors = get_last_seven_films(user_id)
    knn_recomendations = get_user_recomendation_by_knn(vectors)
    knn_recomendations = del_seen_films(knn_recomendations, user_id)
    knn_recomendations = pd.DataFrame(knn_recomendations, columns=['movieId'])
    knn_recomendations['Score'] = knn_recomendations.apply(lambda r: algorithm.predict(user_id, r['movieId']).est, axis=1)
    knn_recomendations = knn_recomendations.sort_values('Score', ascending=False)
    knn_recomendations = knn_recomendations.merge(df_movies, on='movieId')[['movieId', 'title', 'Score']]
    
    print("Last 10 films was: ")
    print(names)
    
    print("Recomendations")
    print(knn_recomendations)

In [141]:
recomend(user_id)

Last 10 films was: 
['Avatar (2009)' 'Gladiator (2000)' 'Star Trek (2009)' 'Iron Man (2008)'
 'Cowboy Bebop: The Movie (Cowboy Bebop: Tengoku no Tobira) (2001)'
 'Appleseed (Appurushîdo) (2004)' 'District 13 (Banlieue 13) (2004)']
Recomendations
    movieId                                              title     Score
0      3793                                       X-Men (2000)  3.969810
1      8636                                Spider-Man 2 (2004)  3.926241
2     27156  Neon Genesis Evangelion: The End of Evangelion...  3.902097
3    102445                     Star Trek Into Darkness (2013)  3.826627
4    110102         Captain America: The Winter Soldier (2014)  3.820609
5    167370                            Assassin's Creed (2016)  3.776688
6     70533  Evangelion: 1.0 You Are (Not) Alone (Evangerio...  3.759139
7     37830          Final Fantasy VII: Advent Children (2004)  3.721924
8    136449                      Ghost in the Shell 2.0 (2008)  3.706242
9      6721   Once Upon 

In [144]:
recomend(555)

Last 10 films was: 
['Body Parts (1991)' 'Blues Brothers 2000 (1998)' 'Bodyguard, The (1992)'
 'Blue Lagoon, The (1980)'
 'Bloodsport 2 (a.k.a. Bloodsport II: The Next Kumite) (1996)'
 'Blair Witch Project, The (1999)' 'Blazing Saddles (1974)']
Recomendations
    movieId                                              title     Score
0     44555  Lives of Others, The (Das leben der Anderen) (...  4.203231
1       891  Halloween: The Curse of Michael Myers (Hallowe...  4.196254
2      2847                      Only Angels Have Wings (1939)  3.988626
3     93006                      Very Potter Musical, A (2009)  3.890290
4      1337                          Body Snatcher, The (1945)  3.838510
5      6927                            Human Stain, The (2003)  3.823934
6      3499                                      Misery (1990)  3.821936
7      3546            What Ever Happened to Baby Jane? (1962)  3.775121
8      6973                              Final Analysis (1992)  3.734349
9      496