# Домашнее задание к теме «Гибридные рекомендательные системы»

1. Датасет ml-latest
1. Вспомнить подходы, которые мы разбирали
1. Выбрать понравившийся подход к гибридным системам
1. Написать свою рекомендательную систему

### Решение

In [1]:
from surprise import SVD, SVDpp
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

import pandas as pd
import numpy as np

In [12]:
# The three .dat tables use '::' as the field separator and have no header
# row, so column names are supplied explicitly. A multi-character separator
# is handled by pandas' python parser engine, hence engine='python'.

# Users table.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table(
    '../3.ColloborativeFiltering/users.dat',
    sep='::',
    header=None,
    names=unames,
    engine='python',
)

# Ratings table.
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(
    '../3.ColloborativeFiltering/ratings.dat',
    sep='::',
    header=None,
    names=rnames,
    engine='python',
)

# Movies table; this file is read as ISO-8859-1 rather than the default
# encoding.
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(
    '../3.ColloborativeFiltering/movies.dat',
    sep='::',
    header=None,
    names=mnames,
    engine='python',
    encoding='ISO-8859-1',
)

In [13]:
# Attach user attributes to every rating, then attach movie metadata.
# The join keys are spelled out; they are the only columns the frames share,
# so the result is the same as the implicit default join.
data = ratings.merge(users, on='user_id').merge(movies, on='movie_id')

In [14]:
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000209 entries, 0 to 1000208
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   user_id     1000209 non-null  int64 
 1   movie_id    1000209 non-null  int64 
 2   rating      1000209 non-null  int64 
 3   timestamp   1000209 non-null  int64 
 4   gender      1000209 non-null  object
 5   age         1000209 non-null  int64 
 6   occupation  1000209 non-null  int64 
 7   zip         1000209 non-null  object
 8   title       1000209 non-null  object
 9   genres      1000209 non-null  object
dtypes: int64(6), object(4)
memory usage: 83.9+ MB


In [16]:
# Three-column (user, item, rating) frame used to build the surprise dataset.
# Movie titles, not movie_id, serve as the raw item ids.
dataset = data[['user_id', 'title', 'rating']].rename(
    columns={'user_id': 'uid', 'title': 'iid'}
)

In [17]:
data.rating.max()

5

In [18]:
data.rating.min()

1

In [19]:
# Rating bounds match the observed minimum (1) and maximum (5) of data.rating.
reader = Reader(rating_scale=(1, 5.0))
# Wrap the (uid, iid, rating) frame as a surprise Dataset.
df = Dataset.load_from_df(dataset, reader)

In [20]:
# Hold out 15% of the ratings for evaluation; the seed is fixed so the split
# is the same on every run.
trainset, testset = train_test_split(df, test_size=.15, random_state=42)

In [21]:
%%time
# SVD matrix-factorisation model with 25 latent factors, trained for 30 epochs.
# NOTE(review): no random_state is passed here (unlike the train/test split),
# so repeated runs may produce slightly different factors and RMSE — confirm
# whether reproducibility matters.
algo = SVD(n_factors=25, n_epochs=30)
algo.fit(trainset)

Wall time: 36.7 s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11bfcd26460>

In [22]:
test_pred = algo.test(testset)

In [23]:
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8643


0.8642816590294677

In [24]:
algo.predict(uid=2.0, iid='Mortal Kombat (1995)').est

2.551187844920484

In [25]:
# Predict a rating for every movie the chosen user has not rated yet.
current_user_id = 2.0
user_movies = data[data.user_id == current_user_id].title.unique()

# Build the lookup once: testing membership against the NumPy array would
# rescan the whole array for every candidate title.
seen_titles = set(user_movies)

# titles[i] is the movie whose predicted rating is scores[i].
titles = [t for t in data.title.unique() if t not in seen_titles]
scores = [algo.predict(uid=current_user_id, iid=t).est for t in titles]

In [26]:
sorted(scores)[-10:]

[4.399203615375802,
 4.401657139626693,
 4.420772760281024,
 4.4264061729882345,
 4.548477019888388,
 4.554691871614094,
 4.583089070497742,
 4.618336194844336,
 4.665555052858654,
 4.736376295122313]

In [27]:
def change_string(s):
    """Turn a '|'-separated genre string into space-separated tokens.

    Spaces and hyphens inside each genre are removed so that every genre
    becomes a single token (e.g. "Sci-Fi" -> "SciFi").
    """
    genres = s.split('|')
    tokens = [g.replace(' ', '').replace('-', '') for g in genres]
    return ' '.join(tokens)

In [28]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [29]:
movie_genres[0]

"Animation Children's Comedy"

In [30]:
# Token counts over the space-separated genre strings.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

# Re-weight the raw counts with TF-IDF.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Index the TF-IDF vectors; each query returns the 25 closest movies by
# Euclidean distance. Row i of the matrix corresponds to row i of `movies`,
# because movie_genres was built from movies.genres.values in order.
neigh = NearestNeighbors(n_neighbors=25, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(metric='euclidean', n_jobs=-1, n_neighbors=25)

In [31]:
# Ad-hoc query: normalise a genre string the same way as the training data.
test = change_string("Adventure|Comedy|Fantasy|Crime")

# Reuse the already fitted vectorizer and TF-IDF transformer (transform only,
# no refit).
predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

# With return_distance=True the result is a (distances, indices) pair.
res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [32]:
res

(array([[0.48797024, 0.6151026 , 0.71151155, 0.71151155, 0.7378178 ,
         0.7378178 , 0.7378178 , 0.7378178 , 0.7378178 , 0.7419569 ,
         0.7419569 , 0.75200801, 0.75200801, 0.75200801, 0.75200801,
         0.75200801, 0.75200801, 0.7758186 , 0.7758186 , 0.78182187,
         0.78182187, 0.78182187, 0.78182187, 0.78182187, 0.78182187]]),
 array([[ 363, 3420, 1779, 1058,  647, 2124, 3819, 3084, 3807, 3410, 2074,
         3397, 2728, 2553, 3324, 2105, 1110, 2104, 2899, 1936,   55, 1898,
          124,    1, 1974]], dtype=int64))

In [33]:
movies.iloc[res[1][0]]

Unnamed: 0,movie_id,title,genres
363,367,"Mask, The (1994)",Comedy|Crime|Fantasy
3420,3489,Hook (1991),Adventure|Fantasy
1779,1848,"Borrowers, The (1997)",Adventure|Children's|Comedy|Fantasy
1058,1073,Willy Wonka and the Chocolate Factory (1971),Adventure|Children's|Comedy|Fantasy
647,653,Dragonheart (1996),Action|Adventure|Fantasy
2124,2193,Willow (1988),Action|Adventure|Fantasy
3819,3889,Highlander: Endgame (2000),Action|Adventure|Fantasy
3084,3153,"7th Voyage of Sinbad, The (1958)",Action|Adventure|Fantasy
3807,3877,Supergirl (1984),Action|Adventure|Fantasy
3410,3479,Ladyhawke (1985),Adventure|Fantasy|Romance


In [34]:
# Order all rating rows chronologically, in place, so that per-user slices of
# `data` come out oldest-first.
data.sort_values('timestamp', inplace=True)

In [35]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [37]:
# Map each movie title to its raw '|'-separated genre string.
# Built directly from the two columns: this avoids the slow row-by-row
# iterrows() loop and the deprecated tqdm_notebook wrapper, and yields the
# same mapping (for a repeated title the last row wins, as before).
title_genres = dict(zip(movies.title, movies.genres))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for index, row in tqdm_notebook(movies.iterrows()):


0it [00:00, ?it/s]

In [38]:
def recommend_for_user(user_id, top_n=10):
    """Print up to ``top_n`` movie recommendations for ``user_id``.

    Hybrid approach: the genres of the user's most recently rated movie are
    used to fetch content-based candidates from the fitted nearest-neighbour
    index (``neigh``); the fitted rating model (``algo``) then ranks those
    candidates by predicted rating. Movies the user has already rated are
    excluded.

    Parameters
    ----------
    user_id
        Raw user id as stored in ``data.user_id``.
    top_n
        Maximum number of titles to print (default 10, the previous fixed
        value).

    Returns
    -------
    None
        Results are printed as "title score" lines, best first. If the user
        has no ratings in ``data``, a notice is printed instead.
    """
    user_rows = data[data.user_id == user_id]
    if user_rows.empty:
        # Without any rated movie there is no "last movie" to seed the
        # content-based step.
        print('No ratings found for user', user_id)
        return

    # Order this user's ratings by time here rather than relying on `data`
    # having been sorted elsewhere. mergesort is stable, so rows that are
    # already in chronological order keep their relative order.
    user_movies = (
        user_rows.sort_values('timestamp', kind='mergesort').title.unique()
    )
    last_user_movie = user_movies[-1]

    # Set lookup instead of scanning the array for every candidate.
    seen_titles = set(user_movies)

    # Vectorise the seed movie's genres with the already fitted transformers.
    query = change_string(title_genres[last_user_movie])
    query_tfidf = tfidf_transformer.transform(count_vect.transform([query]))

    # Only the neighbour row positions are needed, not the distances.
    neighbour_rows = neigh.kneighbors(query_tfidf, return_distance=False)[0]
    candidates = movies.iloc[neighbour_rows].title.values

    titles = [t for t in candidates if t not in seen_titles]
    scores = [algo.predict(uid=user_id, iid=t).est for t in titles]

    # Highest predicted rating first; max() guards against a negative top_n
    # turning the slice into "all but the last n".
    best_first = np.argsort(scores)[::-1][:max(top_n, 0)]
    for i in best_first:
        print(titles[i], scores[i])

In [39]:
data[data.user_id == 2.0].sort_values('rating')

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
153545,2,21,1,978299839,M,56,16,70072,Get Shorty (1995),Action|Comedy|Drama
180378,2,3893,1,978299535,M,56,16,70072,Nurse Betty (2000),Comedy|Thriller
126099,2,2427,2,978299913,M,56,16,70072,"Thin Red Line, The (1998)",Action|Drama|War
94412,2,3256,2,978299839,M,56,16,70072,Patriot Games (1992),Action|Thriller
175479,2,1968,2,978298881,M,56,16,70072,"Breakfast Club, The (1985)",Comedy|Drama
...,...,...,...,...,...,...,...,...,...,...
128353,2,1196,5,978298730,M,56,16,70072,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
60199,2,1357,5,978298709,M,56,16,70072,Shine (1996),Drama|Romance
180937,2,1247,5,978298652,M,56,16,70072,"Graduate, The (1967)",Drama|Romance
45034,2,1962,5,978298813,M,56,16,70072,Driving Miss Daisy (1989),Drama


In [40]:
recommend_for_user(2.0)

Star Trek: First Contact (1996) 3.7252869714190915
Abyss, The (1989) 3.678894092387741
Superman (1978) 3.629626591076966
Stargate (1994) 3.538806573818172
Star Trek: Generations (1994) 3.2363339450854185
Star Trek: The Motion Picture (1979) 3.222980545217195
Logan's Run (1976) 3.175641781324331
Rocketeer, The (1991) 3.073183774333324
Star Trek V: The Final Frontier (1989) 3.071189362286536
Six-String Samurai (1998) 3.0612418813274247
