In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [4]:
movies.head(7640)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [5]:
ratings.head(1250)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [6]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [7]:
len(movies[pd.isna(movies['movieId'])==1])

0

In [8]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))
movie_genres = [change_string(g) for g in movies.genres.values]

In [9]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [10]:
genres_dict = {}
for ind in tqdm_notebook(range(0,len(movies['movieId']))):
    genres_dict[movies['movieId'][ind]] = tfidf_transformer.transform(count_vect.transform([change_string(movies.genres[ind])]))

HBox(children=(IntProgress(value=0, max=9742), HTML(value='')))




In [11]:
tags_doc = {}
for uniq in tags.movieId.unique():
    tags_doc[uniq] = ''.join(' '.join(tags[tags['movieId'] == uniq]['tag']).lower())

In [12]:
tags_count = CountVectorizer()
X_train_tags_vec = tags_count.fit_transform(tags_doc.values())
tfidf_tags = TfidfTransformer()
X_tags_train_tfidf = tfidf_tags.fit_transform(X_train_tags_vec)

In [13]:
from sklearn.neighbors import NearestNeighbors
from surprise import KNNWithMeans, KNNBasic
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from scipy.spatial import distance as ds

In [14]:
neigh = NearestNeighbors(n_neighbors=1000, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)
tag_neigh = NearestNeighbors(n_neighbors=1000, n_jobs=-1, metric='euclidean')
tag_neigh.fit(X_tags_train_tfidf)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=-1, n_neighbors=1000, p=2, radius=1.0)

In [15]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [16]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)
trainset, testset = train_test_split(data, test_size=.15)
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1c9766cb3c8>

In [35]:
def hybr_rec_sys(user,film):
    #Мой вариант заключается в том, что бы найти ближайшие фильмы по жанрам, затем из них по тегам, 
    #сделать предсказание их оценки с помощью user-based, вывести 9 лучших и 1 рандомный(размывание реккомендаций)
    
    #
    #Находим ближайшие по жанрам к примеру 1000 фильмов
    #

    near_gnrs = neigh.kneighbors(tfidf_transformer.transform(
        count_vect.transform([change_string(movies[movies['title']==film].iloc[0,2])])
    ))[1][0]
    new_near_gnrs = []
    for movid in near_gnrs:
        if (movid!=movies[movies['title']==film].iloc[0,0]):
            new_near_gnrs.append(movid)   
            
    #
    #Теперь ищем ближайших по тегам
    #
    
    if (movies[movies['title']==film].iloc[0,0] in tags_doc.keys()):
        print('TRUE')
        near_tgs = tag_neigh.kneighbors(tfidf_tags.transform(
            tags_count.transform([tags_doc.get(movies[movies['title']==film].iloc[0,0])])))[1][0]
        #оставляем 50 "лучших фильмов" в обоих метриках. Если столько не набирается, берем лучшие по жанрам, 
        #так как тегов по этому фильму может вообще не быть
        dist = []
        for gnrs in tqdm_notebook(new_near_gnrs):
            ind = 0
            while ((near_tgs[ind]!=gnrs)and(ind<999)):
                ind+=1
            if (ind != 999):
                dist.append(ind)
        if (len(dist)<50):
            for ind in range(0,1000):
                if (dist.count(ind)==0):
                    dist.append(ind)
                if (len(dist)==50):
                    break;
        else:
            dist = dist[:50]
    else: 
        dist = np.arange(50)
        
        
    marks = []
    for ind in tqdm_notebook(dist):
        marks.append(algo.predict(uid = user,iid = movies.iloc[near_gnrs[ind],1]).est)
        
    marks = [i for i in np.argsort(marks)[::-1]]
    marks = marks[:9]
    rec_ind = []
    for i in marks:
        rec_ind.append(dist[i])
    ind = np.random.choice(movies['movieId'])
    while ((rec_ind.count(ind)!=0)and(ind==movies[movies['title']==film].iloc[0,0])):
        ind = np.random.choice(movies['movieId'])
    rec_ind.append(ind)
    rec_mov = {}
    for ind in rec_ind:
        rec_mov[ind] = movies[movies['movieId']==ind][movies.columns[1:]]
    return rec_mov

In [38]:
hybr_rec_sys(1,'Kung Fu Panda 2 (2011)')

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

{31:                      title genres
 30  Dangerous Minds (1995)  Drama, 28:                 title         genres
 27  Persuasion (1995)  Drama|Romance, 30:                                                 title       genres
 29  Shanghai Triad (Yao a yao yao dao waipo qiao) ...  Crime|Drama, 38:                   title           genres
 34  It Takes Two (1995)  Children|Comedy, 11:                              title                genres
 10  American President, The (1995)  Comedy|Drama|Romance, 1:               title                                       genres
 0  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy, 18:                 title  genres
 17  Four Rooms (1995)  Comedy, 25:                        title         genres
 24  Leaving Las Vegas (1995)  Drama|Romance, 37: Empty DataFrame
 Columns: [title, genres]
 Index: [], 3920:                                                   title  \
 2924  Faraway, So Close (In weiter Ferne, so nah!) (...   
 
                 

In [29]:
movies[movies['title']=='Kung Fu Panda 2 (2011)'].iloc[0,2]

'Action|Adventure|Animation|Children|Comedy|IMAX'