1. Использовать dataset MovieLens
2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
    - TF-IDF на тегах и жанрах
    - Средние оценки (+ median, variance, etc.) пользователя и фильма
5. Оценить RMSE на тестовой выборке

In [1]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import KNeighborsRegressor

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-','').split('|'))

## Построим рекомендацию по жанрам

In [4]:

movie_genres = [change_string(g) for g in movies.genres.values]
movie_genres[:5]

['Adventure Animation Children Comedy Fantasy',
 'Adventure Children Fantasy',
 'Comedy Romance',
 'Comedy Drama Romance',
 'Comedy']

In [5]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

neig_reg = KNeighborsRegressor(n_neighbors=7, n_jobs=-1, metric='euclidean')
neig_reg.fit(X_train_counts, X_train_tfidf)

KNeighborsRegressor(metric='euclidean', n_jobs=-1, n_neighbors=7)

In [6]:
test = change_string('Adventure|Comedy|Fantasy|Crime')

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neig_reg.kneighbors(X_tfidf2, return_distance=True)

In [7]:
predicted_movies = neig_reg.kneighbors(X_tfidf2, return_distance=True)
predicted_movies

(array([[0.88433496, 0.88433496, 0.88433496, 0.88433496, 0.88433496,
         0.88433496, 0.88433496]]),
 array([[6957, 4137, 4152, 3638, 6955, 4076, 6912]], dtype=int64))

In [8]:
movies.iloc[predicted_movies[1][0]]

Unnamed: 0,movieId,title,genres
6957,65685,Inkheart (2008),Adventure|Fantasy
4137,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
4152,5974,"Thief of Bagdad, The (1940)",Adventure|Fantasy
3638,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
6955,65651,Fire and Ice (2008),Adventure|Fantasy
4076,5816,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy
6912,64167,Dinotopia (2002),Adventure|Fantasy


## Построим рекомендацию по тегам

In [9]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')

In [10]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0


In [11]:
movies_with_tags.tag.unique().shape

(1590,)

In [12]:
movies_with_tags.dropna(inplace=True)

In [13]:
movies_with_tags.title.unique().shape

(1572,)

In [14]:
tag_strings = []
movies2 = []

for movie, group in tqdm(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-','') for s in group.tag.values]))
    movies2.append(movie)

  0%|          | 0/1572 [00:00<?, ?it/s]

In [15]:
# Посмотрим теги с 'timetravel'
retrieved_elements = list(filter(lambda x: 'timetravel' in x, tag_strings)) 
uniq_retrieved_elements = set(retrieved_elements)
for i in uniq_retrieved_elements:  
    print(i)

apocalypse ArnoldSchwarzenegger nuclearwar scifi Suspense timetravel robots Scifimasterpiece
timetravel timetravel BradPitt BruceWillis mindfuck Postapocalyptic postapocalyptic remake timetravel twistending
future lackofdevelopment lackofstory quickcuts scifi SimonPegg space spacetravel timetravel
alternatereality scifi sciencefiction timetravel
atmospheric cultfilm dreamlike hallucinatory psychological surreal timetravel jakegyllenhaal timetravel dreamlike hallucinatory mentalillness psychology socialcommentary teen thoughtprovoking atmospheric cerebral dreamlike enigmatic hallucinatory mentalillness mindfuck philosophy psychological quirky surreal thoughtprovoking weird
blackhole scifi timetravel ChristopherNolan scifi timetravel baddialogue philosophicalissues thoughtprovoking visuallyappealing
timetravel
Action artificialintelligence robots SciFi specialeffects tense timetravel robots


возьмем теги 'scifi timetravel'

In [16]:
count_vect_tags = CountVectorizer()
X_train_counts_tags = count_vect_tags.fit_transform(tag_strings)

tfidf_tags_transformer = TfidfTransformer()
X_train_tags_tfidf = tfidf_tags_transformer.fit_transform(X_train_counts_tags)

neig_reg_tags = KNeighborsRegressor(n_neighbors=10, n_jobs=-1, metric='manhattan')
neig_reg_tags.fit(X_train_counts_tags, X_train_tags_tfidf)

KNeighborsRegressor(metric='manhattan', n_jobs=-1, n_neighbors=10)

In [17]:
test_tags = 'scifi timetravel'

predict_tags = count_vect_tags.transform([test_tags])
X_tfidf2_tags = tfidf_tags_transformer.transform(predict_tags)

res_tags = neig_reg_tags.kneighbors(X_tfidf2_tags, return_distance=True)
res_tags

(array([[0.95663926, 0.95663926, 0.95663926, 0.95663926, 0.95663926,
         0.95663926, 0.95663926, 1.04336074, 1.41354867, 1.41354867]]),
 array([[ 152,  107,  106,  151, 1077, 1318, 1415,  446,  661,  822]],
       dtype=int64))

In [18]:
movies_list2 = [movies2[i] for i in res_tags[1][0]]
tag_list2 = [tag_strings[i] for i in res_tags[1][0]]

pd.DataFrame({'movies':movies_list2, 'tags': tag_list2})

Unnamed: 0,movies,tags
0,Bill & Ted's Excellent Adventure (1989),timetravel
1,Back to the Future Part II (1989),timetravel
2,Back to the Future (1985),timetravel
3,Bill & Ted's Bogus Journey (1991),timetravel
4,Primer (2004),timetravel
5,Stargate (1994),timetravel
6,Time Bandits (1981),timetravel
7,Final Fantasy: The Spirits Within (2001),scifi
8,In a Lonely Place (1950),L.A.
9,Magnolia (1999),L.A.


## Построим рекомендации тегах и рейтингах фильмов

### Добавим рейтинги к movies_with_tags

In [19]:
# Получим средний рейтинг фильма
mean_ratings = ratings.groupby('movieId').mean().drop(['userId','timestamp'],axis=1)
mean_ratings.rename(columns = {'rating':'mean_ratings'}, inplace = True)

median_ratings = ratings.groupby('movieId').median().drop(['userId','timestamp'],axis=1)
median_ratings.rename(columns = {'rating':'med_ratings'}, inplace = True)
# добавим колонку со средним рейтингом фильма
movies_with_ratings = movies_with_tags.join(mean_ratings, on='movieId')
movies_with_ratings = movies_with_ratings.join(median_ratings, on='movieId')


In [20]:
movies_with_ratings.dropna(inplace=True)

In [21]:
# функция расчета среднего, округленного до сотых
def Average(lst):
    return round(sum(lst) / len(lst), 2)

In [22]:
mean_rating = []
median_rating = []
movies3 = []
tag_strings2 = []

for movie, group in tqdm(movies_with_ratings.groupby('title')):
    tag_strings2.append(' '.join([str(s).replace(' ', '').replace('-','') for s in group.tag.values]))
    mean_rating.append(Average([s for s in group.mean_ratings.values]))
    median_rating.append(Average([s for s in group.med_ratings.values]))
    movies3.append(movie)

  0%|          | 0/1554 [00:00<?, ?it/s]

In [47]:
# Обучим модель
count_vect_tags_rate = CountVectorizer()
X_train_3 = count_vect_tags_rate.fit_transform(tag_strings2)

tfidf_3 = TfidfTransformer()
Y_train_3 = tfidf_3.fit_transform(X_train_3)

neig_3 = KNeighborsRegressor(n_neighbors=10, n_jobs=-1, metric='manhattan')
neig_3.fit(X_train_3, Y_train_3)

KNeighborsRegressor(metric='manhattan', n_jobs=-1, n_neighbors=10)

In [24]:
# Предскажем фильмы по жанрам scifi|timetravel
test_3 = change_string('scifi|timetravel')

X_test_3 = count_vect_tags_rate.transform([test_3])
Y_test_3 = tfidf_3.transform(X_test_3)

neighbors_3 = neig_3.kneighbors(Y_test_3, return_distance=True)
neighbors_3

(array([[0.95654817, 0.95654817, 0.95654817, 0.95654817, 0.95654817,
         0.95654817, 0.95654817, 1.04345183, 1.41354587, 1.41354587]]),
 array([[ 151, 1065,  106,  107, 1398,  152, 1302,  441,  813,  654]],
       dtype=int64))

In [25]:
# Выведем подобранные фильмы в порядке убывания среднего рейтинга
movies3_list = [movies3[i] for i in neighbors_3[1][0]]
mean_ratings_list = [mean_rating[i] for i in neighbors_3[1][0]]
median_ratings_list = [median_rating[i] for i in neighbors_3[1][0]]
tag_list = [tag_strings2[i] for i in neighbors_3[1][0]]
pd.DataFrame({'movies':movies3_list, 'tags': tag_list, 'mean_ratings':mean_ratings_list,  
              'median_ratings':median_ratings_list}).sort_values(by=['mean_ratings'], ascending=False)

Unnamed: 0,movies,tags,mean_ratings,median_ratings
2,Back to the Future (1985),timetravel,4.04,4.0
1,Primer (2004),timetravel,3.79,4.0
4,Time Bandits (1981),timetravel,3.73,4.0
8,Magnolia (1999),L.A.,3.71,4.0
5,Bill & Ted's Excellent Adventure (1989),timetravel,3.7,3.5
3,Back to the Future Part II (1989),timetravel,3.51,3.5
9,In a Lonely Place (1950),L.A.,3.5,3.5
6,Stargate (1994),timetravel,3.38,3.0
7,Final Fantasy: The Spirits Within (2001),scifi,3.35,3.5
0,Bill & Ted's Bogus Journey (1991),timetravel,3.08,3.0


## Оценить RMSE на тестовой выборке

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [28]:
# разделим на train, test
X_train, X_test, y_train, y_test = train_test_split(movies_with_tags.drop(['title'], axis=1), movies_with_tags.title, test_size=0.33, random_state=42)
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

In [30]:
train_tag_strings = []
train_movies = []

for movie, group in tqdm(train_df.groupby('title')):
    train_tag_strings.append(' '.join([str(s).replace(' ', '').replace('-','') for s in group.tag.values]))
    train_movies.append(movie)

  0%|          | 0/1217 [00:00<?, ?it/s]

In [37]:
# обучим train
count_vect_tags = CountVectorizer()
X_train_count4 = count_vect_tags.fit_transform(train_tag_strings)

tfidf_transformer = TfidfTransformer()
X_train_tfidf4 = tfidf_transformer.fit_transform(X_train_count4)

neig4 = KNeighborsRegressor(n_neighbors=10, n_jobs=-1, metric='manhattan')
neig4.fit(X_train_count4, X_train_tfidf4)

KNeighborsRegressor(metric='manhattan', n_jobs=-1, n_neighbors=10)

In [32]:
test_tag_strings = []
test_movies = []

for movie, group in tqdm(test_df.groupby('title')):
    test_tag_strings.append(' '.join([str(s).replace(' ', '').replace('-','') for s in group.tag.values]))
    test_movies.append(movie)


  0%|          | 0/733 [00:00<?, ?it/s]

In [46]:
# предскажем test
X_test_count4 = count_vect_tags.fit_transform(test_tag_strings)
X_test_tfidf4 = tfidf_transformer.fit_transform(X_test_count4)

pr = neig4.predict(X_test_count4)
#neig4.score(pr, X_test_tfidf4)

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 684 while Y.shape[1] == 1168

In [42]:
len(train_tag_strings)

1217

In [35]:

mean_squared_error(X_test_tfidf4, pr)