1. Использовать dataset MovieLens
2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
    - TF-IDF на тегах и жанрах
    - Средние оценки (+ median, variance, etc.) пользователя и фильма
5. Оценить RMSE на тестовой выборке

In [200]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import KNeighborsRegressor

In [201]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [202]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-','').split('|'))

## Построим рекомендацию по жанрам

In [203]:
# найдем дублирующиеся строки с фильмами
dup = movies[movies.title.duplicated()==True]['title'].to_list()
movies[movies.title.isin(dup)].sort_values(by='title')

Unnamed: 0,movieId,title,genres
4169,6003,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Thriller
9106,144606,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Romance|Thriller
650,838,Emma (1996),Comedy|Drama|Romance
5601,26958,Emma (1996),Romance
5854,32600,Eros (2004),Drama
9135,147002,Eros (2004),Drama|Romance
2141,2851,Saturn 3 (1980),Adventure|Sci-Fi|Thriller
9468,168358,Saturn 3 (1980),Sci-Fi|Thriller
5931,34048,War of the Worlds (2005),Action|Adventure|Sci-Fi|Thriller
6932,64997,War of the Worlds (2005),Action|Sci-Fi


In [204]:
# удалим менее информативные строки с жанрами для дублирующихся фильмов
to_dtop = [4169, 5601, 5854, 9468, 6932]
movies.drop(to_dtop, inplace=True)

In [205]:
# добавим тэги
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')

movies_with_tags = pd.DataFrame(list(zip(movies2,tag_strings)), columns=['title', 'tag'])
movies_with_genres_tags = movies.join(movies_with_tags.set_index('title'), on='title')
movies_with_genres_tags['genres'] = movies_with_genres_tags['genres'].apply(change_string)
movies_with_genres_tags.head()

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,
1,2,Jumanji (1995),Adventure Children Fantasy,sofiacoppola 1970s suicide
2,3,Grumpier Old Men (1995),Comedy Romance,Shakespearesortof
3,4,Waiting to Exhale (1995),Comedy Drama Romance,
4,5,Father of the Bride Part II (1995),Comedy,hallucinatory mentalillness mindfuck paranoid ...


In [206]:
# Получим средний и медианный рейтинг фильма
mean_ratings = ratings.groupby('movieId').mean().drop(['userId','timestamp'],axis=1)
mean_ratings.rename(columns = {'rating':'mean_ratings'}, inplace = True)

median_ratings = ratings.groupby('movieId').median().drop(['userId','timestamp'],axis=1)
median_ratings.rename(columns = {'rating':'med_ratings'}, inplace = True)

variance_ratings = ratings.groupby('movieId').var().drop(['userId','timestamp'],axis=1)
variance_ratings.rename(columns = {'rating':'var_ratings'}, inplace = True)

# добавим колонку со средним и медианным рейтингом фильма
movies_with_genres_tags_ratings = movies_with_genres_tags.join(mean_ratings, on='movieId')
movies_with_genres_tags_ratings = movies_with_genres_tags_ratings.join(median_ratings, on='movieId')
movies_with_genres_tags_ratings = movies_with_genres_tags_ratings.join(variance_ratings, on='movieId')

In [207]:
movies_with_genres_tags_ratings.head()

Unnamed: 0,movieId,title,genres,tag,mean_ratings,med_ratings,var_ratings
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,,3.92093,4.0,0.69699
1,2,Jumanji (1995),Adventure Children Fantasy,sofiacoppola 1970s suicide,3.431818,3.5,0.777419
2,3,Grumpier Old Men (1995),Comedy Romance,Shakespearesortof,3.259615,3.0,1.112651
3,4,Waiting to Exhale (1995),Comedy Drama Romance,,2.357143,3.0,0.72619
4,5,Father of the Bride Part II (1995),Comedy,hallucinatory mentalillness mindfuck paranoid ...,3.071429,3.0,0.822917


## Построим рекомендацию по жанрам

In [208]:
# получим список жанров
genres_list = movies_with_genres_tags_ratings.genres.to_list()

# обучим модель
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(genres_list)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

neig_reg = KNeighborsRegressor(n_neighbors=7, n_jobs=-1, metric='euclidean')
neig_reg.fit(X_train_counts, X_train_tfidf)

KNeighborsRegressor(metric='euclidean', n_jobs=-1, n_neighbors=7)

In [209]:
# проверим на тесте
test = change_string('Adventure|Comedy|Fantasy|Crime')

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

predicted_movies = neig_reg.kneighbors(X_tfidf2, return_distance=True)
predicted_movies

(array([[0.88442865, 0.88442865, 0.88442865, 0.88442865, 0.88442865,
         0.88442865, 0.88442865]]),
 array([[1598, 8613, 6953, 4076, 7257, 9214, 4152]], dtype=int64))

In [212]:
# выведем список фильмов в порядке убавания рейтинга
movies_with_genres_tags_ratings.iloc[predicted_movies[1][0]].sort_values(by=['mean_ratings'], ascending=False)

Unnamed: 0,movieId,title,genres,tag,mean_ratings,med_ratings,var_ratings
4076,5816,Harry Potter and the Chamber of Secrets (2002),Adventure Fantasy,AdamSandler,3.598039,3.5,0.85663
1598,2140,"Dark Crystal, The (1982)",Adventure Fantasy,,3.544118,3.5,0.672237
6957,65685,Inkheart (2008),Adventure Fantasy,,3.5,3.5,0.25
8617,118696,The Hobbit: The Battle of the Five Armies (2014),Adventure Fantasy,beautifulvisuals Cerebral cinematography goodc...,3.416667,3.25,1.183824
4152,5974,"Thief of Bagdad, The (1940)",Adventure Fantasy,,3.0,3.0,
7261,74530,Percy Jackson & the Olympians: The Lightning T...,Adventure Fantasy,,2.357143,2.5,0.892857
9218,152063,Gods of Egypt (2016),Adventure Fantasy,,0.5,0.5,


## Построим рекомендацию по тегам

In [156]:
# проверим кол-во уникальных тегов
movies_with_genres_tags_ratings.tag.unique().shape

(1037,)

In [216]:
# удалим фильмы с отсутствующими тегами
movies_with_genres_tags_ratings2 = movies_with_genres_tags_ratings.dropna()

In [220]:
# получим список тегов
tag_strings = movies_with_genres_tags_ratings2.tag.to_list()

# обучим модель
count_vect_tags = CountVectorizer()
X_train_counts_tags = count_vect_tags.fit_transform(tag_strings)

tfidf_tags_transformer = TfidfTransformer()
X_train_tags_tfidf = tfidf_tags_transformer.fit_transform(X_train_counts_tags)

neig_reg_tags = KNeighborsRegressor(n_neighbors=10, n_jobs=-1, metric='manhattan')
neig_reg_tags.fit(X_train_counts_tags, X_train_tags_tfidf)

KNeighborsRegressor(metric='manhattan', n_jobs=-1, n_neighbors=10)

In [221]:
# возьмем теги 'scifi timetravel fun'
test_tags = 'scifi timetravel fun'

predict_tags = count_vect_tags.transform([test_tags])
X_tfidf2_tags = tfidf_tags_transformer.transform(predict_tags)

res_tags = neig_reg_tags.kneighbors(X_tfidf2_tags, return_distance=True)
res_tags

(array([[1.44315308, 1.62888293, 1.62888293, 1.62888293, 1.62888293,
         1.62888293, 1.62888293, 1.62888293, 1.72628919, 2.72628919]]),
 array([[1197,  159,  345,  394,   54,   93,   82,  284,   55,  713]],
       dtype=int64))

In [222]:
# выведем список фильмов в порядке убавания рейтинга
movies_with_genres_tags_ratings2.iloc[res_tags[1][0]].sort_values(by=['mean_ratings'], ascending=False)

Unnamed: 0,movieId,title,genres,tag,mean_ratings,med_ratings,var_ratings
966,1267,"Manchurian Candidate, The (1962)",Crime Thriller War,timetravel,4.25,4.0,0.599138
691,909,"Apartment, The (1960)",Comedy Drama Romance,timetravel,3.962963,4.0,0.921652
1438,1961,Rain Man (1988),Drama,timetravel,3.807292,4.0,0.612473
213,249,Immortal Beloved (1994),Drama Romance,L.A.,3.722222,4.0,0.800654
3398,4623,Major League (1989),Comedy,LorettaLynn,3.575,3.75,0.586184
383,440,Dave (1993),Comedy Romance,timetravel,3.546053,4.0,0.447851
315,357,Four Weddings and a Funeral (1994),Comedy Romance,timetravel,3.519417,4.0,1.239815
212,248,Houseguest (1994),Comedy,timetravel,3.176471,3.0,0.998162
1164,1544,"Lost World: Jurassic Park, The (1997)",Action Adventure SciFi Thriller,timetravel,2.753731,3.0,0.987675
8650,120635,Taken 3 (2015),Action Crime Thriller,fun,2.7,2.0,3.7


## Оценить RMSE на тестовой выборке

In [223]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [236]:
# удалим фильмы с отсуствующими рейтингами
print(movies_with_genres_tags_ratings.mean_ratings.isna().any())
movies_with_genres_tags_ratings3 = movies_with_genres_tags_ratings.dropna()

True


In [237]:
# разделим на train, test
X_train, X_test, y_train, y_test = train_test_split(movies_with_genres_tags_ratings3.drop(['mean_ratings'], axis=1), 
                                                    movies_with_genres_tags_ratings3.mean_ratings, test_size=0.33, 
                                                    random_state=42)
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

In [238]:
# обучим train

train_genres_strings = train_df.genres.to_list()
count_vect_tags = CountVectorizer()
X_train_count4 = count_vect_tags.fit_transform(train_genres_strings)

tfidf_transformer = TfidfTransformer()
X_train_tfidf4 = tfidf_transformer.fit_transform(X_train_count4)

neig4 = KNeighborsRegressor(n_neighbors=10, n_jobs=-1, metric='manhattan')
neig4.fit(X_train_count4, train_df.mean_ratings)

KNeighborsRegressor(metric='manhattan', n_jobs=-1, n_neighbors=10)

In [239]:
# предскажем test
test_genres_strings = test_df.genres.to_list()

X_test_count4 = count_vect_tags.transform(test_genres_strings)
X_test_tfidf4 = tfidf_transformer.transform(X_test_count4)

predicted = neig4.predict(X_test_count4)
#neig4.score(pr, X_test_tfidf4)


In [242]:
# получим RMSE на тестовой выборке
mean_squared_error(test_df.mean_ratings, predicted)

0.21219183365297417