In [437]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

1. Использовать dataset MovieLens
1. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
- TF-IDF на тегах и жанрах
- Средние оценки (+ median, variance, etc.) пользователя и фильма
3. Оценить RMSE на тестовой выборке

In [438]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [439]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [440]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [441]:
movies['genres'] = [change_string(g) for g in movies.genres.values]

In [442]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [443]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [444]:
movies_df = movies.merge(tags, on = 'movieId')
movies_df.drop(['userId', 'timestamp'], axis = 1, inplace=True)
#movies_df = movies_df.merge(ratings)
movies_df

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,fun
3,2,Jumanji (1995),Adventure Children Fantasy,fantasy
4,2,Jumanji (1995),Adventure Children Fantasy,magic board game
...,...,...,...,...
3678,187595,Solo: A Star Wars Story (2018),Action Adventure Children SciFi,star wars
3679,193565,Gintama: The Movie (2010),Action Animation Comedy SciFi,anime
3680,193565,Gintama: The Movie (2010),Action Animation Comedy SciFi,comedy
3681,193565,Gintama: The Movie (2010),Action Animation Comedy SciFi,gintama


In [445]:
movies_tags = {}

for movie, group in tqdm(movies_df.groupby('title')):
    tags = set([str(s).replace(' ', '').replace('-', '') for s in group.tag.values])
    movies_tags[movie] = ' '.join([tag for tag in tags])

  0%|          | 0/1572 [00:00<?, ?it/s]

In [446]:
movies_tags_df = pd.DataFrame(movies_tags.items(), columns=['title', 'tags'])

In [447]:
movies_df = movies_df.merge(movies_tags_df).drop(['tag'], axis = 1).drop_duplicates()

In [448]:
movies_df

Unnamed: 0,movieId,title,genres,tags
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar fun
3,2,Jumanji (1995),Adventure Children Fantasy,game fantasy RobinWilliams magicboardgame
7,3,Grumpier Old Men (1995),Comedy Romance,moldy old
9,5,Father of the Bride Part II (1995),Comedy,pregnancy remake
11,7,Sabrina (1995),Comedy Romance,remake
...,...,...,...,...
3668,183611,Game Night (2018),Action Comedy Crime Horror,Comedy RachelMcAdams funny
3671,184471,Tomb Raider (2018),Action Adventure Fantasy,videogameadaptation AliciaVikander adventure
3674,187593,Deadpool 2 (2018),Action Comedy SciFi,RyanReynolds JoshBrolin sarcasm
3677,187595,Solo: A Star Wars Story (2018),Action Adventure Children SciFi,EmiliaClarke starwars


In [449]:
tfidf_vect_genres = TfidfVectorizer(use_idf=True)
genres_tfidf = tfidf_vect_genres.fit_transform(movies_df['genres'])

In [450]:
X_genres = pd.DataFrame(genres_tfidf.toarray(), columns=tfidf_vect_genres.get_feature_names(), index=movies_df.index)

In [451]:
tfidf_vect_tags = TfidfVectorizer(use_idf=True)
tags_tfidf = tfidf_vect_tags.fit_transform(movies_df['tags'])

In [452]:
X_tags = pd.DataFrame(tags_tfidf.toarray(), columns=tfidf_vect_tags.get_feature_names(), index=movies_df.index)

In [453]:
X_tags_genres = X_genres.merge(X_tags, left_index=True, right_index=True)

In [454]:
# отбрасываем столбцы, незначительно влияющие на датасет
X_tags_genres = X_tags_genres.loc[:, X_tags_genres.sum()> 10]

In [455]:
rate_stats = ratings[ratings.movieId.isin(movies_df.index)].groupby('movieId').agg({'rating': ['mean', 'var', 'median']})['rating']

In [456]:
X_tags_genres_stats = X_tags_genres.merge(rate_stats, left_index=True, right_index=True).fillna(0)

In [457]:
X_tags_genres_stats.head()

Unnamed: 0,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,drama_x,fantasy_x,filmnoir_x,...,innetflixqueue,journalism,politics,religion,shakespeare,stephenking,superhero,mean,var,median
3,0.0,0.495081,0.0,0.635009,0.0,0.0,0.0,0.0,0.593008,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.259615,1.112651,3.0
7,0.0,0.0,0.0,0.0,0.643145,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.185185,0.955625,3.0
9,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.125,0.95,3.0
11,0.0,0.0,0.0,0.0,0.643145,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.671429,0.810766,4.0
12,0.0,0.0,0.0,0.0,0.579624,0.0,0.0,0.433334,0.0,0.0,...,0.0,0.0,0.632559,0.0,0.0,0.0,0.0,2.421053,1.562865,3.0


In [458]:
tags_genres_stats_rate_df = X_tags_genres_stats.merge(ratings, how = 'inner', left_on=X_tags_genres_stats.index, right_on='movieId').drop(['timestamp'], axis = 1)

In [459]:
user_id = 100
for_user_df = tags_genres_stats_rate_df[tags_genres_stats_rate_df['userId'] == 100]

In [460]:
y = for_user_df['rating']
X = for_user_df.drop(['userId', 'movieId', 'rating'], axis=1)

In [461]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1)

In [462]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [463]:
lr.fit(X_train, y_train)

LinearRegression()

In [464]:
lr.predict(X_test)

array([3.65648292, 4.30197135, 4.16512151, 5.22602091, 3.32533671,
       3.10033092, 4.94876289, 5.26543408, 4.45610269, 3.98895337,
       2.89967431, 4.63289141, 3.279501  ])

In [466]:
y_test

17338    3.5
18640    4.0
15205    3.5
238      4.5
15808    5.0
17670    4.5
7552     4.0
8534     4.0
13928    4.0
13136    4.0
12294    2.0
12695    4.5
22950    4.5
Name: rating, dtype: float64

In [467]:
lr.score(X_test, y_test)

-0.673085410382239