In [1]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

1. Использовать dataset MovieLens
2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
   - TF-IDF на тегах и жанрах
   - Средние оценки (+ median, variance, etc.) пользователя и фильма
3. Оценить RMSE на тестовой выборке

In [2]:
# Read the four MovieLens tables from CSV files in the working directory.
links, movies, ratings, tags = (
    pd.read_csv(csv_path)
    for csv_path in ('links.csv', 'movies.csv', 'ratings.csv', 'tags.csv')
)

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
def change_string(s):
    """Convert a '|'-separated label string into space-separated tokens.

    All spaces and hyphens are removed first, so every label collapses
    into a single token (e.g. 'Sci-Fi' becomes 'SciFi'); the labels are
    then joined with single spaces.
    """
    compact = s.replace(' ', '').replace('-', '')
    labels = compact.split('|')
    return ' '.join(labels)

In [5]:
# Rewrite every genres string from 'A|B|C' form into space-separated tokens.
movies['genres'] = movies['genres'].map(change_string)

In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [8]:
# One row per (movie, tag) pair; merge's default inner join keeps only movies
# that have at least one tag. The tagging user and timestamp are discarded.
movies_df = (
    movies
    .merge(tags, on='movieId')
    .drop(columns=['userId', 'timestamp'])
)
movies_df

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,fun
3,2,Jumanji (1995),Adventure Children Fantasy,fantasy
4,2,Jumanji (1995),Adventure Children Fantasy,magic board game
...,...,...,...,...
3678,187595,Solo: A Star Wars Story (2018),Action Adventure Children SciFi,star wars
3679,193565,Gintama: The Movie (2010),Action Animation Comedy SciFi,anime
3680,193565,Gintama: The Movie (2010),Action Animation Comedy SciFi,comedy
3681,193565,Gintama: The Movie (2010),Action Animation Comedy SciFi,gintama


In [26]:
# Map every movie title to the set of its distinct tags. Spaces and hyphens
# are stripped so that a multi-word tag becomes a single token.
movies_tags = {}

for movie, group in tqdm(movies_df.groupby('title')):
    # Use a dedicated local name: the original loop variable was called `tags`
    # and silently replaced the `tags` DataFrame loaded from tags.csv.
    # str() keeps non-string values (e.g. NaN) from breaking .replace().
    movie_tag_set = {
        str(tag).replace(' ', '').replace('-', '') for tag in group.tag.values
    }
    # The original line was a bare `movies_tags[movie]` lookup, which raises
    # KeyError on the first iteration and never stores anything.
    movies_tags[movie] = movie_tag_set

  0%|          | 0/1554 [00:00<?, ?it/s]

In [27]:
movies_tags

{'(500) Days of Summer (2009)': {'Funny',
  'ZooeyDeschanel',
  'artistic',
  'humorous',
  'inspiring',
  'intelligent',
  'quirky',
  'romance'},
 '...And Justice for All (1979)': {'lawyers'},
 '10 Cloverfield Lane (2016)': {'creepy', 'suspense'},
 '10 Things I Hate About You (1999)': {'Shakespearesortof'},
 '101 Dalmatians (1996)': {'dogs', 'remake'},
 '101 Dalmatians (One Hundred and One Dalmatians) (1961)': {'Disney'},
 '11\'09"01 - September 11 (2002)': {'terrorism'},
 '12 Angry Men (1957)': {'Motivational',
  'claustrophobic',
  'confrontational',
  'court',
  'earnest',
  'gooddialogue',
  'greatscreenplay',
  'gritty',
  'thoughtprovoking'},
 '127 Hours (2010)': {'stranded'},
 '13 Going on 30 (2004)': {'MarkRuffalo'},
 '2001: A Space Odyssey (1968)': {'ArthurC.Clarke',
  'Dull',
  'Hal',
  'Oscar(BestEffectsVisualEffects)',
  'StanleyKubrick',
  'aliens',
  'apes',
  'artificialintelligence',
  'atmospheric',
  'cinematography',
  'classic',
  'computer',
  'confusingending',


In [10]:
# Pair every (movie, tag) row with each rating given to that movie, then
# discard the rater id and the rating timestamp.
movies_df = movies_df.merge(ratings, on='movieId').drop(
    columns=['userId', 'timestamp']
)
movies_df

Unnamed: 0,movieId,title,genres,tag,rating
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar,4.0
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar,4.0
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar,4.5
3,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar,2.5
4,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar,4.5
...,...,...,...,...,...
233208,187595,Solo: A Star Wars Story (2018),Action Adventure Children SciFi,star wars,5.0
233209,193565,Gintama: The Movie (2010),Action Animation Comedy SciFi,anime,3.5
233210,193565,Gintama: The Movie (2010),Action Animation Comedy SciFi,comedy,3.5
233211,193565,Gintama: The Movie (2010),Action Animation Comedy SciFi,gintama,3.5


In [50]:
movies.shape

(9742, 3)

In [51]:
movies_df.shape

(233213, 5)

In [8]:
# NOTE: this re-defines change_string, which is already defined earlier in the notebook.
def change_string(s):
    """Strip spaces and hyphens from `s`, then turn each '|' into a single space."""
    without_spaces = s.replace(' ', '')
    without_hyphens = without_spaces.replace('-', '')
    return without_hyphens.replace('|', ' ')

In [10]:
# Normalise genres only while they are still in the raw 'A|B|C' form.
# The same normalisation already runs earlier in the notebook; applying
# change_string to its own output would delete the separating spaces and fuse
# all genres of a movie into one token, which would break the vectorizer input.
if movies['genres'].str.contains('|', regex=False).any():
    movies['genres'] = [change_string(g) for g in movies.genres.values]

In [16]:
# Bag-of-words term counts over each movie's space-separated genre tokens.
genre_docs = movies['genres']
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(genre_docs)

In [19]:
# Re-weight the raw genre counts with TF-IDF.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit(X_train_counts).transform(X_train_counts)