**Домашнее задание по теме «Рекомендации на основе содержания»**

1. Использовать датасет MovieLens.

In [64]:
import os

import pandas as pd
import numpy as np
from math import sqrt

from tqdm import tqdm_notebook

from collections import Counter
from datetime import datetime

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.neighbors import KNeighborsRegressor, NearestNeighbors
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv( 'ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [7]:
#объединяем таблицы в одну

In [8]:
df_all = ratings.merge(movies, how='left', on='movieId').merge(tags, how='left', on=['userId', 'movieId'])
df_all.head()

Unnamed: 0,userId,movieId,rating,timestamp_x,title,genres,tag,timestamp_y
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,,
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,,
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,,
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,,


In [9]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102677 entries, 0 to 102676
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   userId       102677 non-null  int64  
 1   movieId      102677 non-null  int64  
 2   rating       102677 non-null  float64
 3   timestamp_x  102677 non-null  int64  
 4   title        102677 non-null  object 
 5   genres       102677 non-null  object 
 6   tag          3476 non-null    object 
 7   timestamp_y  3476 non-null    float64
dtypes: float64(2), int64(3), object(3)
memory usage: 7.1+ MB


In [10]:
# удаляем пока что ненужные столбцы

In [11]:
df_all.drop(['timestamp_y', 'timestamp_x'], axis=1, inplace = True)
df_all.head()

Unnamed: 0,userId,movieId,rating,title,genres,tag
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,
1,1,3,4.0,Grumpier Old Men (1995),Comedy|Romance,
2,1,6,4.0,Heat (1995),Action|Crime|Thriller,
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,
4,1,50,5.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,


In [12]:
# заменяем пустые значения в столбце tag на none

In [13]:
df_all['tag'] = df_all['tag'].fillna('none')
df_all.head()

Unnamed: 0,userId,movieId,rating,title,genres,tag
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,none
1,1,3,4.0,Grumpier Old Men (1995),Comedy|Romance,none
2,1,6,4.0,Heat (1995),Action|Crime|Thriller,none
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,none
4,1,50,5.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,none


In [14]:
# проверяем, есть ли пустые значения в столбце с жанрами

In [15]:
df_all['genres'].isnull().sum()

0

2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
*   TF-IDF на тегах и жанрах;
*   средние оценки (+ median, variance и т. д.) пользователя и фильма.

In [16]:
# создадим словарь для подсчёта мер среднего

In [17]:
agg_func_rating = {
    'rating': ['count','mean', 'median', 'var']
}

In [18]:
# посчитаем меры среднего для фильмов

In [19]:
movies_rating = ratings.groupby('movieId').agg(agg_func_rating).reset_index()
movies_rating

Unnamed: 0_level_0,movieId,rating,rating,rating,rating
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,median,var
0,1,215,3.920930,4.0,0.696990
1,2,110,3.431818,3.5,0.777419
2,3,52,3.259615,3.0,1.112651
3,4,7,2.357143,3.0,0.726190
4,5,49,3.071429,3.0,0.822917
...,...,...,...,...,...
9719,193581,1,4.000000,4.0,
9720,193583,1,3.500000,3.5,
9721,193585,1,3.500000,3.5,
9722,193587,1,3.500000,3.5,


In [20]:
# переименовываем столбцы, чтобы не запутаться при объединении таблиц

In [21]:
movies_rating.rename(columns = {'count' : 'count_movie_rating', 'mean' : 'mean_movie_rating',
                     'median' : 'median_movie_rating', 'var' : 'var_movie_rating'}, inplace = True)
movies_rating

Unnamed: 0_level_0,movieId,rating,rating,rating,rating
Unnamed: 0_level_1,Unnamed: 1_level_1,count_movie_rating,mean_movie_rating,median_movie_rating,var_movie_rating
0,1,215,3.920930,4.0,0.696990
1,2,110,3.431818,3.5,0.777419
2,3,52,3.259615,3.0,1.112651
3,4,7,2.357143,3.0,0.726190
4,5,49,3.071429,3.0,0.822917
...,...,...,...,...,...
9719,193581,1,4.000000,4.0,
9720,193583,1,3.500000,3.5,
9721,193585,1,3.500000,3.5,
9722,193587,1,3.500000,3.5,


In [22]:
# удаляем верхний уровень в названии столбцов

In [23]:
movies_rating.columns = [tup[1] if tup[1] else tup[0] for tup in movies_rating.columns]
movies_rating

Unnamed: 0,movieId,count_movie_rating,mean_movie_rating,median_movie_rating,var_movie_rating
0,1,215,3.920930,4.0,0.696990
1,2,110,3.431818,3.5,0.777419
2,3,52,3.259615,3.0,1.112651
3,4,7,2.357143,3.0,0.726190
4,5,49,3.071429,3.0,0.822917
...,...,...,...,...,...
9719,193581,1,4.000000,4.0,
9720,193583,1,3.500000,3.5,
9721,193585,1,3.500000,3.5,
9722,193587,1,3.500000,3.5,


In [24]:
# посчитаем меры среднего для пользователей

In [25]:
users_rating = ratings.groupby('userId').agg(agg_func_rating).reset_index()
users_rating

Unnamed: 0_level_0,userId,rating,rating,rating,rating
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,median,var
0,1,232,4.366379,5.0,0.640077
1,2,29,3.948276,4.0,0.649015
2,3,39,2.435897,0.5,4.370783
3,4,216,3.555556,4.0,1.727132
4,5,44,3.636364,4.0,0.980973
...,...,...,...,...,...
605,606,1115,3.657399,4.0,0.524351
606,607,187,3.786096,4.0,0.932494
607,608,831,3.134176,3.0,1.164807
608,609,37,3.270270,3.0,0.202703


In [26]:
users_rating.rename(columns = {'count' : 'count_user_rating', 'mean' : 'mean_user_rating',
                     'median' : 'median_user_rating', 'var' : 'var_user_rating'}, inplace = True)
users_rating

Unnamed: 0_level_0,userId,rating,rating,rating,rating
Unnamed: 0_level_1,Unnamed: 1_level_1,count_user_rating,mean_user_rating,median_user_rating,var_user_rating
0,1,232,4.366379,5.0,0.640077
1,2,29,3.948276,4.0,0.649015
2,3,39,2.435897,0.5,4.370783
3,4,216,3.555556,4.0,1.727132
4,5,44,3.636364,4.0,0.980973
...,...,...,...,...,...
605,606,1115,3.657399,4.0,0.524351
606,607,187,3.786096,4.0,0.932494
607,608,831,3.134176,3.0,1.164807
608,609,37,3.270270,3.0,0.202703


In [27]:
users_rating.columns = [tup[1] if tup[1] else tup[0] for tup in users_rating.columns]
users_rating

Unnamed: 0,userId,count_user_rating,mean_user_rating,median_user_rating,var_user_rating
0,1,232,4.366379,5.0,0.640077
1,2,29,3.948276,4.0,0.649015
2,3,39,2.435897,0.5,4.370783
3,4,216,3.555556,4.0,1.727132
4,5,44,3.636364,4.0,0.980973
...,...,...,...,...,...
605,606,1115,3.657399,4.0,0.524351
606,607,187,3.786096,4.0,0.932494
607,608,831,3.134176,3.0,1.164807
608,609,37,3.270270,3.0,0.202703


In [28]:
# объединяем всё в одну таблицу

In [29]:
df_all = df_all.merge(users_rating, how='left', on=['userId']).merge(movies_rating, how='left', on=['movieId'])
df_all

Unnamed: 0,userId,movieId,rating,title,genres,tag,count_user_rating,mean_user_rating,median_user_rating,var_user_rating,count_movie_rating,mean_movie_rating,median_movie_rating,var_movie_rating
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,none,232,4.366379,5.0,0.640077,215,3.920930,4.0,0.696990
1,1,3,4.0,Grumpier Old Men (1995),Comedy|Romance,none,232,4.366379,5.0,0.640077,52,3.259615,3.0,1.112651
2,1,6,4.0,Heat (1995),Action|Crime|Thriller,none,232,4.366379,5.0,0.640077,102,3.946078,4.0,0.667856
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,none,232,4.366379,5.0,0.640077,203,3.975369,4.0,0.850875
4,1,50,5.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,none,232,4.366379,5.0,0.640077,204,4.237745,4.5,0.641475
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102672,610,166534,4.0,Split (2017),Drama|Horror|Thriller,none,1302,3.688556,3.5,0.735173,6,3.333333,4.0,2.466667
102673,610,168248,5.0,John Wick: Chapter Two (2017),Action|Crime|Thriller,Heroic Bloodshed,1302,3.688556,3.5,0.735173,7,4.142857,4.0,0.559524
102674,610,168250,5.0,Get Out (2017),Horror,none,1302,3.688556,3.5,0.735173,15,3.633333,4.0,0.945238
102675,610,168252,5.0,Logan (2017),Action|Sci-Fi,none,1302,3.688556,3.5,0.735173,25,4.280000,4.5,0.418333


In [30]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102677 entries, 0 to 102676
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   userId               102677 non-null  int64  
 1   movieId              102677 non-null  int64  
 2   rating               102677 non-null  float64
 3   title                102677 non-null  object 
 4   genres               102677 non-null  object 
 5   tag                  102677 non-null  object 
 6   count_user_rating    102677 non-null  int64  
 7   mean_user_rating     102677 non-null  float64
 8   median_user_rating   102677 non-null  float64
 9   var_user_rating      102677 non-null  float64
 10  count_movie_rating   102677 non-null  int64  
 11  mean_movie_rating    102677 non-null  float64
 12  median_movie_rating  102677 non-null  float64
 13  var_movie_rating     99172 non-null   float64
dtypes: float64(7), int64(4), object(3)
memory usage: 11.8+ MB


In [31]:
# заполняем пустые значения дисперсии

In [32]:
df_all['var_movie_rating'].fillna(0, inplace = True)

In [33]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102677 entries, 0 to 102676
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   userId               102677 non-null  int64  
 1   movieId              102677 non-null  int64  
 2   rating               102677 non-null  float64
 3   title                102677 non-null  object 
 4   genres               102677 non-null  object 
 5   tag                  102677 non-null  object 
 6   count_user_rating    102677 non-null  int64  
 7   mean_user_rating     102677 non-null  float64
 8   median_user_rating   102677 non-null  float64
 9   var_user_rating      102677 non-null  float64
 10  count_movie_rating   102677 non-null  int64  
 11  mean_movie_rating    102677 non-null  float64
 12  median_movie_rating  102677 non-null  float64
 13  var_movie_rating     102677 non-null  float64
dtypes: float64(7), int64(4), object(3)
memory usage: 11.8+ MB


In [34]:
# объединим для каждого фильма все теги

In [35]:
tags['tag'] = tags['tag'] + ' '
tag_full = tags.groupby(['userId', 'movieId']).apply(lambda x: x.sum())
tag_full = tag_full['tag']
tag_full.head(3)

userId  movieId
2       60756           funny Highly quotable will ferrell 
        89774                   Boxing story MMA Tom Hardy 
        106782     drugs Leonardo DiCaprio Martin Scorsese 
Name: tag, dtype: object

In [36]:
tag_full.shape

(1775,)

In [37]:
df_all = df_all.merge(tag_full, how='left', left_on=['userId', 'movieId'], right_index=True)
df_all.rename(columns = {'tag_x' : 'tag', 'tag_y' : 'tag_full'}, inplace = True)
df_all['tag_full'].fillna('none', inplace = True)
df_all

Unnamed: 0,userId,movieId,rating,title,genres,tag,count_user_rating,mean_user_rating,median_user_rating,var_user_rating,count_movie_rating,mean_movie_rating,median_movie_rating,var_movie_rating,tag_full
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,none,232,4.366379,5.0,0.640077,215,3.920930,4.0,0.696990,none
1,1,3,4.0,Grumpier Old Men (1995),Comedy|Romance,none,232,4.366379,5.0,0.640077,52,3.259615,3.0,1.112651,none
2,1,6,4.0,Heat (1995),Action|Crime|Thriller,none,232,4.366379,5.0,0.640077,102,3.946078,4.0,0.667856,none
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,none,232,4.366379,5.0,0.640077,203,3.975369,4.0,0.850875,none
4,1,50,5.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,none,232,4.366379,5.0,0.640077,204,4.237745,4.5,0.641475,none
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102672,610,166534,4.0,Split (2017),Drama|Horror|Thriller,none,1302,3.688556,3.5,0.735173,6,3.333333,4.0,2.466667,none
102673,610,168248,5.0,John Wick: Chapter Two (2017),Action|Crime|Thriller,Heroic Bloodshed,1302,3.688556,3.5,0.735173,7,4.142857,4.0,0.559524,Heroic Bloodshed
102674,610,168250,5.0,Get Out (2017),Horror,none,1302,3.688556,3.5,0.735173,15,3.633333,4.0,0.945238,none
102675,610,168252,5.0,Logan (2017),Action|Sci-Fi,none,1302,3.688556,3.5,0.735173,25,4.280000,4.5,0.418333,none


In [38]:
df_all['genres'] = df_all['genres'].apply(lambda x: x.replace('|', ' '))
df_all.head(3)

Unnamed: 0,userId,movieId,rating,title,genres,tag,count_user_rating,mean_user_rating,median_user_rating,var_user_rating,count_movie_rating,mean_movie_rating,median_movie_rating,var_movie_rating,tag_full
0,1,1,4.0,Toy Story (1995),Adventure Animation Children Comedy Fantasy,none,232,4.366379,5.0,0.640077,215,3.92093,4.0,0.69699,none
1,1,3,4.0,Grumpier Old Men (1995),Comedy Romance,none,232,4.366379,5.0,0.640077,52,3.259615,3.0,1.112651,none
2,1,6,4.0,Heat (1995),Action Crime Thriller,none,232,4.366379,5.0,0.640077,102,3.946078,4.0,0.667856,none


In [39]:
X = df_all.iloc[:, 4:]
X

Unnamed: 0,genres,tag,count_user_rating,mean_user_rating,median_user_rating,var_user_rating,count_movie_rating,mean_movie_rating,median_movie_rating,var_movie_rating,tag_full
0,Adventure Animation Children Comedy Fantasy,none,232,4.366379,5.0,0.640077,215,3.920930,4.0,0.696990,none
1,Comedy Romance,none,232,4.366379,5.0,0.640077,52,3.259615,3.0,1.112651,none
2,Action Crime Thriller,none,232,4.366379,5.0,0.640077,102,3.946078,4.0,0.667856,none
3,Mystery Thriller,none,232,4.366379,5.0,0.640077,203,3.975369,4.0,0.850875,none
4,Crime Mystery Thriller,none,232,4.366379,5.0,0.640077,204,4.237745,4.5,0.641475,none
...,...,...,...,...,...,...,...,...,...,...,...
102672,Drama Horror Thriller,none,1302,3.688556,3.5,0.735173,6,3.333333,4.0,2.466667,none
102673,Action Crime Thriller,Heroic Bloodshed,1302,3.688556,3.5,0.735173,7,4.142857,4.0,0.559524,Heroic Bloodshed
102674,Horror,none,1302,3.688556,3.5,0.735173,15,3.633333,4.0,0.945238,none
102675,Action Sci-Fi,none,1302,3.688556,3.5,0.735173,25,4.280000,4.5,0.418333,none


In [40]:
#  добавим к признакам матрицы tfidf по тегам и жанрам

In [41]:
tfidf = TfidfVectorizer()

In [42]:
tags_tfidf = pd.DataFrame(tfidf.fit_transform(X['tag_full']).toarray(),
                          columns=tfidf.get_feature_names_out()).reset_index(drop=True)
tags_tfidf

Unnamed: 0,06,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001,250,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102672,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102673,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102675,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
genres_tfidf = pd.DataFrame(tfidf.fit_transform(X['genres']).toarray(),
                            columns=tfidf.get_feature_names_out()).reset_index(drop=True)
genres_tfidf

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,fi,...,listed,musical,mystery,no,noir,romance,sci,thriller,war,western
0,0.000000,0.363997,0.548865,0.509129,0.292063,0.000000,0.0,0.000000,0.470894,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
1,0.000000,0.000000,0.000000,0.000000,0.582835,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.812591,0.000000,0.000000,0.0,0.0
2,0.516958,0.000000,0.000000,0.000000,0.000000,0.656650,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.549150,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.837071,0.0,0.0,0.000000,0.000000,0.547094,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.547452,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.700492,0.0,0.0,0.000000,0.000000,0.457828,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102672,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.397828,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.495581,0.0,0.0
102673,0.516958,0.000000,0.000000,0.000000,0.000000,0.656650,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.549150,0.0,0.0
102674,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
102675,0.489982,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.616408,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.616408,0.000000,0.0,0.0


In [44]:
X = pd.concat((tags_tfidf, genres_tfidf, X.reset_index(drop=True)), axis = 1)

print(X.shape)

(102677, 1744)


In [45]:
# удалим лишние столбцы

In [46]:
X.drop(columns = ['genres', 'tag', 'tag_full'], inplace = True)

In [47]:
y = df_all['rating']
y.value_counts()

4.0    27208
3.0    20189
5.0    13921
3.5    13424
4.5     8811
2.0     7563
2.5     5568
1.0     2819
1.5     1801
0.5     1373
Name: rating, dtype: int64

In [48]:
# делим выборку на тренировочную и тестовую

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True,
                                                    test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)

(82141, 1739)
(20536, 1739)


In [50]:
# создадим модель

In [51]:
model = LinearRegression()

model.fit(X_train, y_train)

In [52]:
predictions = model.predict(X_test)
predictions

array([4.        , 4.14327855, 3.43004756, ..., 4.55638893, 3.36742941,
       3.66320474])

In [53]:
df_compare = pd.DataFrame ([y_test.values, predictions]).T
df_compare.columns = ['y_test', 'predictions']
df_compare

Unnamed: 0,y_test,predictions
0,4.0,4.000000
1,3.5,4.143279
2,3.5,3.430048
3,3.0,3.614849
4,4.0,3.628731
...,...,...
20531,4.5,3.859782
20532,4.0,3.863917
20533,5.0,4.556389
20534,2.5,3.367429


3. Оценить RMSE на тестовой выборке.

In [54]:
RMSE_LR = mean_squared_error(y_test.values, predictions, squared=False)
RMSE_LR

0.7959009459972604

In [55]:
# попробуем построить модель на основе метода k-ближайших соседей

In [72]:
kNN = make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=7))
kNN.fit(X_train, y_train)

In [73]:
pred = kNN.predict(X_test)

In [74]:
df_compare = pd.DataFrame ([y_test.values, pred]).T
df_compare.columns = ['y_test', 'predictions']
df_compare

Unnamed: 0,y_test,predictions
0,4.0,3.357143
1,3.5,4.285714
2,3.5,3.571429
3,3.0,3.500000
4,4.0,3.928571
...,...,...
20531,4.5,3.714286
20532,4.0,3.714286
20533,5.0,4.285714
20534,2.5,2.928571


In [75]:
RMSE_kNN = mean_squared_error(y_test.values, pred, squared=False)
RMSE_kNN

0.8560665441720582

In [60]:
# попробуем построить рекомендации по жанрам

In [61]:
def recommend_10_movies(genres, data):
  '''
  Функция предобрабатывает список жанров и
  рекомендует 10 фильмов похожих по жанрам,
  на основе метода к-ближайших соседей
  genres - список жанров
  data - таблица с исходными данными
  '''
  data['genres'] = data['genres'].apply(lambda x: x.replace('|', ' '))

  model = make_pipeline(TfidfVectorizer(),
                        NearestNeighbors(n_neighbors=10,
                                         n_jobs=-1,
                                         metric='euclidean'))
  model.fit(data['genres'])

  X_tfidf2 = model[0].transform(genres)
  res = model[1].kneighbors(X_tfidf2, return_distance=True)

  return data.iloc[res[1][0]]

In [62]:
recommend_10_movies(['Comedy Fantasy Crime'], movies)

Unnamed: 0,movieId,title,genres
9096,143559,L.A. Slasher (2015),Comedy Crime Fantasy
2206,2931,Time of the Gypsies (Dom za vesanje) (1989),Comedy Crime Drama Fantasy
4513,6686,"Medallion, The (2003)",Action Comedy Crime Fantasy
325,367,"Mask, The (1994)",Action Comedy Crime Fantasy
7816,92637,Pitfall (Otoshiana) (1962),Crime Drama Fantasy
5634,27328,Monday (2000),Action Comedy Crime Fantasy Thriller
5270,8666,Catwoman (2004),Action Crime Fantasy
5618,27134,Dark Portals: The Chronicles of Vidocq (Vidoc...,Action Crime Fantasy
9673,182823,Bright (2017),Action Crime Fantasy
1674,2253,Toys (1992),Comedy Fantasy


In [63]:
recommend_10_movies(['Drama'], movies)

Unnamed: 0,movieId,title,genres
50,55,Georgia (1995),Drama
30,31,Dangerous Minds (1995),Drama
25,26,Othello (1995),Drama
39,43,Restoration (1995),Drama
105,121,"Boys of St. Vincent, The (1992)",Drama
36,40,"Cry, the Beloved Country (1995)",Drama
55,62,Mr. Holland's Opus (1995),Drama
120,147,"Basketball Diaries, The (1995)",Drama
51,57,Home for the Holidays (1995),Drama
13,14,Nixon (1995),Drama
