### 1.Использовать dataset MovieLens
### 2.Построить рекомендации (регрессия, предсказываем оценку) на фичах:
***1)TF-IDF на тегах и жанрах***
***2)Средние оценки (+ median, variance, etc.) пользователя и фильма***
### 3.Оценить RMSE на тестовой выборке

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
def change_string(s):
    return ' '.join(s.replace(' ','').split('|'))

In [7]:
movies['genres']=movies['genres'].apply(change_string)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
movies['genres'].unique()[:10]

array(['Adventure Animation Children Comedy Fantasy',
       'Adventure Children Fantasy', 'Comedy Romance',
       'Comedy Drama Romance', 'Comedy', 'Action Crime Thriller',
       'Adventure Children', 'Action', 'Action Adventure Thriller',
       'Comedy Horror'], dtype=object)

### Tfidf для жанров

In [9]:
# Трансформируем в вектор столбец жанры
count_vect=CountVectorizer()
X_train_count=count_vect.fit_transform(movies['genres'])
X_train_count.toarray()

array([[0, 1, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [10]:
count_vect.vocabulary_

{'adventure': 1,
 'animation': 2,
 'children': 3,
 'comedy': 4,
 'fantasy': 8,
 'romance': 17,
 'drama': 7,
 'action': 0,
 'crime': 5,
 'thriller': 19,
 'horror': 11,
 'mystery': 14,
 'sci': 18,
 'fi': 9,
 'war': 20,
 'musical': 13,
 'documentary': 6,
 'imax': 12,
 'western': 21,
 'film': 10,
 'noir': 16,
 'nogenreslisted': 15}

In [11]:
tfidf=TfidfTransformer()
X_train_tfidf=tfidf.fit_transform(X_train_count)
X_train_tfidf.toarray().shape

(9742, 22)

In [12]:
# воспользуемся методом ближайших соседей для определения ближайшего похожего фильма.
NN = NearestNeighbors(n_neighbors=7, n_jobs=-1, metric='euclidean') 
NN.fit(X_train_tfidf)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=-1, n_neighbors=7, p=2, radius=1.0)

In [13]:
test = change_string("Adventure|Comedy|Fantasy|Crime")

predict = count_vect.transform([test])
X_tfidf2 = tfidf.transform(predict)

result = NN.kneighbors(X_tfidf2)
result

(array([[0.42079615, 0.53300564, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608]]),
 array([[6774, 9096, 3302, 5737, 8361, 3576, 3582]], dtype=int64))

In [14]:
movies['title'][result[1][0]]

6774                                Hancock (2008)
9096                           L.A. Slasher (2015)
3302    Adventures of Baron Munchausen, The (1988)
5737    Life Aquatic with Steve Zissou, The (2004)
8361                   Knights of Badassdom (2013)
3576                           Black Knight (2001)
3582                            Jabberwocky (1977)
Name: title, dtype: object

### Tfidf для тэгов

In [15]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [16]:
def change_string_to_lower_register(x):
    return x.lower()

In [17]:
tags['tag']=tags['tag'].apply(change_string_to_lower_register)
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,boxing story,1445715207
4,2,89774,mma,1445715200


In [18]:
# Было 1579 без перевода в нижний регистр
tags['tag'].unique().shape

(1475,)

In [19]:
count_vect2=CountVectorizer()
X_train_count_tag=count_vect2.fit_transform(tags['tag'])

In [20]:
tfidf2=TfidfTransformer()
X_train_tfidf_tag=tfidf2.fit_transform(X_train_count_tag)
X_train_tfidf_tag.toarray().shape

(3683, 1744)

In [21]:
NN_tag = NearestNeighbors(n_neighbors=7, n_jobs=-1, metric='euclidean') 
NN_tag.fit(X_train_tfidf_tag)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=-1, n_neighbors=7, p=2, radius=1.0)

In [22]:
test = change_string("funny mma pixar")

predict = count_vect2.transform([test])
X_tfidf3 = tfidf2.transform(predict)

result_tag = NN_tag.kneighbors(X_tfidf3)
result_tag

(array([[0., 0., 1., 1., 1., 1., 1.]]),
 array([[1750, 2102,  187, 1966,  780,  696,  453]], dtype=int64))

In [23]:
tags['movieId'][result_tag[1][0]]

1750     3160
2102     6273
187     53464
1966     4980
780      1240
696        32
453      7254
Name: movieId, dtype: int64

In [24]:
movies['title'][tags['movieId'][result_tag[1][0]]]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


3160                          Chopper (2000)
6273                         Accepted (2006)
53464                                    NaN
4980                     Raising Cain (1992)
1240     Fast, Cheap & Out of Control (1997)
32                               Babe (1995)
7254             Captain Newman, M.D. (1963)
Name: title, dtype: object

### Средние оценки (+ median, variance, etc.) пользователя и фильма

In [25]:
# объединим таблицу tfidf по жанрам фильмов

In [42]:
genre_tfidf = pd.DataFrame(X_train_tfidf.toarray(), columns=count_vect.get_feature_names())
genre_tfidf.head()

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,fi,...,imax,musical,mystery,nogenreslisted,noir,romance,sci,thriller,war,western
0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
df_movies=pd.concat([movies, genre_tfidf], axis=1)
df_movies.head()

Unnamed: 0,movieId,title,genres,action,adventure,animation,children,comedy,crime,documentary,...,imax,musical,mystery,nogenreslisted,noir,romance,sci,thriller,war,western
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure Children Fantasy,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy Romance,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy Drama Romance,0.0,0.0,0.0,0.0,0.505015,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# объединим по тэгам 
tag_tfidf = pd.DataFrame(X_train_tfidf_tag.toarray(), columns=count_vect2.get_feature_names())
tag_tfidf.head()

Unnamed: 0,06,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001,250,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
df_tags=pd.concat([tags,tag_tfidf], axis=1)
df_tags.drop(df_tags.columns[5:1600],axis=1, inplace=True)
df_tags.head()

Unnamed: 0,userId,movieId,timestamp,06,tradition,tragedy,tragic,train,training,trains,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
0,2,60756,1445714994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,60756,1445714996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,60756,1445714992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,89774,1445715207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,89774,1445715200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
df=pd.merge(df_movies,ratings, how='outer', on='movieId')
df2=pd.merge(df,df_tags, how='outer', on=['movieId','userId'])
df2.head()

Unnamed: 0,movieId,title,genres,action,adventure,animation,children,comedy,crime,documentary,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,,,,,,,,,,
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,,,,,,,,,,
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,,,,,,,,,,
3,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,,,,,,,,,,
4,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,,,,,,,,,,


In [54]:
df2.dropna(inplace=True)

In [56]:
movies_statistic=df2.groupby('movieId')[['rating']].agg(['mean','count','std','var']).reset_index()
movies_statistic.columns=['movieId', 'mov_mean','mov_count','mov_std','mov_var']
users_statistic=df2.groupby('userId')[['rating']].agg(['mean','count','std','var']).reset_index()
users_statistic.columns=['userId', 'us_mean','us_count','us_std','us_var']

In [57]:
df_with_stats = pd.merge(df2, movies_statistic,  how='outer', on=['movieId'])
df_with_stats = pd.merge(df_with_stats, users_statistic,  how='outer', on=['userId'])


In [63]:
df_with_stats.dropna(inplace=True)
df_with_stats.head()

Unnamed: 0,movieId,title,genres,action,adventure,animation,children,comedy,crime,documentary,...,zombies,zooey,mov_mean,mov_count,mov_std,mov_var,us_mean,us_count,us_std,us_var
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,0.0,0.0,3.833333,3,0.288675,0.083333,3.777778,9,0.666667,0.444444
2,1246,Dead Poets Society (1989),Drama,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.5,2,0.0,0.0,3.777778,9,0.666667,0.444444
4,33660,Cinderella Man (2005),Drama Romance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.25,2,0.353553,0.125,3.777778,9,0.666667,0.444444
7,37729,Corpse Bride (2005),Animation Comedy Fantasy Musical Romance,0.0,0.0,0.497199,0.0,0.257724,0.0,0.0,...,0.0,0.0,3.785714,7,0.566947,0.321429,3.777778,9,0.666667,0.444444
9,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,0.0,0.0,3.833333,3,0.288675,0.083333,3.701909,1414,0.816109,0.666033


In [358]:
#sorted(count_vect.vocabulary_.items(), key=lambda kv: kv[1])

In [107]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [108]:
y = df_with_stats['rating']


In [109]:
X=df_with_stats.drop(columns=['genres', 'title', 'rating', 'movieId'])
X.head()

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,fi,...,zombies,zooey,mov_mean,mov_count,mov_std,mov_var,us_mean,us_count,us_std,us_var
0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,0.0,...,0.0,0.0,3.833333,3,0.288675,0.083333,3.777778,9,0.666667,0.444444
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,4.5,2,0.0,0.0,3.777778,9,0.666667,0.444444
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.540377,0.0,0.0,...,0.0,0.0,4.25,2,0.353553,0.125,3.777778,9,0.666667,0.444444
7,0.0,0.0,0.497199,0.0,0.257724,0.0,0.0,0.0,0.465189,0.0,...,0.0,0.0,3.785714,7,0.566947,0.321429,3.777778,9,0.666667,0.444444
9,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,0.0,...,0.0,0.0,3.833333,3,0.288675,0.083333,3.701909,1414,0.816109,0.666033


Строим модель, разделив на трейновую и тестовую выборки

In [110]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [111]:
LR=LinearRegression()
LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)

In [114]:
mean_squared_error(y_test,y_pred)**0.5

1920.335613197333