### Рекомендации на основе содержания

#### Получение данных

In [311]:
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [312]:
!wget 'https://drive.google.com/uc?id=1m0rwReR09achL0xTM6QPoN4tykz5bOMx' -O MovieLens.zip

--2023-05-09 19:37:45--  https://drive.google.com/uc?id=1m0rwReR09achL0xTM6QPoN4tykz5bOMx
Resolving drive.google.com (drive.google.com)... 142.251.171.139, 142.251.171.138, 142.251.171.101, ...
Connecting to drive.google.com (drive.google.com)|142.251.171.139|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-0g-84-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/6792ims57el7cnr72vil0ekgt70j0m8r/1683661050000/02611596255248067438/*/1m0rwReR09achL0xTM6QPoN4tykz5bOMx?uuid=6029c01e-f72a-4631-a1ad-e5d9e0f4c3ef [following]
--2023-05-09 19:37:47--  https://doc-0g-84-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/6792ims57el7cnr72vil0ekgt70j0m8r/1683661050000/02611596255248067438/*/1m0rwReR09achL0xTM6QPoN4tykz5bOMx?uuid=6029c01e-f72a-4631-a1ad-e5d9e0f4c3ef
Resolving doc-0g-84-docs.googleusercontent.com (doc-0g-84-docs.googleusercontent.com)... 108.177.111.132, 2607:f8b0:4001:c07::84
Connecting to d

In [313]:
!unzip MovieLens.zip

Archive:  MovieLens.zip
replace links.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [314]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

#### (Из лекции) Tfidf и ближайшие соседи по жанрам 

In [315]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [316]:
movies.genres.values

array(['Adventure|Animation|Children|Comedy|Fantasy',
       'Adventure|Children|Fantasy', 'Comedy|Romance', ..., 'Drama',
       'Action|Animation', 'Comedy'], dtype=object)

In [317]:
movie_genres = [change_string(g) for g in movies.genres.values]
movie_genres[:10]

['Adventure Animation Children Comedy Fantasy',
 'Adventure Children Fantasy',
 'Comedy Romance',
 'Comedy Drama Romance',
 'Comedy',
 'Action Crime Thriller',
 'Comedy Romance',
 'Adventure Children',
 'Action',
 'Action Adventure Thriller']

In [318]:
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(movie_genres)
X_train_tfidf

<9742x20 sparse matrix of type '<class 'numpy.float64'>'
	with 22084 stored elements in Compressed Sparse Row format>

In [319]:
pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf.get_feature_names_out())

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.000000,0.482990,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.000000,0.512361,0.000000,0.620525,0.000000,0.0,0.0,0.000000,0.593662,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.570915,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.505015,0.0,0.0,0.466405,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0.436010,0.000000,0.614603,0.000000,0.318581,0.0,0.0,0.000000,0.575034,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9738,0.000000,0.000000,0.682937,0.000000,0.354002,0.0,0.0,0.000000,0.638968,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9739,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9740,0.578606,0.000000,0.815607,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [320]:
neigh = NearestNeighbors(n_neighbors=7, metric='euclidean') 
neigh.fit(X_train_tfidf)

In [321]:
test = change_string("Adventure|Comedy|Fantasy|Documentary")

X_tfidf = tfidf.transform([test])

res = neigh.kneighbors(X_tfidf, return_distance=True)

In [322]:
res

(array([[0.38758619, 0.56525059, 0.58114553, 0.65617877, 0.65617877,
         0.66834044, 0.66834044]]),
 array([[8014, 5836, 8161, 7597, 4853,  863, 3376]]))

#### (Из лекции) Tfidf и ближайшие соседи по тегами 

In [323]:
movies_with_tags = movies.merge(tags, on='movieId')

In [324]:
movies_with_tags.tag.unique().shape

(1589,)

In [325]:
movies_with_tags.dropna(inplace=True)

In [326]:
def change_string(s):
    return str(s).replace(' ', '').replace('-', '').lower()

tag_strings = []
movies = []

for movie, group in tqdm(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([change_string(s) for s in group.tag.values]))
    movies.append(movie)

  0%|          | 0/1572 [00:00<?, ?it/s]

In [327]:
tfidf_tag = TfidfVectorizer()
X_train_tfidf_tag = tfidf_tag.fit_transform(tag_strings)
X_train_tfidf_tag

<1572x1472 sparse matrix of type '<class 'numpy.float64'>'
	with 3598 stored elements in Compressed Sparse Row format>

In [328]:
pd.DataFrame(X_train_tfidf_tag.toarray(), columns=tfidf_tag.get_feature_names_out())

Unnamed: 0,06oscarnominatedbestmovieanimation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001like,2danimation,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.420342
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


In [329]:
neigh_tag = NearestNeighbors(n_neighbors=10, p=1) 
neigh_tag.fit(X_train_tfidf_tag)

In [330]:
test = 'highschool pixar fun'

X_tfidf_tag = tfidf_tag.transform([test])

res = neigh_tag.kneighbors(X_tfidf_tag, return_distance=True)

In [331]:
for i in res[1][0]:
    print(movies[i], tag_strings[i])

Bug's Life, A (1998) pixar
In a Lonely Place (1950) l.a.
Magnolia (1999) l.a.
Toy Story (1995) pixar pixar fun
127 Hours (2010) stranded
21 Grams (2003) depressing
11'09"01 - September 11 (2002) terrorism
101 Dalmatians (One Hundred and One Dalmatians) (1961) disney
...And Justice for All (1979) lawyers
13 Going on 30 (2004) markruffalo


#### (Задание) Построить рекомендации на Tfidf по жанрам и тегам и оценить результат RMSE

Добавлем данные о рэйтинге

In [332]:
movies_with_tags_and_ratings = movies_with_tags.merge(ratings, on=['movieId', 'userId'])
movies_with_tags_and_ratings['avg_rating'] = np.nan;

Высчитываем средний рейтинг по фильму

In [333]:
for movie, group in tqdm(movies_with_tags_and_ratings.groupby('title')):
  movies_with_tags_and_ratings.loc[movies_with_tags_and_ratings['title'] == movie, 'avg_rating'] = group["rating"].mean()

  0%|          | 0/1464 [00:00<?, ?it/s]

In [334]:
movies_with_tags_and_ratings

Unnamed: 0,movieId,title,genres,userId,tag,timestamp_x,rating,timestamp_y,avg_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,1139045764,4.0,1122227329,3.833333
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,1137206825,4.0,978575760,3.833333
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,1525286013,3.5,1525286001,3.833333
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy,1528843929,4.0,1528843890,3.750000
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,magic board game,1528843932,4.0,1528843890,3.750000
...,...,...,...,...,...,...,...,...,...
3471,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,star wars,1528934552,4.0,1528934550,4.000000
3472,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,anime,1537098582,3.5,1537098554,3.500000
3473,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,comedy,1537098587,3.5,1537098554,3.500000
3474,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,gintama,1537098603,3.5,1537098554,3.500000


In [335]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

Формируем данные для Tfidf, возьмем тэги и жанры по каждой строке

In [336]:
X = (movies_with_tags_and_ratings['tag'] + ' ' + [change_string(g) for g in movies_with_tags_and_ratings['genres'].values]).values
Y = movies_with_tags_and_ratings[movies_with_tags_and_ratings.columns[-1]]

In [337]:
Y

0       3.833333
1       3.833333
2       3.833333
3       3.750000
4       3.750000
          ...   
3471    4.000000
3472    3.500000
3473    3.500000
3474    3.500000
3475    3.500000
Name: avg_rating, Length: 3476, dtype: float64

In [338]:
X

array(['pixar Adventure Animation Children Comedy Fantasy',
       'pixar Adventure Animation Children Comedy Fantasy',
       'fun Adventure Animation Children Comedy Fantasy', ...,
       'comedy Action Animation Comedy SciFi',
       'gintama Action Animation Comedy SciFi',
       'remaster Action Animation Comedy SciFi'], dtype=object)

Формируем выборку

In [339]:
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y, test_size=0.2, random_state=2)

Напишем функцию по которой будем разбивать строку на токены в Tfidf

In [340]:
def my_tokenizer(text):
    if text:
        result = text.split(' ')
    else:
        result = []
    return result

In [341]:
vect = TfidfVectorizer(tokenizer=my_tokenizer)
X_train_vect = vect.fit_transform(X_train)

Обучаем Линейную регрессию

In [342]:
model = LinearRegression()
model.fit(X_train_vect, Y_train)   

Создаем пайплайн

In [343]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([('vect', vect),('model', model)])

Получаем RMSE

In [344]:
from sklearn.metrics import mean_squared_error

y_pred = pipe.predict(X_test)
len(X_test)
len(y_pred)

mean_squared_error(Y_test, y_pred, squared=False)

0.8342945323896528

Повторим те же самые шаги, только теперь преобразуем наш датасет, чтобы одному фильму соответсвовала одна строка. 

In [345]:
import warnings
warnings.filterwarnings('ignore')

one_line_for_one_film = pd.DataFrame(columns=['title', 'genres', 'tag', 'avg_rating'])

for id, group in tqdm(movies_with_tags_and_ratings.groupby('movieId')):
    tags = ' '.join(group['tag'].values)
    one_line_for_one_film = one_line_for_one_film.append({'title': group['title'].values[0], 'genres': group['genres'].values[0], 'tag': tags, 'avg_rating': group['avg_rating'].values[0]}, ignore_index=True)

  0%|          | 0/1464 [00:00<?, ?it/s]

In [346]:
print(movies_with_tags_and_ratings['title'].nunique())
print(one_line_for_one_film['title'].nunique())

1464
1464


In [347]:
X2 = (one_line_for_one_film['tag'] + ' ' + [change_string(g) for g in one_line_for_one_film['genres'].values]).values
Y2 = one_line_for_one_film[one_line_for_one_film.columns[-1]]

In [348]:
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X2,
                                                    Y2, test_size=0.2, random_state=2)

In [349]:
vect2 = TfidfVectorizer(tokenizer=my_tokenizer)
X_train_vect2 = vect2.fit_transform(X_train2)

In [350]:
model2 = LinearRegression()
model2.fit(X_train_vect2, Y_train2)   

Получаем второй RMSE

In [351]:
pipe2 = Pipeline([('vect2', vect2),('model2', model2)])

y_pred2 = pipe2.predict(X_test2)
len(X_test2)
len(y_pred2)

mean_squared_error(Y_test2, y_pred2, squared=False)

1.0052507176819745

### Выводы

1. Получилось что RMSE в случае, когда у нас одному фильму соответствует столько строк сколько у нас тэгов по фильму, меньше чем RMSE когда одному фильму соответсвует одна строка. (0.83 vs 1.0) А чем ниже RMSE тем лучше. Получается второй подход оказался хуже. С чем связано такое поведение? 

### Вопросы

В рамках задания RMSE оказался достаточно уровня? Ошибка примено в 0.8-1 балла является критичным для РС?