### Оксана Рудковская, Рекомендации на основе содержания

In [1]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv',index_col = 'movieId')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [4]:
tags.movieId.unique().shape

(1572,)

Создаю таблицу с id фильмами и сгруппированными тегами:

In [5]:
tag_group = pd.DataFrame(columns = ['movieId', 'tags'])

i = 0
for movieid, group in tqdm(tags.groupby('movieId')):
    tag_group.loc[i] = [movieid,' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values])]
    i += 1

HBox(children=(FloatProgress(value=0.0, max=1572.0), HTML(value='')))




In [6]:
movies_with_tags = movies.join(tag_group.set_index('movieId'), on='movieId')

In [7]:
tag_group

Unnamed: 0,movieId,tags
0,1,pixar pixar fun
1,2,fantasy magicboardgame RobinWilliams game
2,3,moldy old
3,5,pregnancy remake
4,7,remake
...,...,...
1567,183611,Comedy funny RachelMcAdams
1568,184471,adventure AliciaVikander videogameadaptation
1569,187593,JoshBrolin RyanReynolds sarcasm
1570,187595,EmiliaClarke starwars


In [8]:
rating = ratings.groupby('movieId')['rating'].mean() # Средние оценки для фильмов

In [9]:
movies = movies.join(rating)

In [10]:
movies_all = movies.join(tag_group.set_index('movieId'), on='movieId')

In [11]:
movies_all.info() # объединила все нужные мне фичи в один датафрейм

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9742 entries, 1 to 193609
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   title   9742 non-null   object 
 1   genres  9742 non-null   object 
 2   rating  9724 non-null   float64
 3   tags    1572 non-null   object 
dtypes: float64(1), object(3)
memory usage: 700.5+ KB


In [12]:
movies_all.dropna(inplace=True)

In [13]:
y = movies_all.rating
del movies_all['rating']

In [14]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

Применяю CountVectorizer для жанров и тегов

In [15]:
movie_genres = [change_string(g) for g in movies_all.genres.values]

In [16]:
count_vect = CountVectorizer()
X_counts_genres = count_vect.fit_transform(movie_genres)

In [17]:
tfidf_transformer = TfidfTransformer()
X_tfidf_genres = tfidf_transformer.fit_transform(X_counts_genres)

In [18]:
count_vect = CountVectorizer()
X_counts_tags = count_vect.fit_transform(list(movies_all.tags.values))

In [19]:
tfidf_transformer = TfidfTransformer()
X_tfidf_tags = tfidf_transformer.fit_transform(X_counts_tags)

Объединяю tf-idf признаки по тегам и жанрам:

In [20]:
X_tr = pd.DataFrame(np.hstack((X_tfidf_tags.toarray(), X_tfidf_genres.toarray())))

In [21]:
X_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1479,1480,1481,1482,1483,1484,1485,1486,1487,1488
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.766978,0.000000,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.766978,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1549,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.634189,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
1550,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
1551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.675263,0.0,0.0,0.0
1552,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.502408,0.0,0.0,0.0


In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_tr, y, test_size = 0.2)

Обучаю модель:

In [23]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [24]:
params_rf = {'n_estimators' : [100,200,500,800,1000,1200],
           'max_depth' : [3,5,7,10,15,25,40,None],
           'min_samples_split':[2,4,6,10],
           'min_samples_leaf':[2,4,6,8]   
           }

search_rfr = RandomizedSearchCV(RandomForestRegressor(), params_rf, scoring='neg_mean_squared_error',cv=5, n_jobs=6, n_iter = 10)
search_rfr.fit(X_train,y_train)


print(search_rfr.best_params_)
print(search_rfr.best_estimator_)

{'n_estimators': 200, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_depth': 7}
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=7, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=2,
                      min_samples_split=6, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)


In [25]:
RFR = search_rfr.best_estimator_
RFR.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=7, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=2,
                      min_samples_split=6, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [26]:
rmse_rf = np.sqrt(mean_squared_error(y_test, RFR.predict(X_test)))

In [27]:
rmse_rf

0.47791383511321317