In [None]:
from surprise import KNNBaseline, KNNBasic, KNNWithMeans, SVD, SVDpp
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split, KFold
from surprise.model_selection.validation import cross_validate

import pandas as pd
import numpy as np

In [2]:
movies = pd.read_csv('../lecture-1/movies.csv')
ratings = pd.read_csv('../lecture-1/ratings.csv')

In [3]:
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


In [4]:
user_rating_mean = ratings.groupby('userId', sort=False)[['rating']].mean()
user_rating_mean.head()

Unnamed: 0_level_0,rating
userId,Unnamed: 1_level_1
1,4.366379
2,3.948276
3,2.435897
4,3.555556
5,3.636364


In [6]:
# количество оценок по фильмам
movie_rating_count = ratings.groupby('movieId', sort=False)[['rating']].count()
# средняя оценка фильма
movie_rating_mean = ratings.groupby('movieId', sort=False)[['rating']].mean()
# количество оценок по фильмам
movie_rating_count = ratings.groupby('movieId', sort=False)[['rating']].count()

In [7]:
movie_rating_count.head(2)

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,215
3,52


In [8]:
movie_rating_mean.head(2)

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,3.92093
3,3.259615


In [9]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [10]:
movies_with_ratings.head(2)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982703.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847434962.0


In [11]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.movieId,
    'rating': movies_with_ratings.rating
})

In [12]:
view_dict = movies_with_ratings[['movieId', 'title']].drop_duplicates().set_index('movieId').to_dict()['title']

In [13]:
dataset.head(2)

Unnamed: 0,uid,iid,rating
0,1.0,1,4.0
1,5.0,1,4.0


In [14]:
ratings.rating.min()

0.5

In [15]:
ratings.rating.max()

5.0

In [16]:
from sklearn import model_selection

In [17]:
# разделим обучающий набор на обучающий и валидационный.
X_train, X_val = model_selection.train_test_split(dataset, test_size=0.15, random_state=1)

In [18]:
algs = [KNNBaseline(), KNNBasic(), KNNWithMeans(), SVD()]
kf = KFold(n_splits=5, random_state=42)
reader = Reader(rating_scale=(0.5, 5.0))

In [19]:
def get_X_meta(algs, X, user_rating_mean, movie_rating_mean, movie_rating_count):
    '''
    Функция возвращает DataFrame, столбцы которого предсатвляют собой предсказания ансамбля algs
    на обучающем наборе данных, а также среднюю оценку пользователя, среднюю оценку фильма
    и количество оценок у фильма.
    Данный DataFrame будет использоваться для обучения мета модели.
    '''
    np.random.seed(42)
    X_surp = Dataset.load_from_df(X, reader)
    result_dfs = []
    for trainset, testset in kf.split(X_surp):
        result = []
        for i, alg in enumerate(algs):
            alg.fit(trainset)
            test_predict = alg.test(testset)
            result.append(pd.DataFrame(test_predict, columns=['uid', 'iid', 'rui', 'est', 'details']
                                      )[['uid', 'iid', 'rui', 'est']])
        result_df = result[0][['uid', 'iid', 'rui']]
        for i, res in enumerate(result):
            result_df['est_' + str(algs[i]).split(' object')[0].split('.')[-1]] = result[i]['est']
        result_dfs.append(result_df)
    fin_df = pd.concat(result_dfs)
    
    fin_df = pd.merge(fin_df, user_rating_mean.rename(columns={'rating':'user_rating_mean'}), 
                  left_on='uid', right_index=True)
                   
    fin_df = pd.merge(fin_df, movie_rating_mean.rename(columns={'rating':'movie_rating_mean'}), 
                  left_on='iid', right_index=True)
    
    fin_df = pd.merge(fin_df, movie_rating_count.rename(columns={'rating':'movie_rating_count'}), 
                  left_on='iid', right_index=True)
      
    return fin_df, algs

In [None]:
res_train, algs_train = get_X_meta(algs, X_train, user_rating_mean, movie_rating_mean, movie_rating_count)

In [21]:
X_meta_train = res_train.drop(['uid', 'iid', 'rui'], axis=1)
y_meta_train = res_train['rui']

In [71]:
from sklearn.ensemble import RandomForestRegressor

In [80]:
meta_alg = RandomForestRegressor(n_estimators=50)

In [81]:
meta_alg.fit(X_meta_train, y_meta_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=50,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [None]:
res_val, algs_val = get_X_meta(algs, X_val, user_rating_mean, movie_rating_mean, movie_rating_count)

In [82]:
res_val.head()

Unnamed: 0,uid,iid,rui,est_KNNBaseline,est_KNNBasic,est_KNNWithMeans,est_SVD,user_rating_mean,movie_rating_mean,movie_rating_count,meta_est
0,599.0,60365,1.5,2.672107,3.498678,3.498678,2.645168,2.64205,1.5,1,4.96
136,599.0,4015,2.0,1.647163,2.327801,1.614505,2.432642,2.64205,2.727273,33,4.5
2279,182.0,4015,1.5,3.07752,2.768786,3.14683,3.186372,3.511259,2.727273,33,4.97
2368,232.0,4015,3.0,1.966267,1.742857,1.934221,3.308747,3.25058,2.727273,33,4.41
198,599.0,3591,2.5,3.070241,4.0,2.811185,2.823023,2.64205,2.733333,15,4.96


In [83]:
res_val['meta_est'] = meta_alg.predict(res_val.iloc[:,3:10])

In [84]:
res_val.head()

Unnamed: 0,uid,iid,rui,est_KNNBaseline,est_KNNBasic,est_KNNWithMeans,est_SVD,user_rating_mean,movie_rating_mean,movie_rating_count,meta_est
0,599.0,60365,1.5,2.672107,3.498678,3.498678,2.645168,2.64205,1.5,1,1.493333
136,599.0,4015,2.0,1.647163,2.327801,1.614505,2.432642,2.64205,2.727273,33,2.55
2279,182.0,4015,1.5,3.07752,2.768786,3.14683,3.186372,3.511259,2.727273,33,2.86
2368,232.0,4015,3.0,1.966267,1.742857,1.934221,3.308747,3.25058,2.727273,33,3.22
198,599.0,3591,2.5,3.070241,4.0,2.811185,2.823023,2.64205,2.733333,15,1.51


In [85]:
from sklearn.metrics import mean_squared_error

In [156]:
np.sqrt(mean_squared_error(res_val['rui'], res_val['meta_est']))

0.868195068721334

In [None]:
# получаем DataFrame с признаками для обучения мета алгоритма на всем датасете.
res_fin, algs_fin = get_X_meta(algs, dataset, user_rating_mean, movie_rating_mean, movie_rating_count)

In [34]:
res_fin.head()

Unnamed: 0,uid,iid,rui,est_KNNBaseline,est_KNNBasic,est_KNNWithMeans,est_SVD,user_rating_mean,movie_rating_mean,movie_rating_count
0,599.0,5418,3.0,3.141719,3.794167,3.013751,3.215928,2.64205,3.816964,112
14749,282.0,5418,4.5,4.137919,4.0206,4.215653,4.056181,4.033755,3.816964,112
14866,414.0,5418,4.0,3.858364,4.126701,3.691005,3.742565,3.391957,3.816964,112
5858,448.0,5418,3.0,3.314452,3.869553,3.123381,3.388796,2.847371,3.816964,112
3799,391.0,5418,4.0,3.755288,3.670617,3.850964,4.117101,3.715026,3.816964,112


In [35]:
X_meta_fin = res_fin.drop(['uid', 'iid', 'rui'], axis=1)
y_meta_fin = res_fin['rui']

In [87]:
meta_alg_fin = RandomForestRegressor(n_estimators=50)

In [88]:
meta_alg_fin.fit(X_meta_fin, y_meta_fin)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=50,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [89]:
res_fin.head(1)

Unnamed: 0,uid,iid,rui,est_KNNBaseline,est_KNNBasic,est_KNNWithMeans,est_SVD,user_rating_mean,movie_rating_mean,movie_rating_count
0,599.0,5418,3.0,3.141719,3.794167,3.013751,3.215928,2.64205,3.816964,112


In [91]:
def rate_predictions(uid, df, algs, meta_alg):
    """
    Функция возвращающает DataFrame с ожидаемыми оценками для фильмов, которые еще
    не смотрел пользователь uid.
    """
    predictions = []
    film_ids = set(df['iid'].unique())
    watched_film_ids = set(df[df['uid']==uid]['iid'].unique())
    not_watched_film_ids = np.array(list(film_ids - watched_film_ids))
    user_mean_rate = df[df['uid']==uid]['user_rating_mean'].iloc[0]
    for film_id in not_watched_film_ids:
        alg_predictions = [alg.predict(uid, film_id).est for alg in algs]
        movie_mean_rate = df[df['iid']==film_id]['movie_rating_mean'].iloc[0]
        movie_rate_count = df[df['iid']==film_id]['movie_rating_count'].iloc[0]
        predictions.append((*alg_predictions, user_mean_rate, movie_mean_rate, movie_rate_count))
    
    return pd.DataFrame(np.c_[not_watched_film_ids, meta_alg.predict(np.array(predictions))], columns = ['iid', 'est_rate'])

In [140]:
film_rates = rate_predictions(15, res_fin, algs_fin, meta_alg_fin)

In [141]:
# добавим названия фильмов.
film_rates['Title'] = film_rates['iid'].map(view_dict)

In [143]:
film_rates = pd.merge(film_rates, movie_rating_count.rename(columns={'rating': 'rate_count'}), left_on='iid', right_index=True)

In [145]:
film_rates.head(1)

Unnamed: 0,iid,est_rate,Title,rate_count
0,2.0,2.81,Jumanji (1995),110


In [147]:
mean_rate_count = film_rates['rate_count'].mean()
std_rate_count = film_rates['rate_count'].std()

In [148]:
# прежде, чем рекомендовать фильмы пользователю 15, посчитаем нормированную оценку фильмам,
# учитывающую количество оценок.
film_rates['normed_est_rate'] = film_rates['est_rate']*(film_rates['rate_count'] - mean_rate_count) / (film_rates['rate_count'])

In [150]:
# топ 10 рекомендуемых фильмов для пользователя 15.
film_rates.sort_values('normed_est_rate', ascending=False).head(10)

Unnamed: 0,iid,est_rate,Title,rate_count,normed_est_rate
1008,1213.0,4.25,Goodfellas (1990),126,3.938416
4080,4973.0,4.19,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",120,3.867455
550,608.0,3.98,Fargo (1996),181,3.776876
5539,7153.0,3.94,"Lord of the Rings: The Return of the King, The...",185,3.743265
1004,1208.0,4.08,Apocalypse Now (1979),107,3.727764
46,50.0,3.89,"Usual Suspects, The (1995)",204,3.713852
1015,1221.0,3.99,"Godfather: Part II, The (1974)",129,3.70428
424,457.0,3.89,"Fugitive, The (1993)",190,3.700873
104,110.0,3.85,Braveheart (1995),237,3.699938
1428,1704.0,3.94,Good Will Hunting (1997),141,3.681872
