In [2]:
from surprise import KNNWithMeans, KNNBasic
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import pandas as pd
from surprise.model_selection import cross_validate

from tqdm.notebook import tqdm

In [3]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [4]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [5]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [6]:
dataset.uid.value_counts()

123100.0    23715
117490.0     9279
134596.0     8381
212343.0     7884
242683.0     7515
            ...  
223518.0        1
10564.0         1
148678.0        1
148663.0        1
185369.0        1
Name: uid, Length: 283228, dtype: int64

In [7]:
ser = dataset.iid.value_counts()

Я обнаружил, что данном датасете уникальных фильмов < уникальных юзеров. Буду использовать подход ITEM-BASED COLLABORATIVE FILTERING. 

Для начала уберем малоинформативные объекты.

In [8]:
df = pd.DataFrame({'iid':ser.index, 'value':ser.values})

n_dataset = dataset.join(df.set_index('iid'), on='iid').reset_index(drop=True)
n_dataset = n_dataset.query('value > 100 ')
n_dataset

Unnamed: 0,uid,iid,rating,value
0,4.0,Toy Story (1995),4.0,68469
1,10.0,Toy Story (1995),5.0,68469
2,14.0,Toy Story (1995),4.5,68469
3,15.0,Toy Story (1995),4.0,68469
4,22.0,Toy Story (1995),4.0,68469
...,...,...,...,...
27752361,261224.0,Crazy Rich Asians (2018),3.5,110
27752362,261870.0,Crazy Rich Asians (2018),0.5,110
27752363,266861.0,Crazy Rich Asians (2018),3.5,110
27752364,275841.0,Crazy Rich Asians (2018),3.0,110


In [9]:
dataset

Unnamed: 0,uid,iid,rating
0,4.0,Toy Story (1995),4.0
1,10.0,Toy Story (1995),5.0
2,14.0,Toy Story (1995),4.5
3,15.0,Toy Story (1995),4.0
4,22.0,Toy Story (1995),4.0
...,...,...,...
27757648,176871.0,Les tribulations d'une caissière (2011),2.0
27757649,81710.0,Her Name Was Mumu (2016),2.0
27757650,33330.0,Flora (2017),2.0
27757651,206009.0,Leal (2018),2.5


Убрали ~ 2% фильмов

In [10]:
n_dataset.drop(['value'], axis='columns', inplace=True)
n_dataset

Unnamed: 0,uid,iid,rating
0,4.0,Toy Story (1995),4.0
1,10.0,Toy Story (1995),5.0
2,14.0,Toy Story (1995),4.5
3,15.0,Toy Story (1995),4.0
4,22.0,Toy Story (1995),4.0
...,...,...,...
27752361,261224.0,Crazy Rich Asians (2018),3.5
27752362,261870.0,Crazy Rich Asians (2018),0.5
27752363,266861.0,Crazy Rich Asians (2018),3.5
27752364,275841.0,Crazy Rich Asians (2018),3.0


Перемешаем 

In [11]:
n1_dataset = n_dataset.sample(frac=1).reset_index(drop=True)

Используем первые 100000 строк для тестов.  
Остальные строки для обучения.  

In [12]:
dataset_test = n1_dataset[:100000]
dataset_train = n1_dataset[100001:]

Обучим модель KNNWithMeans на dataset_train[100001:1100000].  

In [71]:
dataset_train_1 = dataset_train[100001:1100000]

In [72]:
reader = Reader(rating_scale=(dataset_train_1.rating.min(), dataset_train_1.rating.max()))
data = Dataset.load_from_df(dataset_train_1, reader)

In [73]:
trainset, _ = train_test_split(data, test_size=.01)

In [74]:
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x2569bc290>

Проведем тест модели KNNWithMeans, обученной на 1000000 знач.

In [75]:
reader = Reader(rating_scale=(dataset_test.rating.min(), dataset_test.rating.max()))
data_test = Dataset.load_from_df(dataset_test, reader)
_, testset = train_test_split(data_test, test_size=.99)

In [76]:
test_pred = algo.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 1.0219


1.0219298181622125

Для улучшения модели KNNWithMeans буду использовать идею стекинга, только все слабые ученики будут KNNWithMeans. Для этого я разбил dataset_train на 10 фолдов. Напомню, что я изначально отделил 1 фолд на тест.

In [29]:
def train_algo(dataset):
    reader = Reader(rating_scale=(ratings.rating.min(), ratings.rating.max()))
    data = Dataset.load_from_df(dataset, reader)
    trainset, testset = train_test_split(data, test_size=.10)
    algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})
    algo.fit(trainset)
    test_pred = algo.test(testset)
    movieId_ratings = pd.DataFrame()
    userId = []
    movie = []
    rating = []
#     for var in tqdm(test_pred):
    for var in test_pred:
        userId.append(var[0])
        movie.append(var[1])
        rating.append(var[3])
    movieId_ratings['userId'] = userId
    movieId_ratings['movie'] = movie
    movieId_ratings['ratings'] = rating
    return movieId_ratings

In [66]:
movie_ratings_train = pd.DataFrame()
for i in tqdm(range(0,10)):
    dataset_train_n = dataset_train[i*1000000:(i+1)*1000000]
    movie_ratings_train_next = train_algo(dataset_train_n)
    movie_ratings_train = pd.concat([movie_ratings_train, movie_ratings_train_next])

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))





In [67]:
movie_ratings_train

Unnamed: 0,userId,movie,ratings
0,40482.0,Dazed and Confused (1993),3.347800
1,93806.0,Chariots of Fire (1981),4.066941
2,203652.0,Seven (a.k.a. Se7en) (1995),2.593205
3,90252.0,Lady in the Water (2006),2.482451
4,228998.0,Pirates of Silicon Valley (1999),3.769231
...,...,...,...
99995,80406.0,"Wonderful, Horrible Life of Leni Riefenstahl, ...",3.976190
99996,157809.0,Gone in 60 Seconds (2000),2.184211
99997,137786.0,Four Rooms (1995),3.462085
99998,171745.0,Survivor (2015),3.250000


In [68]:
reader = Reader(rating_scale=(movie_ratings_train.ratings.min(), movie_ratings_train.ratings.max()))
data = Dataset.load_from_df(movie_ratings_train, reader)
trainset, _ = train_test_split(data, test_size=.01)

In [69]:
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x25698a750>

In [70]:
test_pred = algo.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.9690


0.9689912242253184

Вывод:  
Результат на лицо, при "стекинге" RMSE: 0.9690 vs RMSE: 1.0219.   
Результат: улучшение модели на ~0,05

Все тоже самое проведем для модели SVD на dataset_train[100001:1100000].

In [77]:
dataset_train_1 = dataset_train[100001:1100000]
reader = Reader(rating_scale=(dataset_train_1.rating.min(), dataset_train_1.rating.max()))
data = Dataset.load_from_df(dataset_train_1, reader)
trainset, _ = train_test_split(data, test_size=.01)
algo_SVD = SVD(n_factors=20, n_epochs=20)
algo_SVD.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x174879a10>

In [78]:
reader = Reader(rating_scale=(dataset_test.rating.min(), dataset_test.rating.max()))
data_test = Dataset.load_from_df(dataset_test, reader)
_, testset = train_test_split(data_test, test_size=.99)
test_pred = algo.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 1.0217


1.0217255665562859

In [88]:
def train_algo_SVD(dataset):
    reader = Reader(rating_scale=(ratings.rating.min(), ratings.rating.max()))
    data = Dataset.load_from_df(dataset, reader)
    trainset, testset = train_test_split(data, test_size=.10)
#     algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})
    algo = SVD(n_factors=20, n_epochs=20)
    algo.fit(trainset)
    test_pred = algo.test(testset)
    movieId_ratings = pd.DataFrame()
    userId = []
    movie = []
    rating = []
    for var in tqdm(test_pred):
#     for var in test_pred:
        userId.append(var[0])
        movie.append(var[1])
        rating.append(var[3])
    movieId_ratings['userId'] = userId
    movieId_ratings['movie'] = movie
    movieId_ratings['ratings'] = rating
    return movieId_ratings

Для улучшения модели SVD буду использовать идею стекинга, слабые ученики будут KNNWithMeans и SVD. Для этого я разбил dataset_train на 10 фолдов. Напомню, что я изначально отделил 1 фолд на тест.
Сильным учеником будет KNNWithMeans.

In [89]:
movie_ratings_train_SVD = pd.DataFrame()
for i in tqdm(range(0,10)):
    dataset_train_n = dataset_train[i*1000000:(i+1)*1000000]
    if i % 2:
        movie_ratings_train_next = train_algo_SVD(dataset_train_n)
    else:
        movie_ratings_train_next = train_algo(dataset_train_n)
    movie_ratings_train_SVD = pd.concat([movie_ratings_train_SVD, movie_ratings_train_next])

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))





In [90]:
reader = Reader(rating_scale=(movie_ratings_train_next.ratings.min(), movie_ratings_train_next.ratings.max()))
data = Dataset.load_from_df(movie_ratings_train_next, reader)
trainset, _ = train_test_split(data, test_size=.01)
algo_ct = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo_ct.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x29c6e3e50>

In [91]:
reader = Reader(rating_scale=(dataset_test.rating.min(), dataset_test.rating.max()))
data_test = Dataset.load_from_df(dataset_test, reader)
_, testset = train_test_split(data_test, test_size=.99)
test_pred = algo_ct.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 1.0094


1.0093792277953062

Вывод:  
При "стекинге" RMSE: 1.0094 vs RMSE: 1.0217.   
Результат: улучшение модели на ~0,01