In [1]:
import pandas as pd

from surprise import KNNWithMeans, KNNBasic
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split


In [2]:
# Load the two source tables from the working directory. Later cells use
# movieId/title from `movies` and movieId/userId/rating from `ratings`.
movies, ratings = pd.read_csv('movies.csv'), pd.read_csv('ratings.csv')

In [3]:
# Left-join every rating onto its movie via movieId, renumber the rows, then
# discard any row that contains a missing value (for example a movie that has
# no ratings, which the left join leaves with NaN in the rating columns).
movies_with_ratings = (
    movies
    .join(ratings.set_index('movieId'), on='movieId')
    .reset_index(drop=True)
    .dropna()
)

In [4]:
# Reduce the joined table to the (user, item, rating) triple used by the rest
# of the notebook: userId -> uid, movie title -> iid, rating unchanged.
dataset = (
    movies_with_ratings[['userId', 'title', 'rating']]
    .rename(columns={'userId': 'uid', 'title': 'iid'})
)

In [45]:
# Preview the first 100 rows of the (uid, iid, rating) table.
dataset.iloc[:100]

Unnamed: 0,uid,iid,rating
0,4.0,Toy Story (1995),4.0
1,10.0,Toy Story (1995),5.0
2,14.0,Toy Story (1995),4.5
3,15.0,Toy Story (1995),4.0
4,22.0,Toy Story (1995),4.0
...,...,...,...
95,399.0,Toy Story (1995),4.0
96,400.0,Toy Story (1995),3.0
97,402.0,Toy Story (1995),4.5
98,408.0,Toy Story (1995),3.0


In [49]:
# Order the ratings by user id and display the sorted frame.
dataset = dataset.sort_values('uid')
dataset

Unnamed: 0,uid,iid,rating
16698601,1.0,Hollow Man (2000),2.0
9445638,1.0,Waiting for Guffman (1996),4.5
2425369,1.0,Three Colors: Blue (Trois couleurs: Bleu) (1993),3.5
11890107,1.0,Weird Science (1985),4.5
8277415,1.0,Better Off Dead... (1985),4.5
...,...,...,...
7446677,283228.0,Raiders of the Lost Ark (Indiana Jones and the...,5.0
20768504,283228.0,"Night at the Opera, A (1935)",4.5
11756169,283228.0,Splash (1984),4.0
14922737,283228.0,Spaceballs (1987),2.5


In [5]:
# Lowest rating value present in the raw ratings table.
ratings['rating'].min()

0.5

In [6]:
# Highest rating value present in the raw ratings table.
ratings['rating'].max()

5.0

In [74]:
# Take the first 1,300,000 rows of `dataset` (sorted by uid in an earlier cell)
# and count how many ratings each user contributes to that slice.
dataset_1 = dataset.iloc[:1300000]
dataset_1['uid'].value_counts()

4796.0     4874
7705.0     4546
11109.0    3732
2025.0     3238
2150.0     2966
           ... 
5443.0        1
1348.0        1
12169.0       1
4319.0        1
8885.0        1
Name: uid, Length: 13094, dtype: int64

In [75]:
# Number of ratings per movie title within the dataset_1 slice.
dataset_1['iid'].value_counts()

Forrest Gump (1994)                  4470
Shawshank Redemption, The (1994)     4453
Pulp Fiction (1994)                  4235
Silence of the Lambs, The (1991)     4039
Matrix, The (1999)                   3854
                                     ... 
Rabbit (2005)                           1
When We Leave (Die Fremde) (2010)       1
Such Good People (2014)                 1
Scarecrows (1988)                       1
Thunderbird Six (1968)                  1
Name: iid, Length: 23478, dtype: int64

In [120]:
# Re-order the ratings by movie title (iid) and display the result.
dataset_2 = dataset.sort_values('iid')
dataset_2

Unnamed: 0,uid,iid,rating
23439985,61133.0,"""Great Performances"" Cats (1998)",0.5
23439946,16196.0,"""Great Performances"" Cats (1998)",3.0
23439989,64533.0,"""Great Performances"" Cats (1998)",2.0
23440089,191814.0,"""Great Performances"" Cats (1998)",0.5
23440066,160365.0,"""Great Performances"" Cats (1998)",3.5
...,...,...,...
27186234,236981.0,貞子3D (2012),2.5
27186229,183200.0,貞子3D (2012),1.0
27186230,202065.0,貞子3D (2012),1.0
27476240,123100.0,줄탁동시 (2012),3.0


In [136]:
# Working sample: the first 5,000,000 rows of the title-sorted frame.
# NOTE(review): because dataset_2 is ordered by iid, this slice keeps only the
# titles that sort first alphabetically, so it is not a random sample of movies.
dataset_ = dataset_2.iloc[:5000000]
# Ratings per user inside the sample.
dataset_['uid'].value_counts()

123100.0    4013
117490.0    1701
134596.0    1385
212343.0    1375
242683.0    1300
            ... 
21837.0        1
86184.0        1
21839.0        1
86183.0        1
17228.0        1
Name: uid, Length: 258246, dtype: int64

In [137]:
# Ratings per movie title inside the working sample.
dataset_['iid'].value_counts()

Braveheart (1995)                             68803
American Beauty (1999)                        60820
Apollo 13 (1995)                              58665
Back to the Future (1985)                     57492
Batman (1989)                                 54448
                                              ...  
Battement de coeur (1940)                         1
Bang! (1977)                                      1
Black Circle Boys (1997)                          1
Beside Bowie: The Mick Ronson Story (2017)        1
An Uncertain Season (1988)                        1
Name: iid, Length: 9032, dtype: int64

In [138]:
# Wrap the working sample for Surprise. The scale bounds are the minimum (0.5)
# and maximum (5.0) ratings reported by the earlier min()/max() cells.
reader = Reader(rating_scale=(0.5, 5.0))
# load_from_df reads the frame's columns positionally as user, item, rating,
# which is the order in which dataset_ holds uid, iid, rating.
data = Dataset.load_from_df(df=dataset_, reader=reader)

In [139]:
# Hold out 10% of the ratings for evaluation. random_state pins the shuffle so
# that Restart & Run All reproduces the same split (and therefore a comparable
# RMSE); without it every run draws a different split.
trainset, testset = train_test_split(data, test_size=.1, random_state=42)

In [140]:
# Item-based KNN (user_based=False) that takes mean ratings into account,
# using the pearson_baseline similarity and at most k=5 neighbours.
algo_2 = KNNWithMeans(
    k=5,
    sim_options=dict(name='pearson_baseline', user_based=False),
)

In [141]:
# Train on the 90% split. Per the recorded log this estimates baselines with
# ALS and then builds the pearson_baseline similarity matrix.
algo_2.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x4539f88d0>

In [142]:
# Run the fitted model over the held-out split; the resulting predictions are
# scored with accuracy.rmse in the next cell.
test_pred = algo_2.test(testset)

In [143]:
# RMSE on the held-out split; verbose=True also prints the value.
accuracy.rmse(predictions=test_pred, verbose=True)

RMSE: 0.8734


0.8734324188784747

In [144]:
# 5-fold cross-validation of the same algorithm on the working sample; the
# algorithm is refit once per fold. An explicit KFold with a fixed random_state
# replaces the bare cv=5 so that fold assignment is repeatable between runs
# (cv=5 alone shuffles with an unseeded RNG).
cross_validate(
    algo_2,
    data,
    measures=['RMSE'],
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    verbose=True,
)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8829  0.8821  0.8819  0.8823  0.8824  0.8823  0.0003  
Fit time          56.95   58.74   56.06   55.81   58.13   57.14   1.14    
Test time         97.83   97.21   96.00   365.76  2415.12 614.39  906.36  


{'test_rmse': array([0.88286774, 0.88205122, 0.88193261, 0.88227287, 0.88235712]),
 'fit_time': (56.952451944351196,
  58.73997497558594,
  56.06044292449951,
  55.80573105812073,
  58.12677311897278),
 'test_time': (97.82983112335205,
  97.21352410316467,
  96.00209498405457,
  365.7647318840027,
  2415.122618198395)}

Вначале хотел обучить модель на всём датасете — в итоге не хватило оперативной памяти (RAM).  
Потом брал первые 100000 строк из датасета — в итоге получил высокое значение RMSE (от 1.06).  
Затем решил отсортировать датасет по uid и по iid.  
В итоге обнаружил, что при обработке первых 5000000 строк датасета, отсортированного по iid, количество уникальных фильмов — 9032, а пользователей — 258246, поэтому решил использовать ITEM-BASED COLLABORATIVE FILTERING, т. к. пользователей гораздо больше, чем объектов. В итоге получил при кросс-валидации на 5 фолдах RMSE (testset): 0.8829, 0.8821, 0.8819, 0.8823, 0.8824.  
При делении данных на train и test получил RMSE: 0.8734.  
На этом я остановился. 