ПАКЕТ SURPRISE

используйте данные MovieLens 1M

можно использовать любые модели из пакета

получите RMSE на тестовом сете 0.87 и ниже

Комментарий преподавателя :

В ДЗ на датасет 1М может не хватить RAM. Можно сделать на 100K. Качество RMSE предлагаю считать на основе CrossValidation (5 фолдов), а не отложенном датасете.

In [1]:
from surprise import KNNWithMeans, KNNBasic
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import pandas as pd

In [2]:
movies = pd.read_table('ml-1m/movies.dat', sep='::', names=['movieId', 'title', 'genres'])
ratings = pd.read_table('ml-1m/ratings.dat', sep='::', names=['userId', 'movieId', 'rating', 'timestamp'])

  return read_csv(**locals())


In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   userId     1000209 non-null  int64
 1   movieId    1000209 non-null  int64
 2   rating     1000209 non-null  int64
 3   timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


In [5]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [6]:
movies_with_ratings[movies_with_ratings.userId == 2.0].title.unique()

array(['Get Shorty (1995)', 'Broken Arrow (1996)', 'Braveheart (1995)',
       'Desperado (1995)', 'Die Hard: With a Vengeance (1995)',
       'Ed Wood (1994)',
       'Like Water for Chocolate (Como agua para chocolate) (1992)',
       'Outbreak (1995)', 'Shawshank Redemption, The (1994)',
       'Clear and Present Danger (1994)', 'Forrest Gump (1994)',
       'Maverick (1994)', 'True Lies (1994)', 'Cliffhanger (1993)',
       'Demolition Man (1993)', 'Fugitive, The (1993)',
       'Getaway, The (1994)', 'Jurassic Park (1993)', 'Mr. Jones (1993)',
       'Remains of the Day, The (1993)',
       'Terminator 2: Judgment Day (1991)', 'Dances with Wolves (1990)',
       'Silence of the Lambs, The (1991)', 'Courage Under Fire (1996)',
       'Mission: Impossible (1996)', 'Twister (1996)',
       'Independence Day (ID4) (1996)', "Breakfast at Tiffany's (1961)",
       'Gone with the Wind (1939)', 'Picnic (1955)',
       'Bonnie and Clyde (1967)', 'Platoon (1986)',
       "Sophie's Choice (1

In [7]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [8]:
dataset.head()

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),5.0
1,6.0,Toy Story (1995),4.0
2,8.0,Toy Story (1995),4.0
3,9.0,Toy Story (1995),5.0
4,10.0,Toy Story (1995),5.0


In [9]:
ratings.rating.min()

1

In [10]:
ratings.rating.max()

5

In [11]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [12]:
trainset, testset = train_test_split(data, test_size=.15)

In [13]:
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1326ed1c0>

In [14]:
test_pred = algo.test(testset)

In [15]:
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8861


0.8860558168582788

In [16]:
algo.predict(uid=2, iid='Fight Club (1999)')

Prediction(uid=2, iid='Fight Club (1999)', r_ui=None, est=3.9087507925032803, details={'actual_k': 50, 'was_impossible': False})

In [17]:
from surprise import SVD
from surprise.model_selection import KFold

accuracy_mean = 0
n_splits = 5
# define a cross-validation iterator
kf = KFold(n_splits=n_splits)

print('Using SVD')
algo = SVD()

for i, (trainset, testset) in enumerate(kf.split(data)):
    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    acc = accuracy.rmse(predictions, verbose=True)
    print(i, acc)
    accuracy_mean += acc

accuracy_mean /= n_splits
print('Mean:', accuracy_mean)

Using SVD
RMSE: 0.8745
0 0.8744693496219872
RMSE: 0.8752
1 0.8752019665489634
RMSE: 0.8724
2 0.8724273870804884
RMSE: 0.8758
3 0.8757972129381439
RMSE: 0.8743
4 0.8743227217859987
Mean: 0.8744437275951162


In [19]:
from surprise.prediction_algorithms.baseline_only import BaselineOnly
from surprise.model_selection import KFold

accuracy_mean = 0
n_splits = 5
# define a cross-validation iterator
kf = KFold(n_splits=n_splits)

print('Using SGD')
bsl_options = {
    'method': 'sgd',
    'learning_rate': .00005,
}
algo = BaselineOnly(bsl_options=bsl_options)

for i, (trainset, testset) in enumerate(kf.split(data)):
    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    acc = accuracy.rmse(predictions, verbose=True)
    print(i, acc)
    accuracy_mean += acc

accuracy_mean /= n_splits
print('Mean:', accuracy_mean)

Using SGD
Estimating biases using sgd...
RMSE: 1.0162
0 1.0161642249330074
Estimating biases using sgd...
RMSE: 1.0132
1 1.0132441783199726
Estimating biases using sgd...
RMSE: 1.0097
2 1.0096992832999658
Estimating biases using sgd...
RMSE: 1.0129
3 1.0129276167816197
Estimating biases using sgd...
RMSE: 1.0108
4 1.010756109150782
Mean: 1.0125582824970696


In [20]:
from surprise.prediction_algorithms.baseline_only import BaselineOnly
from surprise.model_selection import KFold

accuracy_mean = 0
n_splits = 5
# define a cross-validation iterator
kf = KFold(n_splits=n_splits)

print('Using ALS')
bsl_options = {
    'method': 'als',
    'n_epochs': 5,
    'reg_u': 12,
    'reg_i': 5,
}
algo = BaselineOnly(bsl_options=bsl_options)

for i, (trainset, testset) in enumerate(kf.split(data)):
    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    acc = accuracy.rmse(predictions, verbose=True)
    print(i, acc)
    accuracy_mean += acc

accuracy_mean /= n_splits
print('Mean:', accuracy_mean)

Using ALS
Estimating biases using als...
RMSE: 0.9070
0 0.9069707917105925
Estimating biases using als...
RMSE: 0.9085
1 0.9084585412757212
Estimating biases using als...
RMSE: 0.9097
2 0.9096534089168071
Estimating biases using als...
RMSE: 0.9084
3 0.908409994374723
Estimating biases using als...
RMSE: 0.9065
4 0.9064945082010328
Mean: 0.9079974488957753
