In [26]:
import warnings
warnings.simplefilter('ignore')

import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.base import BaseEstimator

%pylab inline

Populating the interactive namespace from numpy and matplotlib


## Откроем датасет и проверим данные

In [18]:
data = pd.read_json('Video_Games_5.json', lines=True)

In [19]:
data.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5,True,"10 17, 2015",A1HP7NVNPFMA4N,700026657,Ambrosia075,"This game is a bit hard to get the hang of, bu...",but when you do it's great.,1445040000,,,
1,4,False,"07 27, 2015",A1JGAP0185YJI6,700026657,travis,I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it",1437955200,,,
2,3,True,"02 23, 2015",A1YJWEXHQBWK2B,700026657,Vincent G. Mezera,ok game.,Three Stars,1424649600,,,
3,2,True,"02 20, 2015",A2204E1TH211HT,700026657,Grandma KR,"found the game a bit too complicated, not what...",Two Stars,1424390400,,,
4,5,True,"12 25, 2014",A2RF5B5H74JLPE,700026657,jon,"great game, I love it and have played it since...",love this game,1419465600,,,


In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497577 entries, 0 to 497576
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   overall         497577 non-null  int64 
 1   verified        497577 non-null  bool  
 2   reviewTime      497577 non-null  object
 3   reviewerID      497577 non-null  object
 4   asin            497577 non-null  object
 5   reviewerName    497501 non-null  object
 6   reviewText      497419 non-null  object
 7   summary         497468 non-null  object
 8   unixReviewTime  497577 non-null  int64 
 9   vote            107793 non-null  object
 10  style           289237 non-null  object
 11  image           3634 non-null    object
dtypes: bool(1), int64(2), object(9)
memory usage: 42.2+ MB


In [21]:
data.describe()

Unnamed: 0,overall,unixReviewTime
count,497577.0,497577.0
mean,4.220456,1367848000.0
std,1.185424,122411300.0
min,1.0,939859200.0
25%,4.0,1316563000.0
50%,5.0,1410221000.0
75%,5.0,1452384000.0
max,5.0,1538438000.0


In [22]:
data['asin'].value_counts()

B00178630A    1381
B000ZK9QCS     905
B000XJNTNS     834
B00JK00S0S     783
B000ZKA0J6     774
              ... 
B0017GLJWY       1
B0017YHYFM       1
B000W2UJ2A       1
B000YI5BZC       1
B000XJD33E       1
Name: asin, Length: 17408, dtype: int64

In [23]:
data = data[['overall','reviewerID','asin','reviewTime']]
data.head()

Unnamed: 0,overall,reviewerID,asin,reviewTime
0,5,A1HP7NVNPFMA4N,700026657,"10 17, 2015"
1,4,A1JGAP0185YJI6,700026657,"07 27, 2015"
2,3,A1YJWEXHQBWK2B,700026657,"02 23, 2015"
3,2,A2204E1TH211HT,700026657,"02 20, 2015"
4,5,A2RF5B5H74JLPE,700026657,"12 25, 2014"


## разобьем данные на тестовую и тренировочную выборки

In [24]:
rmse = lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred))

def train_test_split(X, ratio=0.2, user_col='reviewerID', item_col='asin',
                     rating_col='overall', time_col='reviewTime'):
    # сортируем оценки по времени
    X.sort_values(by=[time_col], inplace=True)
    # список всех юзеров
    userIds = X[user_col].unique()
    X_train_data = []
    X_test_data = []
    y_train = []
    y_test = []
    for userId in tqdm_notebook(userIds):
        curUser = X[X[user_col] == userId]
        # определяем позицию, по которой делим выборку и размещаем данные по массивам
        idx = int(curUser.shape[0] * (1 - ratio))
        X_train_data.append(curUser[[user_col, item_col]].iloc[:idx, :].values)
        X_test_data.append(curUser[[user_col, item_col]].iloc[idx:, :].values)
        y_train.append(curUser[rating_col].values[:idx])
        y_test.append(curUser[rating_col].values[idx:])
    # cтекуем данные по каждому пользователю в общие массивы
    X_train = pd.DataFrame(np.vstack(X_train_data), columns=[user_col, item_col])
    X_test = pd.DataFrame(np.vstack(X_test_data), columns=[user_col, item_col])
    y_train = np.hstack(y_train)
    y_test = np.hstack(y_test)
    return X_train, X_test, y_train, y_test

In [27]:
X_train, X_test, y_train, y_test = train_test_split(data)

  0%|          | 0/55223 [00:00<?, ?it/s]

In [28]:
X_train.shape, len(y_train), X_test.shape, len(y_test)

((378280, 2), 378280, (119297, 2), 119297)

In [29]:
X_train.head()

Unnamed: 0,reviewerID,asin
0,A261TLAGXR52NH,B00002CF96
1,A261TLAGXR52NH,B00002CF8U
2,A261TLAGXR52NH,B00002CF8V
3,A261TLAGXR52NH,B0000296ZD
4,A2ZESFCRJL7YA0,B00002CF9M


In [30]:
y_train

array([4, 5, 4, ..., 5, 5, 5], dtype=int64)

## Обучение с помощью Item-based model

In [39]:
class ItemBased(BaseEstimator):
    def fit(self, X, y, user_col='reviewerID', item_col='asin'):
        X = X.copy()
        # сохраним текущих пользователей и имеющиеся предметы
        self.users = X[user_col].unique()
        self.items = X[item_col].unique()

        X['y'] = y
        # рассчитаем среднее значение рейтинга для пользователя и предмета
        self.mean_y_user = X.groupby(user_col)['y'].mean()
        self.mean_y_item = X.groupby(item_col)['y'].mean()

        # вычитаем среднюю оценку предмета
        X['y'] -= X[item_col].apply(lambda x: self.mean_y_item[x])

        # создаём векторы для каждого фильма с оценками пользователя
        # если пользователь не поставил оценку, то ставим 0
        self.item_ratings = pd.pivot_table(X, values='y', index=item_col,
                                           columns=user_col, fill_value=0)

        # считаем попарную схожесть между фильмами
        self.item_sim = cosine_similarity(self.item_ratings)

        # также сделаем словарь {значение item_col: index в item_ratings}
        self.item_pos = dict()
        for item in self.items:
            self.item_pos[item] = np.argwhere(self.item_ratings.index.values == item)[0][0]
        return self

    def predict_rating(self, pr_user, pr_item):
        # если в обучающей выборке нет такого предмета
        # или пользователя, то вернём 0
        if not pr_item in self.items or not pr_user in self.users:
            return 0

        # считаем числитель и знаменатель дроби из формулы предсказания
        numerator = self.item_sim[self.item_pos[pr_item]].dot(
                        self.item_ratings.loc[:, pr_user])
        # вычитаем 1, так как схожесть предмета с самим собой равна 1,
        # но модель не должна это учитывать
        denominator = np.abs(self.item_sim[self.item_pos[pr_item]]).sum() - 1

        return self.mean_y_item[pr_item] + numerator / denominator

    def predict(self, X, user_col='reviewerID', item_col='asin'):
        y = X[[user_col, item_col]].apply(lambda row: self.predict_rating(row[0], row[1]), axis=1)
        return y

In [40]:
%%time
print('start fitting...')
ib = ItemBased().fit(X_train, y_train)
print('start predicting...')
print('rmse = {}'.format(rmse(y_test, ib.predict(X_test))))

start fitting...
start predicting...
rmse = 1.1523725852084372
Wall time: 26min 21s


## Результат:
rmse = 1.1523725852084372