## Домашнее задание

Вам нужно написать следующий пайплайн:
1. Получить эмбеддинги пользователей с помощью SVD.
2. Подать их в нейронную сеть, которую вы делали на практическом занятии.
Она принимает на вход эмбеддинги пользователей и айтемов, далее объединяет их и пропускает через два полносвязных слоя. На последнем слое выдаёт прогноз с сигмоидной функцией активации.
3. Обучить сеть.


In [1]:
### YOUR CODE HERE ###
import pandas as pd
import numpy as np
import tensorflow as tf
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from pathlib import Path
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.models import Model
from tensorflow.keras.utils import plot_model
from keras.layers import Input, Dense

In [2]:
# Load the built-in MovieLens 100k ratings.
data = Dataset.load_builtin('ml-100k')

# Pick the SVD latent dimensionality by 5-fold cross-validated RMSE.
param_grid = dict(n_factors=[2, 5, 10, 15, 20, 30, 40, 50, 70, 100])
gs = GridSearchCV(
    algo_class=SVD,
    param_grid=param_grid,
    measures=['rmse'],
    cv=5,
    n_jobs=-1,
)
gs.fit(data)

# Keep the winning number of factors; later cells size the embeddings with it.
n_factors = gs.best_params['rmse']['n_factors']
print(f"Наименьшая RMSE достигается при числе факторов {n_factors} и равна: {gs.best_score['rmse']}")

Наименьшая RMSE достигается при числе факторов 30 и равна: 0.9335589454674162


In [3]:
# Turn the raw rating tuples into a typed DataFrame; the fourth field of each
# tuple is not needed and is dropped.
df = (
    pd.DataFrame(data.raw_ratings, columns=['userID', 'itemID', 'rating', '_'])
    .drop(columns=['_'])
    .astype({'userID': 'int64', 'itemID': 'int64', 'rating': 'int64'})
)
df.head()

Unnamed: 0,userID,itemID,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [4]:
# Rebuild the Surprise dataset from the integer-typed frame (ratings span 1..5).
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df.loc[:, ['userID', 'itemID', 'rating']], reader=reader)

In [5]:
# Use every available rating for fitting the SVD.
trainset = data.build_full_trainset()
# All (user, item) pairs that are absent from the trainset.
testset = trainset.build_anti_testset(fill=None)

In [6]:
# Build SVD-embedding features and binary "liked" labels for every rating row.
def emb_data(df, n_factors):
    """Turn a ratings frame into (features, labels) arrays for the classifier.

    An SVD model with ``n_factors`` latent factors is fitted on the
    module-level ``trainset``. For each row of ``df`` the user's and the
    item's latent vectors are concatenated into one feature vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings with the raw user id, raw item id and rating in the first
        three columns, in that order (columns are read by position).
    n_factors : int
        Number of latent factors of the SVD model.

    Returns
    -------
    X : numpy.ndarray
        Array of shape ``(len(df), 2 * n_factors)``: user vector followed by
        item vector.
    y : numpy.ndarray
        Integer array of shape ``(len(df),)``: 1 where rating > 3, else 0.

    Raises
    ------
    ValueError
        Raised by Surprise if a user or item id in ``df`` is not part of
        ``trainset``.

    Notes
    -----
    The SVD is fitted on the full ``trainset``, so the embeddings have seen
    every rating, including rows that are later held out for validation.
    """
    algo = SVD(n_factors=n_factors, random_state=17, verbose=False).fit(trainset)

    # algo.pu / algo.qi are indexed by Surprise's *inner* ids, which are not
    # "raw id - 1". Translate raw ids through the trainset so that each row
    # gets the vectors of its own user and item.
    user_rows = [trainset.to_inner_uid(raw_uid) for raw_uid in df.iloc[:, 0]]
    item_rows = [trainset.to_inner_iid(raw_iid) for raw_iid in df.iloc[:, 1]]

    # Fancy indexing gathers all vectors at once instead of a Python row loop.
    X = np.hstack((algo.pu[user_rows], algo.qi[item_rows]))
    # Binarise: ratings of 4 and 5 count as positive.
    y = (df.iloc[:, 2].to_numpy() > 3).astype(int)
    return X, y


# Feature matrix and binary targets for the whole ratings table.
X, y = emb_data(df, n_factors=n_factors)

In [7]:
# Hold out 20% of the rows; the fixed seed (same value as the SVD seed) makes
# the split, and therefore the reported metrics, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=17
)

In [8]:
# Model
def get_model(input_dim):
    """Build and compile the binary classifier over concatenated embeddings.

    Architecture: Dense(256, relu) -> Dropout(0.1) -> Dense(128, relu) ->
    Dropout(0.1) -> Dense(1, sigmoid). Compiled with the Adam optimizer,
    binary cross-entropy loss and the accuracy metric.

    Parameters
    ----------
    input_dim : int
        Length of one input feature vector.

    Returns
    -------
    tf.keras.Model
        The compiled, untrained model.
    """
    # The shape must be a tuple: `Input(input_dim,)` passes a bare int
    # because a trailing comma inside call parentheses does not make a tuple.
    inputs = tf.keras.layers.Input(shape=(input_dim,))
    hidden = tf.keras.layers.Dense(256, activation='relu')(inputs)
    hidden = tf.keras.layers.Dropout(0.1)(hidden)
    hidden = tf.keras.layers.Dense(128, activation='relu')(hidden)
    hidden = tf.keras.layers.Dropout(0.1)(hidden)
    # A single sigmoid unit outputs the probability of the positive class.
    outputs = tf.keras.layers.Dense(1, activation='sigmoid')(hidden)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=["accuracy"],
    )
    return model
    
# The network input is a user vector concatenated with an item vector.
model = get_model(n_factors*2)

# Both callbacks watch the validation loss: training metrics keep improving
# while the model overfits, so they are a poor stopping signal. `val_loss` is
# only logged when `fit` is given validation data.
early_stopper = EarlyStopping(monitor='val_loss', min_delta=0, patience=5,
                              mode='auto', restore_best_weights=True)
# Shorter patience than early stopping, so the learning rate is lowered
# before training is abandoned.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=2, min_lr=1e-5)

model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 60)]              0         
                                                                 
 dense (Dense)               (None, 256)               15616     
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 48,641
Trainable params: 48,641
Non-trainable p

In [9]:
# Train the classifier. The callbacks created earlier must be passed here,
# otherwise early stopping and learning-rate reduction never run.
hist = model.fit(
    X_train, y_train,
    batch_size=128,
    epochs=20,
    shuffle=True,
    validation_data=(X_test, y_test),
    callbacks=[early_stopper, reduce_lr],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
