## 训练

In [37]:
import pandas as pd
import numpy as np
from keras import Model, regularizers
import keras.backend as K
from keras.layers import Embedding, Reshape, Input, Dense, Dot
from keras.models import load_model
from keras import initializers
from sklearn.utils import shuffle

K.clear_session()


def Recmand_model(num_user, num_movie, embedding_size):
    # 输入层
    input_uer = Input(shape=[None, ], dtype="int32")
    input_movie = Input(shape=[None, ], dtype="int32")
    
    # 嵌入层
    # input_dim: int > 0。词汇表大小， 即，最大整数 index + 1。
    # 看keras常见API那个文件，里面介绍了这个Embedding这个的详细参数
    user_embedding = Embedding(num_user + 1, embedding_size, input_length=1)(input_uer)
    # 加正则
    # user_embedding = Embedding(num_user + 1, embedding_size, input_length=1, embeddings_initializer=initializers.random_normal(stddev=0.01), embeddings_regularizer=regularizers.l2(0.01))(input_uer)    
    user_embedding = Reshape((embedding_size,))(user_embedding)
    
    movie_embedding = Embedding(num_movie + 1, embedding_size, input_length=1)(input_movie)
    movie_embedding = Reshape((embedding_size,))(movie_embedding)
    
    # 输出层
    out = Dot(1)([user_embedding, movie_embedding])
    
    model = Model(inputs=[input_uer, input_movie], outputs=out)
    model.compile(loss='mse', optimizer='Adam', metrics=['accuracy'])
    model.summary()
    return model


def split_data(df):
    df.sort_values(by=['time'], inplace=True)  # 按时间排序
    boundary = df['time'].quantile(.9)  # 按时间划分 分界线
    train = df[df['time'] < boundary]
    train.sort_values(by=['user', 'time'], axis=0, inplace=True)
    test = df[df['time'] >= boundary]
    test.sort_values(by=['user', 'time'], axis=0, inplace=True)
    # shuffle 数据集
    return shuffle(train), shuffle(test)


def load_data(path):
    dformat = ['user', 'item', 'rating', 'time']
    rating = pd.read_csv(path, sep="::", header=None, names=dformat)
    train_rating, test_rating = split_data(rating)
    num_user = np.max(train_rating["user"])
    num_movie = np.max(train_rating["item"])
    print("num_user: {} num_movie: {}".format(num_user, num_movie))
    print("train: {} test: {}".format(len(train_rating), len(test_rating)))

    train_user, train_movie = train_rating['user'].values, train_rating['item'].values
    test_user, test_movie = test_rating['user'].values, test_rating['item'].values

    train_x = [train_user, train_movie]
    train_y = train_rating['rating'].values
    test_x = [test_user, test_movie]
    test_y = test_rating['rating'].values
    return num_user, num_movie, train_x, train_y, test_x, test_y


def train(num_user, num_movie, train_x, train_y, model_save_path, batch_size=128, epochs=5, embedding_size=100):
    model = Recmand_model(num_user, num_movie, embedding_size)
    model.fit(train_x, train_y, batch_size=batch_size, epochs=epochs)
    model.save(model_save_path)
    return model


def evaluate(model, test_x, test_y):
    eval_ = model.evaluate(test_x, test_y, verbose=0)
    print("Evaluation on test data: loss = %0.6f accuracy = %0.2f%%" % (eval_[0], eval_[1] * 100))


def predict(model, input_x):
    pred = model.predict(input_x)
    print("pred = {}".format(pred))


if __name__ == '__main__':
    data_path = "./data/ratings.dat"
    model_save_path = './model/mf_model.h5'

    num_user, num_movie, train_x, train_y, test_x, test_y = load_data(data_path)

    model = train(num_user, num_movie, train_x, train_y, model_save_path)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


num_user: 6040 num_movie: 3952
train: 900188 test: 100021
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 100)       604100      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 100)       395300      input_2[0][0]                    
___________________________________________________

## 验证

In [36]:
evaluate(model, test_x, test_y)

Evaluation on test data: loss = 1.560601 accuracy = 37.72%


## 加载模型

In [26]:
model = load_model(model_save_path)

## 预测

In [27]:
test_x

[array([   1,    1,    1, ..., 6040, 6040, 6040]),
 array([3186, 1721, 1270, ..., 1784,  161, 1221])]

In [28]:
test_y[-1]

4

In [29]:
test_y[-2]

3

In [21]:
input_x = [np.array([6040]), np.array([1221])]
predict(model, input_x)

pred = [[4.366831]]


In [22]:
input_x = [np.array([6040]), np.array([161])]
predict(model, input_x)

pred = [[3.1422122]]
