In [109]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import collections
import json

In [110]:
# 读取和处理数据
data_path = "/Users/ruilinwang/Desktop/recsys/"
df_user = pd.read_csv(data_path+'users.dat',
                     sep="::", header=None, engine="python",
                     names = "UserID::Gender::Age::Occupation::Zip-code".split("::"))

df_movie = pd.read_csv(data_path+'movies.dat',
                     sep="::", header=None, engine="python",
                     names = "MovieID::Title::Genres".split("::"))

df_rating = pd.read_csv(data_path+'ratings.dat',
                     sep="::", header=None, engine="python",
                     names = "UserID::MovieID::Rating::Timestamp".split("::"))

In [111]:
df_rating.to_csv("./recsys-web/resources/ratings.csv", index=False)

In [112]:
# 只取频率最高的item分类
# 计算电影中每个题材的次数
genre_count = collections.defaultdict(int)
for genres in df_movie["Genres"].str.split("|"):
    for genre in genres:
        genre_count[genre] += 1
genre_count

defaultdict(int,
            {'Animation': 105,
             "Children's": 251,
             'Comedy': 1200,
             'Adventure': 283,
             'Fantasy': 68,
             'Romance': 471,
             'Drama': 1603,
             'Action': 503,
             'Crime': 211,
             'Thriller': 492,
             'Horror': 343,
             'Sci-Fi': 276,
             'Documentary': 127,
             'War': 143,
             'Musical': 114,
             'Mystery': 106,
             'Film-Noir': 44,
             'Western': 68})

In [113]:
# 只保留最有代表性的题材
def get_highrate_genre(x):
    sub_values = {}
    for genre in x.split("|"):
        sub_values[genre] = genre_count[genre]
    return sorted(sub_values.items(), key=lambda x:x[1], reverse=True)[0][0]
df_movie["Genres"] = df_movie["Genres"].map(get_highrate_genre)

In [114]:
df_movie.sample(frac=1).head(3)

Unnamed: 0,MovieID,Title,Genres
1313,1333,"Birds, The (1963)",Horror
146,148,"Awfully Big Adventure, An (1995)",Drama
631,636,Frisk (1995),Drama


In [115]:
#给列新增数字索引列 目的是防止embedding过大
def add_index_column(param_df, column_name):
    values = list(param_df[column_name].unique())
    value_index_dict = {value:idx for idx,value in enumerate(values)}
    param_df[f"{column_name}_idx"] = param_df[column_name].map(value_index_dict)

In [116]:
add_index_column(df_user, "UserID")
add_index_column(df_user, "Gender")
add_index_column(df_user, "Age")
add_index_column(df_user, "Occupation")
add_index_column(df_movie, "MovieID")
add_index_column(df_movie, "Genres")

In [117]:
df_user.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code,UserID_idx,Gender_idx,Age_idx,Occupation_idx
0,1,F,1,10,48067,0,0,0,0
1,2,M,56,16,70072,1,1,1,1
2,3,M,25,15,55117,2,1,2,2
3,4,M,45,7,2460,3,1,3,3
4,5,M,25,20,55455,4,1,2,4


In [118]:
df_movie.to_csv("./recsys-web/resources/movies.csv", index=False)

In [119]:
df_movie.head()

Unnamed: 0,MovieID,Title,Genres,MovieID_idx,Genres_idx
0,1,Toy Story (1995),Comedy,0,0
1,2,Jumanji (1995),Adventure,1,1
2,3,Grumpier Old Men (1995),Comedy,2,0
3,4,Waiting to Exhale (1995),Drama,3,2
4,5,Father of the Bride Part II (1995),Comedy,4,0


In [120]:
# 合并成一个df
df = pd.merge(pd.merge(df_rating, df_user), df_movie)
df.drop(columns=["Timestamp", "Zip-code", "Title"], inplace=True)

In [121]:
df.sample(frac=1).head(3)

Unnamed: 0,UserID,MovieID,Rating,Gender,Age,Occupation,UserID_idx,Gender_idx,Age_idx,Occupation_idx,Genres,MovieID_idx,Genres_idx
260550,5208,2716,5,M,25,4,5207,1,2,12,Comedy,2647,0
792493,3401,3846,2,M,35,7,3400,1,5,3,Comedy,3776,0
597972,654,1783,3,M,50,7,653,1,4,3,Thriller,1725,5


In [122]:
num_users = df["UserID_idx"].max() + 1
num_movies = df["MovieID_idx"].max() + 1
num_genders = df["Gender_idx"].max() + 1
num_ages = df["Age_idx"].max() + 1
num_occupations = df["Occupation_idx"].max() + 1
num_genres = df["Genres_idx"].max() + 1

num_users, num_movies, num_genders, num_ages, num_occupations, num_genres

(6040, 3883, 2, 7, 21, 18)

In [123]:
# 评分的归一化
min_rating = df["Rating"].min()
max_rating = df["Rating"].max()

df["Rating"] = df["Rating"].map(lambda x : (x-min_rating)/(max_rating-min_rating))

df.sample(frac=1).head(3)

Unnamed: 0,UserID,MovieID,Rating,Gender,Age,Occupation,UserID_idx,Gender_idx,Age_idx,Occupation_idx,Genres,MovieID_idx,Genres_idx
979074,2102,613,0.25,M,25,11,2101,1,2,13,Drama,609,2
8558,2907,1287,1.0,F,35,5,2906,0,5,18,Drama,1267,2
947933,1746,2525,0.5,M,25,4,1745,1,2,12,Action,2456,3


In [124]:
# 构建训练数据集
df_sample = df.sample(frac=0.1)
X = df_sample[["UserID_idx","Gender_idx","Age_idx","Occupation_idx","MovieID_idx","Genres_idx"]]
y = df_sample.pop("Rating")

In [125]:
def get_model():
    """函数式API搭建双塔DNN模型"""
    
    # 输入
    user_id = keras.layers.Input(shape=(1,), name="user_id")
    gender = keras.layers.Input(shape=(1,), name="gender")
    age = keras.layers.Input(shape=(1,), name="age")
    occupation = keras.layers.Input(shape=(1,), name="occupation")
    movie_id = keras.layers.Input(shape=(1,), name="movie_id")
    genre = keras.layers.Input(shape=(1,), name="genre")
    
    # user 塔
    user_vector = tf.keras.layers.concatenate([
            layers.Embedding(num_users, 100)(user_id), 
            layers.Embedding(num_genders, 2)(gender), 
            layers.Embedding(num_ages, 2)(age), 
            layers.Embedding(num_occupations, 2)(occupation)
    ])
    user_vector = layers.Dense(32, activation='relu')(user_vector)
    user_vector = layers.Dense(8, activation='relu', 
                               name="user_embedding", kernel_regularizer='l2')(user_vector)

    # movie塔
    movie_vector = tf.keras.layers.concatenate([
        layers.Embedding(num_movies, 100)(movie_id),
        layers.Embedding(num_genres, 2)(genre)
    ])
    movie_vector = layers.Dense(32, activation='relu')(movie_vector)
    movie_vector = layers.Dense(8, activation='relu', 
                                name="movie_embedding", kernel_regularizer='l2')(movie_vector)

    # 每个用户的embedding和item的embedding作点积
    dot_user_movie = tf.reduce_sum(user_vector*movie_vector, axis = 1)
    dot_user_movie = tf.expand_dims(dot_user_movie, 1)

    output = layers.Dense(1, activation='sigmoid')(dot_user_movie)
    
    return keras.models.Model(inputs=[user_id, gender, age, occupation, movie_id, genre], outputs=[output])

In [97]:
model = get_model()
model.compile(loss=tf.keras.losses.MeanSquaredError(), 
              optimizer=keras.optimizers.RMSprop())

In [98]:
fit_x_train = [
        X["UserID_idx"], 
        X["Gender_idx"],
        X["Age_idx"],
        X["Occupation_idx"],
        X["MovieID_idx"],
        X["Genres_idx"]
    ]

from datetime import datetime
TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S/}".format(datetime.now())
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs/logs_"+TIMESTAMP)

history = model.fit(
    x=fit_x_train,
    y=y,
    batch_size=32,
    epochs=5,
    verbose=1,
    callbacks=[tensorboard_callback]
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [99]:
# 模型的预估-predict
inputs = df.sample(frac=1.0)[
    ["UserID_idx","Gender_idx","Age_idx","Occupation_idx","MovieID_idx", "Genres_idx"]].head(10)

In [100]:
# 对于（用户ID，召回的电影ID列表），计算分数
model.predict([
        inputs["UserID_idx"], 
        inputs["Gender_idx"],
        inputs["Age_idx"],
        inputs["Occupation_idx"],
        inputs["MovieID_idx"],
        inputs["Genres_idx"]
    ])

array([[[0.30857152]],

       [[0.6934257 ]],

       [[0.8471228 ]],

       [[0.65442336]],

       [[0.8043971 ]],

       [[0.47179425]],

       [[0.87422776]],

       [[0.7520877 ]],

       [[0.7441046 ]],

       [[0.55641824]]], dtype=float32)

In [101]:
tf.saved_model.save(model,"./rmodels") # user_embedding_model定义模型输入
#model.save("./datas/ml-latest-small/tensorflow_two_tower.h5")

INFO:tensorflow:Assets written to: ./rmodels/assets


In [102]:
new_model = tf.keras.models.load_model("./rmodels")

In [103]:
# 对于（用户ID，召回的电影ID列表），计算分数
new_model.predict([
        inputs["UserID_idx"], 
        inputs["Gender_idx"],
        inputs["Age_idx"],
        inputs["Occupation_idx"],
        inputs["MovieID_idx"],
        inputs["Genres_idx"]
    ])

array([[[0.30857152]],

       [[0.6934257 ]],

       [[0.8471228 ]],

       [[0.65442336]],

       [[0.8043971 ]],

       [[0.47179425]],

       [[0.87422776]],

       [[0.7520877 ]],

       [[0.7441046 ]],

       [[0.55641824]]], dtype=float32)

In [104]:
# 保存模型的embedding可用于召回

In [105]:
# 得到user embedding

In [106]:
model.input

[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'user_id')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'gender')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'age')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'occupation')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'movie_id')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'genre')>]

In [23]:
user_layer_model = keras.models.Model(
    inputs=[model.input[0], model.input[1], model.input[2], model.input[3]],
    outputs=model.get_layer("user_embedding").output
)

In [24]:
user_embeddings = []
for index, row in df_user.iterrows():
    user_id = row["UserID"]
    user_input = [
        np.reshape(row["UserID_idx"], [1,1]),
        np.reshape(row["Gender_idx"], [1,1]),
        np.reshape(row["Age_idx"], [1,1]),
        np.reshape(row["Occupation_idx"], [1,1])
    ]
    user_embedding = user_layer_model(user_input)
    
    embedding_str = ",".join([str(x) for x in user_embedding.numpy().flatten()])
    user_embeddings.append([user_id, embedding_str])

In [108]:
df_user_embedding = pd.DataFrame(user_embeddings, columns = ["user_id", "user_embedding"])
#df_user_embedding["user_embedding"] = df_user_embedding["user_embedding"].map(lambda x : np.array(json.loads(x)))
#df_user_embedding3 = dict(zip(df_user_embedding['user_id'], df_user_embedding['user_embedding']))
#print(type(df_user_embedding3))
#df_user_embedding1 = json.dumps(df_user_embedding)
#df1 = df_user_embedding.values.tolist()
#print(df_user_embedding1)

In [68]:
output = "./recsys-web/resources/user_embedding.csv"
df1.to_csv(output, index=False)

AttributeError: 'list' object has no attribute 'to_csv'

In [34]:
# 得到movie embedding

In [35]:
model.input

[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'user_id')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'gender')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'age')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'occupation')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'movie_id')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'genre')>]

In [36]:
movie_layer_model = keras.models.Model(
    inputs=[model.input[4], model.input[5]],
    outputs=model.get_layer("movie_embedding").output
)

In [37]:
movie_embeddings = []
for index, row in df_movie.iterrows():
    movie_id = row["MovieID"]
    movie_input = [
        np.reshape(row["MovieID_idx"], [1,1]),
        np.reshape(row["Genres_idx"], [1,1])
    ]
    movie_embedding = movie_layer_model(movie_input)
    
    embedding_str = ",".join([str(x) for x in movie_embedding.numpy().flatten()])
    movie_embeddings.append([movie_id, embedding_str])

In [107]:
df_movie_embedding = pd.DataFrame(movie_embeddings, columns = ["movie_id", "movie_embedding"])
df_movie_embedding.head()

NameError: name 'movie_embeddings' is not defined

In [39]:
output = "./recsys-web/resources/item_embedding.csv"
df_movie_embedding.to_csv(output, index=False)

In [73]:
data_pathss = "/Users/ruilinwang/Desktop/recsys/recsys-web/resources/"
df_dict = pd.read_csv(data_pathss+'item_embedding.csv')

In [76]:
print(df_dict['features'])

0       [0.25866490602493286, 0.3560594320297241, 0.15...
1       [0.12449632585048676, -0.29282501339912415, -0...
2       [0.9557555317878723, 0.6764761805534363, 0.114...
3       [0.3184879720211029, 0.6365472078323364, 0.596...
4       [0.45523127913475037, 0.34402626752853394, -0....
                              ...                        
3701    [1.0018610954284668, 0.014720896258950233, -0....
3702    [0.13435769081115723, 0.5507825613021851, -0.5...
3703    [0.6164957284927368, 0.5210800766944885, 0.844...
3704    [0.260995477437973, 0.3665539026260376, 0.1523...
3705    [0.5620983839035034, 0.4703512191772461, 0.047...
Name: features, Length: 3706, dtype: object
