In [1]:
import pandas as pd
import numpy as np
from keras.layers import Embedding
import re
from keras import Input, layers,models
from keras import backend as K
from keras.datasets import imdb
from keras import preprocessing
import keras
import pickle
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


((array([[[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       ...,

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ...

## 数据初步了解

- UserID、Occupation和MovieID不用变。
- Gender字段：需要将‘F’和‘M’转换成0和1。
- Age字段：要转成7个连续数字0~6。
- Genres字段：是分类字段，要转成数字。首先将Genres中的类别转成字符串到数字的字典，然后再将每个电影的Genres字段转成数字列表，因为有些电影是多个Genres的组合。
- Title字段：处理方式跟Genres字段一样，首先创建文本到数字的字典，然后将Title中的描述转成数字的列表。另外Title中的年份也需要去掉。
- Genres和Title字段需要将长度统一，这样在神经网络中方便处理。空白部分用 `<PAD>` 对应的数字填充。

### 对于User数据

 - Gender is denoted by a “M” for male and “F” for female
 - Age is chosen from the following ranges:
   - 1: “Under 18”
   - 18: “18-24”
   - 25: “25-34”
   - 35: “35-44”
   - 45: “45-49”
   - 50: “50-55”
   - 56: “56+”
 - Occupation is chosen from the following choices:
   - 全是职业

### 对于电影数据
- MovieID  电影ID
- Title   电影名
- Genres  电影类型

### 对于评分数据
- UserIDs range between 1 and 6040
- MovieIDs range between 1 and 3952
- Ratings are made on a 5-star scale (whole-star ratings only)
- Timestamp is represented in seconds since the epoch as returned by time(2)
- Each user has at least 20 ratings 

## 数据预处理函数

In [2]:
def load_data(data_dir='./movieLen/ml-1m'):
    """Load the MovieLens-1M tables and encode them as integer features.

    Args:
        data_dir: Directory holding ``users.dat``, ``movies.dat`` and
            ``ratings.dat``. The default is the path the notebook has always used.

    Returns:
        An 11-tuple ``(title_count, title_set, genres2int, features,
        targets_values, rating_data, user_data, movie_data, data,
        movie_origin_data, user_origin_data)``:

        * ``title_count``: padded length of every encoded title (15).
        * ``title_set``: every title word plus ``'<PAD>'``.
        * ``genres2int``: genre name -> integer id, ``'<PAD>'`` included.
        * ``features`` / ``targets_values``: ndarray views of the merged
          table without / with only the ``Rating`` column.
        * ``rating_data`` / ``user_data`` / ``movie_data``: the three tables,
          the latter two with encoded columns.
        * ``data``: ratings joined with users and movies.
        * ``movie_origin_data`` / ``user_origin_data``: the raw (un-encoded)
          movie and user rows.
    """
    pad_token = '<PAD>'

    def _read(filename, columns):
        # '::' is a multi-character separator, so pandas must use the python engine;
        # stating it explicitly avoids the silent fallback and its ParserWarning.
        return pd.read_csv('{}/{}'.format(data_dir, filename), header=None,
                           sep='::', names=columns, engine='python')

    def _build_vocab(tokens):
        # Sets of strings iterate in a hash-randomised order, so enumerating a raw
        # set gives different ids on every kernel restart. Sorting makes the ids
        # reproducible; putting the pad token first gives it id 0, the value that
        # keras' pad_sequences fills with by default.
        ordered = [pad_token] + sorted(set(tokens) - {pad_token})
        return {token: idx for idx, token in enumerate(ordered)}

    def _encode(text, vocab, length, sep=None):
        # Token ids followed by pad ids up to `length` (no truncation, as before).
        ids = [vocab[token] for token in text.split(sep)]
        return ids + [vocab[pad_token]] * (length - len(ids))

    user_data = _read('users.dat', ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])
    movie_data = _read('movies.dat', ['MovieID', 'Title', 'Genres'])
    rating_data = _read('ratings.dat', ['UserID', 'MovieID', 'Rating', 'Timestamp'])

    # Snapshot the raw rows before the columns below are overwritten with ids.
    user_origin_data = user_data.values.copy()
    movie_origin_data = movie_data.values.copy()

    user_data['Gender'] = user_data['Gender'].map({'F': 0, 'M': 1})
    # Age buckets become consecutive ids in ascending age order.
    age_map = {val: ii for ii, val in enumerate(sorted(set(user_data['Age'])))}
    user_data['Age'] = user_data['Age'].map(age_map)

    # Strip the trailing "(year)" from each title; titles without one are kept as-is
    # instead of crashing on a failed match.
    year_pattern = re.compile(r'^(.*)\((\d+)\)$')

    def _strip_year(title):
        matched = year_pattern.match(title)
        return matched.group(1) if matched else title

    movie_data['Title'] = movie_data['Title'].map(_strip_year)

    # Genre vocabulary: a movie can carry several '|'-separated genres.
    genres_set = set()
    for val in movie_data['Genres'].str.split('|'):
        genres_set.update(val)
    genres_set.add(pad_token)
    genres2int = _build_vocab(genres_set)
    # Equals max(genres2int.values()), the width the rest of the notebook expects.
    genre_count = len(genres2int) - 1
    movie_data['Genres'] = movie_data['Genres'].map(
        lambda genres: _encode(genres, genres2int, genre_count, '|'))

    # Title vocabulary: whitespace-separated words.
    title_set = set()
    for val in movie_data['Title'].str.split():
        title_set.update(val)
    title_set.add(pad_token)
    title2int = _build_vocab(title_set)
    title_count = 15
    movie_data['Title'] = movie_data['Title'].map(
        lambda title: _encode(title, title2int, title_count))

    # Join on the shared key columns (UserID, then MovieID).
    data = pd.merge(pd.merge(rating_data, user_data), movie_data)

    target_fields = ['Rating']
    features_pd, targets_pd = data.drop(target_fields, axis=1), data[target_fields]
    features = features_pd.values
    targets_values = targets_pd.values

    return title_count, title_set, genres2int, features, targets_values, rating_data, user_data, movie_data, data, movie_origin_data, user_origin_data

In [3]:
# Run the preprocessing once and bind every returned artifact to a notebook-level name.
(
    title_count,
    title_set,
    genres2int,
    features,
    targets_values,
    ratings,
    user_data,
    movie_data,
    data,
    movie_origin_data,
    user_origin_data,
) = load_data()

## 网络超参

In [4]:
# Dimensionality of every embedding vector
embed_dim = 32

# Embedding vocabulary sizes. Ids are used directly as embedding indices, so each
# size is "largest id + 1". Series.max() is vectorised; the builtin max() would
# iterate the whole merged table in Python.
user_id_max = data.UserID.max() + 1
user_gender_max = data.Gender.max() + 1
user_age_max = data.Age.max() + 1
# NOTE(review): the "first dense layer" cell later rebinds `user_occupation` to a
# tensor, so this cell must be re-run before rebuilding the occupation embedding.
user_occupation = data.Occupation.max() + 1
movie_id_max = data.MovieID.max() + 1
# Number of genre ids, '<PAD>' included (this was assigned twice before)
movie_categories_max = max(genres2int.values()) + 1
# Number of distinct title words, '<PAD>' included (original author noted 5217)
movie_title_max = len(title_set)

# Length of every padded movie title
sentences_size = title_count

# Convolution kernel length (number of consecutive title words per filter)
window_sizes = 7

# Pooling length: title_count - window_sizes + 1, so the pool spans every
# position of a stride-1, unpadded convolution over the title
pool_size = title_count - window_sizes + 1

# Movie id -> row index into the movie table. MovieLens ids are not contiguous,
# e.g. the movie stored in row 5 does not necessarily have id 5.
movieid2idx = {val[0]: i for i, val in enumerate(movie_origin_data)}

## 电影名文本卷积网络

In [5]:
# The movie title is fed as a fixed-length sequence of `sentences_size` word ids
movie_title_input = Input(name='movie_title', dtype='int32', shape=(sentences_size,))
# Map every word id to an `embed_dim`-dimensional vector
title_embedding_layer = layers.Embedding(input_dim=movie_title_max, output_dim=embed_dim)
embedded_text = title_embedding_layer(movie_title_input)

In [6]:
# 1-D convolution over the embedded title: `embed_dim` filters, each spanning
# `window_sizes` (7) consecutive words. 7 is the kernel length, not the stride.
title_conv_layer = layers.Conv1D(filters=embed_dim, kernel_size=window_sizes, activation='relu')
conv1D_movie_feature = title_conv_layer(embedded_text)
# Max-pool with a window of `pool_size` positions, then flatten to a plain vector
title_pool_layer = layers.MaxPool1D(pool_size=pool_size)
maxpool_movie_feature = title_pool_layer(conv1D_movie_feature)
movie_title_vec = layers.Flatten(name='FlattenMovieTitle')(maxpool_movie_feature)

## 电影ID

In [7]:
# Movie id branch: one integer id in, one flattened embedding vector out
movie_Id_input = Input(name='movie_id', dtype='int32', shape=(1,))
movie_id_embedding_layer = layers.Embedding(input_dim=movie_id_max, output_dim=embed_dim)
movie_Id_embedding = movie_id_embedding_layer(movie_Id_input)
# NOTE(review): `movoe_Id_vec` is a typo for "movie", kept because a later cell reads this name
movoe_Id_vec = layers.Flatten(name='FlattenMovieId')(movie_Id_embedding)

# 电影类型ID

In [8]:
# Sum along axis 1 (the genre-slot axis) so a movie's genre embeddings collapse into one vector.
# `layers.Lambda` is the public entry point; `layers.core` is an internal module path.
ReduceSum = layers.Lambda(lambda z: K.sum(z, axis=1))
# One input slot per genre position; load_data pads shorter genre lists to this width
movie_genres_input = Input(shape=(max(genres2int.values()),), dtype='int32', name='movie_genres')
movie_genres = layers.Embedding(movie_categories_max, embed_dim)(movie_genres_input)
# Add the per-genre embeddings together
movie_genres_add = ReduceSum(movie_genres)

# 将电影类型和ID concat起来

In [9]:
# Put an `embed_dim`-unit Dense layer (no activation) on top of the movie-id
# vector and on top of the summed genre vector
movie_id_dense_layer = layers.Dense(units=embed_dim, name='movie_id_dense')
movie_Id_dense = movie_id_dense_layer(movoe_Id_vec)
movie_genres_dense_layer = layers.Dense(units=embed_dim, name='movie_genres_dense')
movie_genres_dense = movie_genres_dense_layer(movie_genres_add)

In [10]:
# Join the id, genre and title representations along the feature axis
movie_branches = [movie_Id_dense, movie_genres_dense, movie_title_vec]
concatenated = layers.concatenate(movie_branches, axis=1)

In [11]:
# 200-unit tanh Dense layer; its output is the movie feature vector
movie_feature_layer = layers.Dense(units=200, activation='tanh', name='movie_feature')
movie_layers = movie_feature_layer(concatenated)

# User特征

## Id

In [12]:
# User id branch: one integer id in, one flattened embedding vector out
user_Id_input = Input(name='user_id', dtype='int32', shape=(1,))
user_id_embedding_layer = layers.Embedding(input_dim=user_id_max, output_dim=embed_dim)
user_Id_embedding = user_id_embedding_layer(user_Id_input)
user_Id_vec = layers.Flatten(name='FlattenUserId')(user_Id_embedding)

## 性别

In [13]:
# Gender branch: encoded gender id in, one flattened embedding vector out
user_Gender_input = Input(name='user_gender', dtype='int32', shape=(1,))
user_gender_embedding_layer = layers.Embedding(input_dim=user_gender_max, output_dim=embed_dim)
user_Gender_embedding = user_gender_embedding_layer(user_Gender_input)
user_Gender_vec = layers.Flatten(name='FlattenUserGender')(user_Gender_embedding)

## 年龄

In [14]:
# Age branch: encoded age-bucket id in, one flattened embedding vector out
user_Age_input = Input(name='user_age', dtype='int32', shape=(1,))
user_age_embedding_layer = layers.Embedding(input_dim=user_age_max, output_dim=embed_dim)
user_Age_embedding = user_age_embedding_layer(user_Age_input)
user_Age_vec = layers.Flatten(name='FlattenUserAge')(user_Age_embedding)

## 职业

In [15]:
# Occupation branch: occupation id in, one flattened embedding vector out.
# `user_occupation` must still hold the vocabulary size set in the hyper-parameter cell.
user_Occupation_input = Input(name='user_Occupation', dtype='int32', shape=(1,))
user_occupation_embedding_layer = layers.Embedding(input_dim=user_occupation, output_dim=embed_dim)
user_Occupation_embedding = user_occupation_embedding_layer(user_Occupation_input)
user_Occupation_vec = layers.Flatten(name='FlattenUserOccupation')(user_Occupation_embedding)

# 第一层全连接

In [16]:
# First fully connected layer: one `embed_dim`-unit ReLU Dense per user attribute
user_gender_dense_layer = layers.Dense(units=embed_dim, activation='relu', name='user_gender_dense')
user_Gender_Dense = user_gender_dense_layer(user_Gender_vec)
user_id_dense_layer = layers.Dense(units=embed_dim, activation='relu', name='user_id_dense')
user_Id_Dense = user_id_dense_layer(user_Id_vec)
user_age_dense_layer = layers.Dense(units=embed_dim, activation='relu', name='user_age_dense')
user_Age_Dense = user_age_dense_layer(user_Age_vec)
# NOTE(review): this rebinds `user_occupation`, which the hyper-parameter cell set to the
# occupation vocabulary size; re-running the occupation-embedding cell after this one
# would pass a tensor as the Embedding size.
user_occupation_dense_layer = layers.Dense(units=embed_dim, activation='relu', name='user_occupation_dense')
user_occupation = user_occupation_dense_layer(user_Occupation_vec)

# 将上面各层拼接并接一个200神经元Dense层

In [17]:
# Join the four per-attribute representations along the feature axis
user_branches = [user_Gender_Dense, user_Id_Dense, user_Age_Dense, user_occupation]
user_concatenated = layers.concatenate(user_branches, axis=1)
# Second fully connected layer: 200 ReLU units, the user feature vector
user_feature_layer = layers.Dense(units=200, activation='relu', name='user_feature_Dense')
user_feature_Dense = user_feature_layer(user_concatenated)

# 将User和Movie拼接起来

In [18]:
# Concatenate user and movie features and regress one value (the rating)
# through a single linear unit
user_and_movie = [user_feature_Dense, movie_layers]
all_feature = layers.concatenate(user_and_movie, axis=1)
all_feature_Dense = layers.Dense(units=1, name='final_dense')(all_feature)

# UserId,User性别，User年龄，User职业，电影名，电影Id，电影类型

In [19]:
# Full rating model. Input order: user id, gender, age, occupation,
# movie title, movie id, movie genres.
model_inputs = [
    user_Id_input,
    user_Gender_input,
    user_Age_input,
    user_Occupation_input,
    movie_title_input,
    movie_Id_input,
    movie_genres_input,
]
model = models.Model(model_inputs, all_feature_Dense)

In [20]:
# Display the input tensors in the order the model expects them
[
    user_Id_input,
    user_Gender_input,
    user_Age_input,
    user_Occupation_input,
    movie_title_input,
    movie_Id_input,
    movie_genres_input,
]

[<tf.Tensor 'user_id:0' shape=(?, 1) dtype=int32>,
 <tf.Tensor 'user_gender:0' shape=(?, 1) dtype=int32>,
 <tf.Tensor 'user_age:0' shape=(?, 1) dtype=int32>,
 <tf.Tensor 'user_Occupation:0' shape=(?, 1) dtype=int32>,
 <tf.Tensor 'movie_title:0' shape=(?, 15) dtype=int32>,
 <tf.Tensor 'movie_id:0' shape=(?, 1) dtype=int32>,
 <tf.Tensor 'movie_genres:0' shape=(?, 18) dtype=int32>]

In [21]:
# Rating prediction is treated as regression: optimise mean squared error
# with RMSprop and report mean absolute error
model.compile(loss='mse', optimizer='rmsprop', metrics=['mae'])

In [22]:
# Print the layer-by-layer summary of the assembled model
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
movie_title (InputLayer)        (None, 15)           0                                            
__________________________________________________________________________________________________
user_gender (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
user_id (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
user_age (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
user_Occup

# 输入数据

In [23]:
# Build one input array per model input from the merged ratings table.
# `data` is row-aligned with `features` (features is `data` without the Rating column),
# so columns are picked by name instead of by magic position.
num_samples = features.shape[0]


def _as_id_column(column_name):
    """Return the named column of `data` as an int32 array of shape (num_samples, 1)."""
    return data[column_name].values.astype('int32').reshape([num_samples, 1])


user_id_data = _as_id_column('UserID')
user_Gender_data = _as_id_column('Gender')
user_Age_data = _as_id_column('Age')
user_Occupation_data = _as_id_column('Occupation')
movie_id_data = _as_id_column('MovieID')
# Sequence widths come from the preprocessing outputs rather than literal 15 / 18,
# so they cannot drift from the model's input shapes.
movie_Title_data = preprocessing.sequence.pad_sequences(data['Title'].tolist(), maxlen=title_count)
movie_genres_data = preprocessing.sequence.pad_sequences(data['Genres'].tolist(), maxlen=max(genres2int.values()))

In [24]:
# TensorBoard callback: writes logs to 'my_log_dir' with histograms every epoch
tensorboard_callback = keras.callbacks.TensorBoard(log_dir='my_log_dir', histogram_freq=1)
callbacks = [tensorboard_callback]

## 定义callback函数，用TensorBoard监测训练数据

# 正式训练

In [25]:
# Training arrays, listed in the same order as the model's inputs
train_inputs = [
    user_id_data,
    user_Gender_data,
    user_Age_data,
    user_Occupation_data,
    movie_Title_data,
    movie_id_data,
    movie_genres_data,
]
# 10 epochs, mini-batches of 128, 20% of the samples held out for validation
model.fit(
    train_inputs,
    targets_values,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
    callbacks=callbacks,
)

Train on 800167 samples, validate on 200042 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f1aaf213a90>

In [27]:
# Persist the fitted model to an .h5 file in the working directory
model.save('myRecommondStstem.h5')

## 对用户和电影进行预测

In [28]:
def rating_movie(user_id, movie_id):
    """Predict the rating that `user_id` would give `movie_id`.

    The features are looked up in the per-user table `user_data` and the
    per-movie table `movie_data`. The training arrays (`user_id_data`,
    `movie_Title_data`, ...) hold one row per *rating*, so indexing them by
    user id or movie index returns the features of an unrelated rating.

    Args:
        user_id: A value from the ``UserID`` column of `user_data`.
        movie_id: A key of `movieid2idx` (a MovieLens movie id).

    Returns:
        Whatever ``model.predict`` returns for a batch of one sample.

    Raises:
        KeyError: If `user_id` or `movie_id` is unknown.
    """
    user_rows = user_data.loc[user_data['UserID'] == user_id]
    if user_rows.empty:
        raise KeyError('unknown user_id: {}'.format(user_id))
    user_row = user_rows.iloc[0]
    # movieid2idx maps a movie id to its row position in the movie table.
    movie_row = movie_data.iloc[movieid2idx[movie_id]]

    def _scalar_input(value):
        # Every scalar model input has shape (batch, 1); here batch == 1.
        return np.array([[value]], dtype='int32')

    def _sequence_input(ids):
        # The encoded id list becomes a single-row batch.
        return np.asarray(ids, dtype='int32').reshape(1, -1)

    user_id_val = _scalar_input(user_row['UserID'])
    user_Gender_val = _scalar_input(user_row['Gender'])
    user_Age_val = _scalar_input(user_row['Age'])
    user_Occupation_val = _scalar_input(user_row['Occupation'])
    movie_id_val = _scalar_input(movie_row['MovieID'])
    movie_Title_val = _sequence_input(movie_row['Title'])
    movie_genres_val = _sequence_input(movie_row['Genres'])
    # Same input order as the one used to build and fit the model.
    return model.predict([user_id_val, user_Gender_val, user_Age_val, user_Occupation_val, movie_Title_val, movie_id_val, movie_genres_val])

# 获取Movie特征

## 电影特征

In [37]:
# Number of rows (movies) in the encoded movie table
len(movie_data)

3883

In [42]:
# One input row per movie, taken from the encoded movie table by column name.
movie_id_data_save = movie_data['MovieID'].values.astype('int32').reshape([movie_data.shape[0], 1])
movie_Title_data_save = preprocessing.sequence.pad_sequences(movie_data['Title'].tolist(), maxlen=title_count)
# The genre width is derived from genres2int (as in the model's input shape) instead of a literal 18.
movie_genres_data_save = preprocessing.sequence.pad_sequences(movie_data['Genres'].tolist(), maxlen=max(genres2int.values()))

In [43]:
# Sub-model that shares the trained layers and stops at the 200-d movie feature layer.
# Keras 2 names these arguments `inputs` / `outputs`; `input=` / `output=` are the legacy Keras 1 spelling.
movie_feature_model = models.Model(inputs=[movie_title_input, movie_Id_input, movie_genres_input], outputs=[movie_layers])

In [44]:
# Run every movie through the movie sub-model to obtain its feature vector
movie_feature_inputs = [movie_Title_data_save, movie_id_data_save, movie_genres_data_save]
movie_intermedian_output = movie_feature_model.predict(movie_feature_inputs)

In [45]:
# Persist the movie feature matrix; the context manager closes the file
# handle, which the bare open() call left dangling.
with open('movie_matrics.p', 'wb') as movie_matrix_file:
    pickle.dump(np.array(movie_intermedian_output), movie_matrix_file)

# 获取User特征

In [47]:
# Sub-model that shares the trained layers and stops at the 200-d user feature layer.
# Keras 2 names these arguments `inputs` / `outputs`; `input=` / `output=` are the legacy Keras 1 spelling.
user_feature_model = models.Model(inputs=[user_Id_input, user_Gender_input, user_Age_input, user_Occupation_input], outputs=[user_feature_Dense])

In [50]:
# One input row per user, taken from the encoded user table by column name.
# Each array is int32 with shape (n_users, 1), matching the model's (None, 1) inputs;
# the positional `.values.take(...)` form produced 1-D object arrays.
user_id_data_save = user_data['UserID'].values.astype('int32').reshape(-1, 1)
user_Gender_data_save = user_data['Gender'].values.astype('int32').reshape(-1, 1)
user_Age_data_save = user_data['Age'].values.astype('int32').reshape(-1, 1)
user_Occupation_data_save = user_data['Occupation'].values.astype('int32').reshape(-1, 1)

In [247]:
# Run every user through the user sub-model to obtain its feature vector
user_feature_inputs = [
    user_id_data_save,
    user_Gender_data_save,
    user_Age_data_save,
    user_Occupation_data_save,
]
user_intermedian_output = user_feature_model.predict(user_feature_inputs)

In [248]:
# Persist the user feature matrix; the context manager closes the file
# handle, which the bare open() call left dangling.
with open('user_matrics.p', 'wb') as user_matrix_file:
    pickle.dump(np.array(user_intermedian_output), user_matrix_file)

# 开始做推荐

# 推荐同类型的电影

In [265]:
def recommend_same_type_movie(movie_id_val, top_k = 20):
    """Print and return the `top_k` movies most similar to `movie_id_val`.

    Similarity is the cosine similarity between rows of
    `movie_intermedian_output`. The query movie is part of the ranking, so it
    normally appears in its own result.

    The earlier version zeroed everything except the `top_k` best scores and
    then sampled until it had `top_k` distinct indices. With exactly `top_k`
    candidates that always ends with all of them, so the sampling is replaced
    by a direct selection. This also removes the hard-coded movie count (3883),
    the endless loop when a kept score is 0, and the error when one is negative.

    Args:
        movie_id_val: A key of `movieid2idx` (a MovieLens movie id).
        top_k: Number of movies to return; capped at the number of movies.

    Returns:
        A set of row indices into `movie_origin_data`.

    Raises:
        KeyError: If `movie_id_val` is not in `movieid2idx`.
    """
    # L2-normalise each feature row so a dot product is a cosine similarity.
    row_norms = np.sqrt(np.sum(movie_intermedian_output ** 2, axis=1, keepdims=True))
    row_norms[row_norms == 0] = 1  # an all-zero row keeps similarity 0 instead of NaN
    normalized_movie_matrics = movie_intermedian_output / row_norms

    query_idx = movieid2idx[movie_id_val]
    probs_similarity = normalized_movie_matrics.dot(normalized_movie_matrics[query_idx])

    n_pick = max(0, min(top_k, probs_similarity.shape[0]))
    best_first = np.argsort(probs_similarity)[::-1][:n_pick]
    results = set(best_first.tolist())

    print("您看的电影是：{}".format(movie_origin_data[query_idx]))
    for val in results:
        print(val)
        print(movie_origin_data[val])
    return results

In [266]:
# Ten movies similar to movie id 234
recommend_same_type_movie(234, top_k=10)

231
[234 'Exit to Eden (1994)' 'Comedy']
2759
[2828 'Dudley Do-Right (1999)' "Children's|Comedy"]
2323
[2392 'Jack Frost (1998)' 'Comedy|Drama']
822
[833 'High School High (1996)' 'Comedy']
3705
[3774 'House Party 2 (1991)' 'Comedy']


{231, 822, 2323, 2759, 3705}

# 推荐您喜欢的电影

In [271]:
def recommend_your_favorite_movie(user_id_val, top_k = 10):
    """Print and return the `top_k` movies scoring highest for a user.

    The score of a movie is the dot product of the user's feature vector and
    the movie's feature vector.

    The earlier version referenced the undefined name `movie_orgin_data`,
    hard-coded the movie count (3883), and sampled until it had `top_k`
    distinct indices out of exactly `top_k` candidates. That always yields all
    of them, or hangs or raises when a kept score is not positive, so the
    top-scoring movies are now selected directly.

    Args:
        user_id_val: 1-based user id. Row ``user_id_val - 1`` of
            `user_intermedian_output` is used.
            NOTE(review): assumes user rows are stored in id order starting
            at 1; confirm against the user table.
        top_k: Number of movies to return; capped at the number of movies.

    Returns:
        A set of row indices into `movie_origin_data`.

    Raises:
        IndexError: If `user_id_val` is outside ``1..number of users``.
    """
    if not 1 <= user_id_val <= len(user_intermedian_output):
        # Without this check id 0 would silently wrap around to the last user.
        raise IndexError('user_id_val out of range: {}'.format(user_id_val))
    probs_embeddings = user_intermedian_output[user_id_val - 1]
    probs_similarity = movie_intermedian_output.dot(probs_embeddings)

    n_pick = max(0, min(top_k, probs_similarity.shape[0]))
    best_first = np.argsort(probs_similarity)[::-1][:n_pick]
    results = set(best_first.tolist())

    for val in results:
        print(val)
        print(movie_origin_data[val])
    return results

In [272]:
# Recommendations for user 220 with the default top_k
recommend_your_favorite_movie(user_id_val=220)

257
[260 'Star Wars: Episode IV - A New Hope (1977)'
 'Action|Adventure|Fantasy|Sci-Fi']
740
[750
 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)'
 'Sci-Fi|War']
711
[720 'Wallace & Gromit: The Best of Aardman Animation (1996)' 'Animation']
3113
[3182 'Mr. Death: The Rise and Fall of Fred A. Leuchter Jr. (1999)'
 'Documentary']
523
[527 "Schindler's List (1993)" 'Drama|War']
847
[858 'Godfather, The (1972)' 'Action|Crime|Drama']
49
[50 'Usual Suspects, The (1995)' 'Crime|Thriller']
893
[905 'It Happened One Night (1934)' 'Comedy']
1214
[1233 'Boat, The (Das Boot) (1981)' 'Action|Drama|War']
1183
[1201 'Good, The Bad and The Ugly, The (1966)' 'Action|Western']


{49, 257, 523, 711, 740, 847, 893, 1183, 1214, 3113}