In [1]:
import os, sys, re, pickle
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import pandas as pd
import tensorflow as tf
from tensorflow import keras
print(sys.version_info)

sys.version_info(major=3, minor=6, micro=7, releaselevel='final', serial=0)


In [2]:
for module in tf, mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

tensorflow 2.1.0
matplotlib 3.2.0
numpy 1.18.1
pandas 1.0.1
sklearn 0.22.2.post1
tensorflow 2.1.0
tensorflow_core.python.keras.api._v2.keras 2.2.4-tf


In [3]:
ORIGIN_DATA_DIR = os.getcwd() + '/all_fearures/BX-CSV-Dump/'
FILTERED_DATA_DIR = os.getcwd() + '/tmp/'

In [4]:
class DataLoad:
    def __init__(self):
        '''
        books_with_blurbs.csv cloumns: ISBN,text,Author,Year,Publisher,Blurb
        BX-Book-Ratings.csv cloumns: User-ID,ISBN,Book-Rating
        BX-Books.csv cloumns: ISBN,Book-text,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
        BX-Users.csv cloumns: User-ID,Location,Age
        '''
        self.BX_Users = self.load_origin('BX-Users')
        self.BX_Book_Ratings = self.load_origin('BX-Book-Ratings')
        self.Books = self.load_origin('books_with_blurbs', ',')
        #合并三个表
        self.features, self.ISBN2int, self.UserID2int, self.Users = self.get_features()
        self.labels = self.features.pop('Book-Rating')

    def load_origin(self, 
        filename: "根据文件名获取源文件，获取正确得columns、values等值", 
        sep: "因为源文件的分隔方式sep不同，所以通过传参改编分隔方式"="\";\"", 
        )->pd.DataFrame:
        '''
        获取原始数据，第一遍获取后将用pickle保存到本地，方便日后调用
        '''
        try:
            # 从缓存的文件夹FILTERED_DATA_DIR获取基本被过滤后的文件
            pickled_data = pickle.load(open(FILTERED_DATA_DIR+filename+'.p', mode='rb'))
            return pickled_data
        except FileNotFoundError:
            # 如果缓存的文件不存在或者没有，则在源目录ORIGIN_DATA_DIR获取
            all_fearures = pd.read_csv(ORIGIN_DATA_DIR+filename+'.csv', engine='python',sep=sep, encoding='utf-8')
            # \";\"  初始过滤的文件
            # ,      初始不需要过滤的文件
            data_dict = {"\";\"":self.filtrator(all_fearures), ',':all_fearures}
            # 因为没获得处理后的文件，所以我们在获取源文件后可以保存一下处理后的文件
            pickle.dump((data_dict[sep]), open(FILTERED_DATA_DIR+filename+'.p', 'wb'))
            return data_dict[sep]
        except UnicodeDecodeError as e:
            ''' 测试时经常会出现编码错误，如果尝试更换编码方式无效，可以将编码错误的部分位置重新复制粘贴就可以了，这里我们都默认UTF-8'''
            print('UnicodeDecodeError:',e)
        except pd.errors.ParserError as e:
            print("connect error|pandas Error: %s" % e)

    def filtrator(self, 
        f_data: "输入需要进行初步filter的数据"
        )->pd.DataFrame:
        '''
        源文件中的columns和各个值得第一列的第一个字符和最后一列的最后一个字符都带有双引号‘"’,需要将其filter,Location字段当用户Age为null的时候，末尾会有\";NULL字符串 ，直接用切片调整
        '''
        Nonetype_age = 0
        f_data = f_data.rename(columns={f_data.columns[0]:f_data.columns[0][1:], f_data.columns[-1]:f_data.columns[-1][:-1]})
        f_data[f_data.columns[0]] = f_data[f_data.columns[0]].map(lambda v:v[1:] if v!=None else Nonetype_age)
        f_data[f_data.columns[-1]] = f_data[f_data.columns[-1]].map(lambda v:v[:-1] if v!=None else Nonetype_age)
        try:
            f_data = f_data[f_data['Location'].notnull()][f_data[f_data['Location'].notnull()]['Location'].str.contains('\";NULL')]
            f_data['Location'] = f_data['Location'].map(lambda location:location[:-6])
        except:
            pass
        return f_data

    def get_features(self):
        '''
        获取整个数据集的所有features，并对每个文本字段作xxxxx
        User-ID、Location、ISBN、Book-Rating、Title、Author、Year、Publisher、Blurb
        '''
        try:
            # 从缓存的文件夹FILTERED_DATA_DIR获取features的文件
            all_fearures, ISBN2int, UserID2int, Users = pickle.load(open(FILTERED_DATA_DIR+'features.p', mode='rb'))
            return all_fearures, ISBN2int, UserID2int, Users
        except:
            # 将所有的数据组成features大表
            all_fearures = pd.merge(pd.merge(self.BX_Users, self.BX_Book_Ratings), self.Books)
            Users = all_fearures
            # 因为没获得处理后的文件，所以我们在获取源文件后可以保存一下处理后的文件
            # isbn2index userid2index
            all_fearures.pop('Age')
            all_fearures['Title'] = self.feature2int(all_fearures['Title'], 'text')
            all_fearures['Blurb'] = self.feature2int(all_fearures['Blurb'], 'text')
            all_fearures['ISBN'], ISBN2int = self.feature2int(all_fearures['ISBN'], 'word')
            all_fearures['Author'], X2int = self.feature2int(all_fearures['Author'], 'word')
            all_fearures['Publisher'], X2int = self.feature2int(all_fearures['Publisher'], 'word')
            all_fearures['Year'], X2int = self.feature2int(all_fearures['Year'], 'word')
            all_fearures['User-ID'], UserID2int  = self.feature2int(all_fearures['User-ID'], 'word')
            all_fearures['Location'] = self.feature2int(all_fearures['Location'], 'list')
            all_fearures['Book-Rating'] = all_fearures['Book-Rating'].astype('float32')
            pickle.dump((all_fearures, ISBN2int, UserID2int, Users), open(FILTERED_DATA_DIR+'features.p', 'wb'))
            return all_fearures, ISBN2int, UserID2int, Users

    def feature2int(self, 
        feature:'特征值',
        feature_type:'text/word/list'):
        '''
        将文本字段比如title、blurb只取英文单词，并用空格为分隔符，做成一个带index值的集合，并用index值表示各个单词，作为文本得表示
        '''
        pattern = re.compile(r'[^a-zA-Z]')
        filtered_map = {val:re.sub(pattern, ' ', str(val)) for ii,val in enumerate(set(feature)) }
        letter_filter = lambda feature:feature.map({val:re.sub(pattern, ' ', str(val)) for ii,val in enumerate(set(feature)) })
        text_words = set()
        filtered_feature = letter_filter(feature)
        for val in filtered_feature.str.split():
            text_words.update(val)
        text2int = {val:ii for ii, val in enumerate(text_words)}
        text_map = {val:[text2int[row] for row in filtered_map[val].split()][:200] for ii,val in enumerate(set(feature))}
        
        word_map = {val:ii for ii,val in enumerate(set(feature))}
        try:
            cities = set()
            for val in feature.str.split(','):
                cities.update(val)
            city2int = {val:ii for ii, val in enumerate(cities)}
            list_map = {val:[city2int[row] for row in val.split(',')][:3] for ii,val in enumerate(set(feature))}
        except AttributeError :
            list_map = {}

        feature_dict = {
            'text':(feature.map(text_map)),
            'word':(feature.map(word_map), word_map),
            'list':(feature.map(list_map)),
            }
        return feature_dict[feature_type]

    def __del__(self):
        pass

origin_DATA = DataLoad()

In [5]:
origin_DATA = DataLoad()

In [6]:
origin_DATA.features

Unnamed: 0,User-ID,Location,ISBN,Title,Author,Year,Publisher,Blurb
0,10848,"[233, 6958, 126]",27711,"[12058, 9896, 14641]",8308,63,939,"[118142, 80291, 32356, 126011, 41964, 89996, 3..."
1,10848,"[233, 6958, 126]",26782,"[1415, 16421, 23247, 20764, 12094, 7631, 22863...",9858,71,652,"[107313, 25552, 112071, 2624, 119018, 32356, 3..."
2,20979,"[685, 3135, 126]",26782,"[1415, 16421, 23247, 20764, 12094, 7631, 22863...",9858,71,652,"[107313, 25552, 112071, 2624, 119018, 32356, 3..."
3,17973,"[5771, 3020, 5545]",26782,"[1415, 16421, 23247, 20764, 12094, 7631, 22863...",9858,71,652,"[107313, 25552, 112071, 2624, 119018, 32356, 3..."
4,3664,"[3317, 5469, 5545]",26782,"[1415, 16421, 23247, 20764, 12094, 7631, 22863...",9858,71,652,"[107313, 25552, 112071, 2624, 119018, 32356, 3..."
...,...,...,...,...,...,...,...,...
172097,22010,"[7222, 523, 5545]",5218,"[4790, 20406, 16696, 8493, 3869, 2277, 8493, 3...",9355,70,2763,"[86013, 74054, 45479, 32356, 72651, 80940, 818..."
172098,23589,"[3673, 2439, 5545]",23125,"[4853, 12840, 2200, 21361, 16696, 18138, 14311...",13154,75,193,"[103017, 93229, 52369, 111950, 101746, 45479, ..."
172099,23589,"[3673, 2439, 5545]",14767,"[5478, 6884]",12470,75,2889,"[33424, 7818, 111090, 45479, 38754, 42924, 588..."
172100,23589,"[3673, 2439, 5545]",17882,"[11475, 4193, 10880, 9234]",12306,76,2366,"[43101, 88468, 31146, 10384, 5881, 72843, 1658..."


In [7]:
origin_DATA.features.shape

(172102, 8)

In [8]:
# user-id的字典,总共有28836个用户
all_user = len(set(origin_DATA.features['User-ID']))
new_user_id = {val: i for i, val in enumerate(set(origin_DATA.features['User-ID']))}
print('all user id = ', all_user)
# location的数量=7573(从0开始的)
all_location = max([j for i in origin_DATA.features.Location for j in i]) +1 
print('all location = ', all_location)

all user id =  28836
all location =  7574


In [9]:
# ISBN总数
all_isbn = len(set(origin_DATA.features['ISBN']))
print('all isbn = ', all_isbn)
# author总数
all_author = len(set(origin_DATA.features['Author']))
print('all author = ', all_author)
# year总数
all_year = len(set(origin_DATA.features['Year']))
print('all year = ', all_year)
# publish总数
all_publisher = len(set(origin_DATA.features['Publisher']))
print('all publisher = ', all_publisher)
# title中所有单词总数
all_title_words = max([j for i in origin_DATA.features.Title for j in i]) +1 
print('all title words = ', all_title_words)
# blurb中所有单词总数
all_blurb_words = max([j for i in origin_DATA.features.Blurb for j in i]) +1 
print('all blurb words = ', all_blurb_words)

all isbn =  38036
all author =  15196
all year =  81
all publisher =  2909
all title words =  23815
all blurb words =  127185


In [10]:
def get_inputs():
    # 用户特征输入
    user_id = keras.layers.Input(shape=(1,), dtype='int32', name='user_id_input')
    user_location = keras.layers.Input(shape=(3,), dtype='int32', name='user_location_input')
    
    # 书籍特征输入
    book_isbn = keras.layers.Input(shape=(1,),  dtype='int32', name='book_isbn_input')
    book_author = keras.layers.Input(shape=(1,),  dtype='int32', name='book_author_input')
    book_year = keras.layers.Input(shape=(1,),  dtype='int32', name='book_year_input')
    book_publisher = keras.layers.Input(shape=(1,),  dtype='int32', name='book_publisher_input')  
    book_title = keras.layers.Input(shape=(15, ), dtype='int32', name='book_title_input')
    book_blurb = keras.layers.Input(shape=(200, ), dtype='int32', name='book_blurb_input')
    return user_id, user_location, book_isbn, book_author, book_year, book_publisher, book_title, book_blurb

In [11]:
# 嵌入矩阵的维度
embed_dim = 16
embed_dim_words = 32

In [12]:
def user_embed_layer(u_id, u_loca):
    user_id_embedd = keras.layers.Embedding(all_user, embed_dim, name='user_id_embedding')(u_id)
    user_loca_embedd = keras.layers.Embedding(all_location, embed_dim , name='user_loca_embedding')(u_loca)
    return user_id_embedd, user_loca_embedd

In [13]:
def book_emded_layer(b_isbn, b_atuhor, b_year, b_publisher, b_title, b_blurb):
    book_isbn_embedd = keras.layers.Embedding(all_isbn, embed_dim, name='book_isbn_embedding')(b_isbn)
    book_author_embedd = keras.layers.Embedding(all_author, embed_dim, name='book_author_embedding')(b_atuhor)
    book_year_embedd = keras.layers.Embedding(all_year, embed_dim, name='book_year_embedding')(b_year)
    book_publisher_embedd = keras.layers.Embedding(all_publisher, embed_dim, name='book_publisher_embedding')(b_publisher)
    
    book_title_embedd = keras.layers.Embedding(all_title_words, embed_dim_words, name='book_title_embedding')(b_title)
    book_blurb_embedd = keras.layers.Embedding(all_blurb_words, embed_dim_words, name='book_blurb_embedding')(b_blurb)
    return book_isbn_embedd, book_author_embedd, book_year_embedd, book_publisher_embedd, book_title_embedd, book_blurb_embedd

In [14]:
def get_user_feature(u_id_embedd, u_loca_embedd):
    u_id_layer = keras.layers.Dense(64, activation='relu', name='u_id_dense')(u_id_embedd)
    # u_id_layer.shape = (?, 1, 64)
    # u_loca_layer.shape = (?, 64)
    # 这里可以再加个Dense
    u_loca_layer = keras.layers.LSTM(32, go_backwards=False, name='u_loca_lstm')(u_loca_embedd)
    u_loca_layer_lstm = keras.layers.Dense(64, activation='relu', name='u_loca_layer_lstm')(u_loca_layer)
    u_id_reshape = keras.layers.Reshape([64])(u_id_layer)
    u_combine = keras.layers.concatenate([u_id_reshape, u_loca_layer_lstm],axis=1, name='u_combine')
    print(u_combine.shape)
    # 这里能不能用激活函数
    u_feature_layer = keras.layers.Dense(200, name='u_feature_layer')(u_combine)
    print(u_feature_layer.shape)
    return u_feature_layer

In [15]:
b_dense = 16
def get_book_feature(b_isbn_embedd, b_author_embedd, b_year_embedd, b_publisher_embedd, b_title_embedd, b_blurb_embedd):
    # 首先对前4个特征连接Dense层
    b_isbn_dense = keras.layers.Dense(b_dense, activation='relu', name='b_isbn_dense')(b_isbn_embedd)
    b_author_dense = keras.layers.Dense(b_dense, activation='relu', name='b_author_dense')(b_author_embedd)
    b_year_dense = keras.layers.Dense(b_dense, activation='relu', name='b_year_dense')(b_year_embedd)
    b_publisher_dense = keras.layers.Dense(b_dense, activation='relu', name='b_publisher_dense')(b_publisher_embedd)
    # 合并这四个特征,  b_combine_four shape = (?, 1, 64)
    b_combine_four = keras.layers.concatenate([b_isbn_dense, b_author_dense, b_year_dense, b_publisher_dense], name='b_four_combine')
    print('b_combine_four.shape', b_combine_four.shape)
    # 对title进行卷积
    b_title_reshape = keras.layers.Lambda(lambda layer: tf.expand_dims(layer, 3))(b_title_embedd)  # shape=(?,15, 32, 1)
    print('b_title_reshape.shape = ', b_title_reshape.shape)
    b_title_conv = keras.layers.Conv2D(filters=8, kernel_size=(2, embed_dim_words), strides=1)(b_title_reshape)# shape=(?, 14, 1, 8)
    b_title_pool = keras.layers.MaxPool2D(pool_size=(14, 1), strides=1)(b_title_conv) # shape=(?,1, 1, 8)
    print('b_title_conv.shape = ', b_title_conv)
    print('b_title_pool.shape = ', b_title_pool)
    
    # 对blurb进行处理
    b_blurb_lstm = keras.layers.LSTM(32, name='b_blurb_lstm')(b_blurb_embedd) # shape = (?, 32)
    print('b_blurb_lstm.shape = ', b_blurb_lstm.shape)
    # 将title和blurb合并
    b_title_reshape = keras.layers.Reshape([b_title_pool.shape[3]])(b_title_pool)
    # b_combine_blurb_title.shape = (?, 40)
    b_combine_blurb_title = keras.layers.concatenate([b_title_reshape, b_blurb_lstm], axis=1, name='b_combine_blurb_title')
    print('b_combine_blurb_title.shape', b_combine_blurb_title.shape)
    b_blurb_title_dense = keras.layers.Dense(64, activation='relu', name='b_blurb_title_dense')(b_combine_blurb_title)
    # b_combine_four_reshape shape = (?, 64)
    b_combine_four_reshape = keras.layers.Reshape([b_combine_four.shape[2]], name='b_combine_four_reshape')(b_combine_four)
    # 合并所有的书籍特征
    b_combine_book = keras.layers.concatenate([b_combine_blurb_title, b_combine_four_reshape], axis=1, name='b_combine_book')
    # 得到书籍矩阵
    b_feature_layer = keras.layers.Dense(200, name='b_feature_layer')(b_combine_book)
    return b_feature_layer

In [16]:
def get_rating(user_feature, book_feature):
    multiply_layer = keras.layers.Lambda(lambda layer: tf.reduce_sum(layer[0]*layer[1], axis=1, keepdims=True), name = 'user_book_feature')((user_feature, book_feature))
    print(multiply_layer.shape)
    return multiply_layer

In [17]:
m = len(origin_DATA.features['Location'])
# 对location取3位数
loca = np.zeros((m, 3))
title = np.zeros((m, 15))
blurb = np.zeros((m, 200))
for i in range(m):
    loca[i] = np.array(origin_DATA.features['Location'][i])
    title[i] = np.array(origin_DATA.features['Title'][i])
    blurb[i] = np.array(origin_DATA.features['Blurb'][i])
print(loca[:-2])
print(title[:-2])
print(blurb[:-2])
input_features = [origin_DATA.features['User-ID'].to_numpy(), loca, 
                  origin_DATA.features['ISBN'].to_numpy(), origin_DATA.features['Author'].to_numpy(),
                 origin_DATA.features['Year'].to_numpy(), origin_DATA.features['Publisher'].to_numpy(), 
                 title, blurb]
labels = origin_DATA.labels.to_numpy()

ValueError: could not broadcast input array from shape (3) into shape (15)

In [18]:
print(input_features[1].shape)

NameError: name 'input_features' is not defined

In [19]:
MODEL_DIR = './model/'

class model_network():
    def __init__(self):
        self.batchsize = 256
        self.epoch = 5
        self.best_loss = 999
    def creat_model(self):
        user_id, user_location, book_isbn, book_author, book_year, book_publisher, book_title, book_blurb = get_inputs()
        user_id_embedd, user_loca_embedd = user_embed_layer(user_id, user_location)
        book_isbn_embedd, book_author_embedd, book_year_embedd, book_publisher_embedd, book_title_embedd, book_blurb_embedd = book_emded_layer(book_isbn, book_author, book_year, book_publisher, book_title, book_blurb)
        u_feature_layer = get_user_feature(user_id_embedd, user_loca_embedd)
        b_feature_layer = get_book_feature(book_isbn_embedd, book_author_embedd, book_year_embedd, book_publisher_embedd, book_title_embedd, book_blurb_embedd)
        multiply_layer = get_rating(u_feature_layer, b_feature_layer)
        model = keras.Model(inputs=[user_id, user_location, book_isbn, book_author, book_year, book_publisher, book_title, book_blurb],
                    outputs=[multiply_layer])
        return model
    def train_model(self):
        model = self.creat_model()
        model.compile(optimizer='adam', loss=keras.losses.mae)
        model.fit(input_features, labels, epochs=5, batch_size=512)
        print(model.summary())

In [20]:
m = model_network()
m.train_model()

(None, 128)
(None, 200)
b_combine_four.shape (None, 1, 64)
b_title_reshape.shape =  (None, 15, 32, 1)
b_title_conv.shape =  Tensor("conv2d/Identity:0", shape=(None, 14, 1, 8), dtype=float32)
b_title_pool.shape =  Tensor("max_pooling2d/Identity:0", shape=(None, 1, 1, 8), dtype=float32)
b_blurb_lstm.shape =  (None, 32)
b_combine_blurb_title.shape (None, 40)
(None, 1)


NameError: name 'input_features' is not defined

In [21]:
model = m.creat_model()
model.summary()

(None, 128)
(None, 200)
b_combine_four.shape (None, 1, 64)
b_title_reshape.shape =  (None, 15, 32, 1)
b_title_conv.shape =  Tensor("conv2d_1/Identity:0", shape=(None, 14, 1, 8), dtype=float32)
b_title_pool.shape =  Tensor("max_pooling2d_1/Identity:0", shape=(None, 1, 1, 8), dtype=float32)
b_blurb_lstm.shape =  (None, 32)
b_combine_blurb_title.shape (None, 40)
(None, 1)
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
book_title_input (InputLayer)   [(None, 15)]         0                                            
__________________________________________________________________________________________________
book_title_embedding (Embedding (None, 15, 32)       762080      book_title_input[0][0]           
__________________________________________________________________________________________________
lambda_1 (Lambda)

In [22]:
keras.utils.plot_model(model, to_file='model_1.png', show_shapes=True, show_layer_names=True)

Failed to import pydot. You must install pydot and graphviz for `pydotprint` to work.
