In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from collections import Counter
import tensorflow as tf
from tensorflow.keras import layers,models

import os
import pickle
import re
from tensorflow.python.ops import math_ops

In [234]:
# 读取用户数据：UserID，性别，年龄，职业ID，邮编’
#年龄数字代表的是年龄段：1: "18岁以下"；18: "18-24"；25: "25-34"；35: "35-44"；45: "45-49"；50: "50-55"；56: "56+"
# 每个职业ID对应一种职业
# 这里我们用不到邮编数据
users_title = ['UserID','Gender','Age','JobID','Zip-code']
users = pd.read_table('/Users/zhenwuzhou/.keras/datasets/ml-1m/users.dat',sep='::',header=None,names=users_title,engine='python')
users = users.filter(regex='UserID|Gender|Age|JobID') 
users_orig = users.values # 这个是将users的Value值转换成numberArray的数据
users.head(3)

Unnamed: 0,UserID,Gender,Age,JobID
0,1,F,1,10
1,2,M,56,16
2,3,M,25,15


In [235]:
# 将Gender中的'F'变为0，'M'变为1
gender_map = {'F':0,'M':1}
users['Gender'] = users['Gender'].map(gender_map)
users['Gender']

0       0
1       1
2       1
3       1
4       1
       ..
6035    0
6036    0
6037    0
6038    0
6039    1
Name: Gender, Length: 6040, dtype: int64

In [236]:
set([18,23,56,47,56,27,18,23]) # 速出为{18, 23, 27, 47, 56}一个有序且不重复的数据集合
for ii ,val in enumerate({18, 23, 27, 47, 56}): # 会遍历每个元素，并且ii为此元素的位置，val为此元素的值
    print("位置为:%d的元素是%d"%(ii,val))

位置为:0的元素是47
位置为:1的元素是18
位置为:2的元素是23
位置为:3的元素是56
位置为:4的元素是27


In [13]:
for ii,val in enumerate(set(users['Age'])):
    if ii == 0:
        print(ii,"的值有：",val)
# 

0 的值有： 1


In [237]:
# 读取电影数据:电影ID，电影名称，电影风格
# MovieID是类别字段，Title是文本，Genres也是类别字段
movies_title = ['MovieID','Title','Genres']
movies = pd.read_table('/Users/zhenwuzhou/.keras/datasets/ml-1m/movies.dat',sep='::',header=None,names=movies_title,engine='python')
movies_orig = movies.values
movies.head(3)

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [13]:
#compile 函数用于编译正则表达式，生成一个正则表达式（ Pattern ）对象，供 match() 和 search() 这两个函数使用。
pattern = re.compile(r'^(.*)\((\d+)\)$') # 相当于把最后一个括号前的字符串当做group(1),最后一个括号里面的数字当做group(2)
m = pattern.match("(1995)Toy Story (1995)") # m的值为<re.Match object; span=(0, 22), match='(1995)Toy Story (1995)'>
m.group(0) # '(1995)Toy Story (1995)'
m.group(1) # '(1995)Toy Story '
m.group(2) # '1995'

'1995'

In [14]:
#将Movies中Title的年份去掉
pattern = re.compile(r'(.*)\((\d+)\)$') # '(第一个字符串)((第二个字符串左右两边要有括号且在最后一个货号内))'
title_map1 = {val:pattern.match(val).group(1) for ii,val in enumerate(set(movies['Title']))}
movies['Title'] = movies['Title'].map(title_map1)
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story,Animation|Children's|Comedy
1,2,Jumanji,Adventure|Children's|Fantasy
2,3,Grumpier Old Men,Comedy|Romance
3,4,Waiting to Exhale,Comedy|Drama
4,5,Father of the Bride Part II,Comedy


In [15]:
# 将title转换成数字字典，既用表征向量表示
title_set = set()
for val in movies['Title'].str.split(): # 将名字拆分成字符串数组：如[Toy, Story]；
#     print(val) # 每一个val对应一个['Grumpier', 'Old', 'Men']
    title_set.update(val) # 将val中没有在title_set中的单词添加到title_set中
title_set.add('<PAD>')
# 将电影标题分割的词，转换成了字典的形式
title2int = {val:ii for ii,val in enumerate(title_set)}#{'Takes': 0, 'Woke': 1,'Midnight': 2,...}
title2int

{'Ivy:': 0,
 'Black': 1,
 'Monsters': 2,
 'Presidents': 3,
 'Senses': 4,
 'Mononoke,': 5,
 'Sneakers': 6,
 'Videotape': 7,
 'Freejack': 8,
 'Follow': 9,
 'Bleeding': 10,
 'Measures': 11,
 'Hime)': 12,
 'Carmen': 13,
 'Silence': 14,
 'Jungle,': 15,
 'Critical': 16,
 '(Andy': 17,
 '(Oeil': 18,
 'Urban': 19,
 'Assault': 20,
 'Fletch': 21,
 'qiao)': 22,
 'Lover': 23,
 'ans': 24,
 'Violets': 25,
 'Manuscript,': 26,
 'alles': 27,
 'Bergerac': 28,
 'Addams': 29,
 'Cops': 30,
 '(Un': 31,
 "'Em": 32,
 'Rapture,': 33,
 'in': 34,
 'to': 35,
 'Raiders': 36,
 'Soft': 37,
 'Thoughts': 38,
 'Shift': 39,
 'Shines': 40,
 'Tar': 41,
 'Tail:': 42,
 'Clan': 43,
 'Vs.': 44,
 'Prick': 45,
 'Ronin': 46,
 'Girl?': 47,
 'Coast,': 48,
 'Irresistible': 49,
 'Knot': 50,
 'Paulie': 51,
 'Eyes': 52,
 'liebt': 53,
 'Desire': 54,
 'Spindell': 55,
 'Sweet': 56,
 'Election': 57,
 'Grauens)': 58,
 'Dolittle': 59,
 'Day': 60,
 'mera': 61,
 'Book': 62,
 '(Her': 63,
 'Stefano': 64,
 'Future,': 65,
 'Cousin': 66,
 'Myers': 

In [16]:
for ii ,val in enumerate(set(movies['Title'])):
    print(val)
# 输出结果如下
#Close Shave, A 
# Return to Oz 
# Funhouse, The 
# Children of Paradise (Les enfants du paradis) 
# What Lies Beneath 

Great Day in Harlem, A 
Persuasion 
Richard III 
Empty Mirror, The 
Bean 
M. Butterfly 
Breaks, The 
Oliver! 
He Got Game 
Girl on the Bridge, The (La Fille sur le Pont) 
Jules and Jim (Jules et Jim) 
Maximum Overdrive 
Devil's Advocate, The 
Judy Berlin 
It's a Wonderful Life 
Curtis's Charm 
Delta of Venus 
Mosquito Coast, The 
Meatballs Part II 
Hot Spot, The 
General's Daughter, The 
Celebrity 
Grand Hotel 
Little Shop of Horrors 
Greaser's Palace 
Teenage Mutant Ninja Turtles III 
Police Academy 6: City Under Siege 
Runaway 
Halloween 
Inkwell, The 
They Bite 
First Knight 
Old Yeller 
Bronx Tale, A 
Omega Code, The 
Net, The 
Nell 
Freeway 
Six Ways to Sunday 
Whatever It Takes 
Mask, The 
Heathers 
Clay Pigeons 
My Chauffeur 
Twelfth Night 
Stranger in the House 
Othello 
Meatballs 
Ladybird Ladybird 
Girl 6 
Scream 2 
Mr. & Mrs. Smith 
Quest for Camelot 
Barb Wire 
One Flew Over the Cuckoo's Nest 
Fatal Beauty 
Night of the Comet 
Last of the High Kings, The (a.k.a. Summer Flin

Atlantic City 
Entrapment 
Patton 
Buddy Boy 
Commandments 
Savage Nights (Nuits fauves, Les) 
American Psycho 
U.S. Marshalls 
Home Alone 
Treasure of the Sierra Madre, The 
Tampopo 
Otello 
Name of the Rose, The 
Switchback 
Dead Men Don't Wear Plaid 
Searching for Bobby Fischer 
Apt Pupil 
From the Journals of Jean Seberg 
For Ever Mozart 
Coming Apart 
Fandango 
Love Bewitched, A (El Amor Brujo) 
Land Girls, The 
Mighty, The 
Hot Lead and Cold Feet 
Cosi 
Tales From the Crypt Presents: Demon Knight 
Panther 
Boys from Brazil, The 
Gay Deceivers, The 
Thing From Another World, The 
City of the Living Dead (Paura nella citt� dei morti viventi) 
Striking Distance 
Mascara 
Auntie Mame 
Young and Innocent 
Hype! 
Bridge on the River Kwai, The 
Invisible Man, The 
Repulsion 
Postino, Il (The Postman) 
Touch 
Elizabeth 
Brain That Wouldn't Die, The 
Boat, The (Das Boot) 
Honey, I Shrunk the Kids 
Nightmare on Elm Street, A 
Caught Up 
Henry: Portrait of a Serial Killer 
New Rose Hotel 
T

In [17]:
for row in "Children of Paradise (Les enfants du paradis)".split():
    print(row,":",title2int[row])

Children : 4383
of : 698
Paradise : 3022
(Les : 1208
enfants : 194
du : 4801
paradis) : 4045


In [233]:
# 将电影Title转成等长的数字列表，长度15
title_count = 15
# title循环中的val为：“Close Shave, A”  ,tile2int[row]为每一个key为‘Close’，‘Shave,’，‘A’时在title2int中的value(一个数字)
# 最后的val是一组数字组成的数组
title_map2 = {val:[title2int[row] for row in val.split()] for ii ,val in enumerate(set(movies['Title']))}

for key in title_map2: # key就是这样的每一个"Close Shave, A","Return to Oz "电影的名称
    # print(title) # "Close Shave, A"
    # print(title_map2[key]) # [4066, 3272, 3331]
    for cnt in range(title_count-len(title_map2[key])):
        # print(cnt) # 0,1,2这样的数字
        #print(title2int['<PAD>']) #2412
        title_map2[key].insert(len(title_map2[key])+cnt,title2int['<PAD>'])
    # print(title_map2[key]) # [4066, 3272, 3331, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412]
    # 每一次循环就是将不到长度不到15的整型数组补全至15，不到的地方用title2int['<PAD>']=2412补上

movies['Title'] = movies['Title'].map(title_map2)
movies.head(1)

TypeError: unhashable type: 'list'

In [238]:
# 将电影类型转换成数字字典
genres_set = set()
for val in movies['Genres'].str.split('|'):
    genres_set.update(val)
genres_set.add('<PAD>')
genres2int = {val:ii for ii,val in enumerate(genres_set)}

max_genre_length = max(genres2int.values()) # 最多有多少个类型，'<PAD>'也算一个
# 将电影类型转换成等长的数字列表，长度为18
genres_map = {val:[genres2int[row] for row in val.split('|')] for ii,val in enumerate(set(movies['Genres']))}
for key in genres_map:
#     print(genres_map[key])#[8, 9, 12]
    for cnt in range(max_genre_length - len(genres_map[key])):
        genres_map[key].insert(len(genres_map[key])+cnt,genres2int['<PAD>'])
#     print(genres_map[key]) #[8, 9, 12, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
movies['Genres'] = movies['Genres'].map(genres_map)
movies.head(5)

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),"[8, 16, 9, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."
1,2,Jumanji (1995),"[5, 16, 12, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3..."
2,3,Grumpier Old Men (1995),"[9, 15, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."
3,4,Waiting to Exhale (1995),"[9, 14, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."
4,5,Father of the Bride Part II (1995),"[9, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."


In [239]:
#读取评分数据：用户ID，电影ID，评分，时间戳
# 用户ID取值在1-6040
# 电影ID取值在1-3952
# 评分采用5星制：只有整数星级
# 时间采用以秒为单位，从1970-01-01 00:00:00（UTC 为标准的时间）开始算：当然这里我们用不到时间戳字段
# 每一个用户至少需要用20条评分
ratings_title = ['UserID','MovieID','Rating','timestamps']
ratings = pd.read_table('/Users/zhenwuzhou/.keras/datasets/ml-1m/ratings.dat',sep='::',header=None,names=ratings_title,engine='python')
ratings = ratings.filter(regex='UserID|MovieID|Rating')
ratings.head(100)

Unnamed: 0,UserID,MovieID,Rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
95,2,2490,3
96,2,1834,4
97,2,3471,5
98,2,589,4


In [138]:
# 下面要对数据进行一些预处理操作
# UserID,职业ID和MovieID不用变
# Age字段，转换成7个连续的数字表示0-6：每个代表一个年龄段
# 电影风格Genres字段要转换成数字：首先将电影风格中的每种风格对应的描述词语在One-Hot表示的字典中去查询出来，
# 每个风格单词就对应一个One-Hot表征向量，把所有风格的One-Hot表征向量相加就是这个电影的风格组合描述
# 电影名称的title字段处理方式和电影风格类似，（另外title中的年份需要去掉，年份不应该加到标题特征中）
# 电影风格和电影名称要使用同一个one-hot词典中表示成相同长度的表征向量，方便在神经网络中处理；空白部分用'<PAD>'对应的数字填充
def load_data():
    #================================================User数据处理相关================================================
    #读取User数据
    users_title = ['UserID','Gender','Age','JobID','Zip-code']
    users = pd.read_table('/Users/zhenwuzhou/.keras/datasets/ml-1m/users.dat',sep='::',header=None,names=users_title,engine='python')
    users = users.filter(regex = 'UserID|Gender|Age|JobID')
    users_orig = users.values
    
    # 改变User数据中的性别和年龄
    gender_map = {'F':0,'M':1}
    users['Gender'] = users['Gender'].map(gender_map)
    
    age_map = {val:ii for ii,val in enumerate(set(users['Age']))}
    users['Age'] = users['Age'].map(age_map)
    #     print(users.head(3))
    
    #================================================Movie数据处理相关================================================
    #读取Movie数据集
    movies_title = ['MovieID', 'Title', 'Genres']
    movies = pd.read_table('/Users/zhenwuzhou/.keras/datasets/ml-1m/movies.dat',sep='::',header=None,names=movies_title,engine='python')
    movies_orig = movies.values
    
    #将title中的年份去掉
    pattern = re.compile(r'(.*)\((\d+)\)$') # '(第一个字符串)((第二个字符串左右两边要有括号且在最后一个货号内))'
    
    title_map = {val:pattern.match(val).group(1) for ii,val in enumerate(set(movies['Title']))}
    movies['Title'] = movies['Title'].map(title_map)
    
    #将电影名称转成数字字典中的向量
    # 将title转换成数字字典，既用表征向量表示
    title_set = set()
    for val in movies['Title'].str.split(): # 将名字拆分成字符串数组：如[Toy, Story]；
    #     print(val) # 每一个val对应一个['Grumpier', 'Old', 'Men']
        title_set.update(val) # 将val中没有在title_set中的单词添加到title_set中
    title_set.add('<PAD>')
    # 将电影标题分割的词，转换成了字典的形式
    title2int = {val:ii for ii,val in enumerate(title_set)}#{'Takes': 0, 'Woke': 1,'Midnight': 2,...}
    
    # 将电影Title转成等长的数字列表，长度15
    title_count = 15
    # title循环中的val为：“Close Shave, A”  ,tile2int[row]为每一个key为‘Close’，‘Shave,’，‘A’时在title2int中的value(一个数字)
    # 最后的val是一组数字组成的数组
    title_map2 = {val:[title2int[row] for row in val.split()] for ii ,val in enumerate(set(movies['Title']))}

    for key in title_map2: # key就是这样的每一个"Close Shave, A","Return to Oz "电影的名称
        # print(title) # "Close Shave, A"
        # print(title_map2[key]) # [4066, 3272, 3331]
        for cnt in range(title_count-len(title_map2[key])):
            # print(cnt) # 0,1,2这样的数字
            #print(title2int['<PAD>']) #2412
            title_map2[key].insert(len(title_map2[key])+cnt,title2int['<PAD>'])
        # print(title_map2[key]) # [4066, 3272, 3331, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412]
        # 每一次循环就是将不到长度不到15的整型数组补全至15，不到的地方用title2int['<PAD>']=2412补上
    movies['Title'] = movies['Title'].map(title_map2)
    
    
    # 将电影类型转换成数字字典
    genres_set = set()
    for val in movies['Genres'].str.split('|'):
        genres_set.update(val)
    genres_set.add('<PAD>')
    genres2int = {val:ii for ii,val in enumerate(genres_set)}

    max_genre_length = max(genres2int.values()) # 最多有多少个类型，'<PAD>'也算一个
    # 将电影类型转换成等长的数字列表，长度为18
    genres_map = {val:[genres2int[row] for row in val.split('|')] for ii,val in enumerate(set(movies['Genres']))}
    for key in genres_map:
    #     print(genres_map[key])#[8, 9, 12]
        for cnt in range(max_genre_length - len(genres_map[key])):
            genres_map[key].insert(len(genres_map[key])+cnt,genres2int['<PAD>'])
    #     print(genres_map[key]) #[8, 9, 12, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
    movies['Genres'] = movies['Genres'].map(genres_map)
    
#     print(movies.head(1))
    
    #================================================评分数据处理相关================================================
    ratings_title = ['UserID','MovieID','ratings','timestamps'] # rating是作为训练输出的值
    ratings = pd.read_table('/Users/zhenwuzhou/.keras/datasets/ml-1m/ratings.dat',sep='::',header=None,names=ratings_title,engine='python')
    ratings = ratings.filter(regex='UserID|MovieID|ratings')
    
#     print(ratings.head(100))
    
    
    #================================================数据合并处理相关================================================
    #合并三个表
    data = pd.merge(pd.merge(ratings,users),movies)
    print(data.head(1))

    
    #将数据分成X和Y两张表
    target_fields = ['ratings']
    features_pd, targets_pd = data.drop(target_fields,axis=1),data[target_fields]
    
    features = features_pd.values
    targets_values = targets_pd.values
    
#     print(features.shape)
    
    #title_count：Title字段的长度（15）
    #title_set：Title文本的集合
    #genres2int：电影类型转数字的字典
    #features：是输入X
    #targets_values：是学习目标y
    #ratings：评分数据集的Pandas对象
    #users：用户数据集的Pandas对象
    #movies：电影数据的Pandas对象
    #data：三个数据集组合在一起的Pandas对象
    #movies_orig：没有做数据处理的原始电影数据
    #users_orig：没有做数据处理的原始用户数据
    return title_count, title_set, genres2int, genres_set, features, targets_values, ratings, users, movies, data, movies_orig, users_orig


# 加载数据
title_count, title_set, genres2int, genres_set, features, targets_values, ratings, users, movies, data, movies_orig, users_orig = load_data()

# 存入文件中
pickle.dump((title_count, title_set, genres2int, genres_set, features, targets_values, ratings, users, movies, data, movies_orig, users_orig), open('/Users/zhenwuzhou/.keras/datasets/preprocess.p', 'wb'))

   UserID  MovieID  ratings  Gender  Age  JobID  \
0       1     1193        5       0    0     10   

                                               Title  \
0  [2834, 2117, 2209, 2143, 849, 3037, 5020, 5020...   

                                              Genres  
0  [14, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...  


In [56]:
users.head()

Unnamed: 0,UserID,Gender,Age,JobID
0,1,0,0,10
1,2,1,5,16
2,3,1,6,15
3,4,1,2,7
4,5,1,6,20


In [57]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,"[2661, 2008, 2518, 2518, 2518, 2518, 2518, 251...","[9, 2, 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."
1,2,"[5148, 2518, 2518, 2518, 2518, 2518, 2518, 251...","[8, 2, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
2,3,"[5196, 4005, 1372, 2518, 2518, 2518, 2518, 251...","[10, 13, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3..."
3,4,"[3725, 2039, 921, 2518, 2518, 2518, 2518, 2518...","[10, 6, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."
4,5,"[1592, 4628, 2018, 4647, 2270, 3370, 2518, 251...","[10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."


In [51]:
movies.values[0]

array([1,
       list([2661, 2008, 2518, 2518, 2518, 2518, 2518, 2518, 2518, 2518, 2518, 2518, 2518, 2518, 2518]),
       list([9, 2, 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])],
      dtype=object)

In [None]:
#数据预处理工作已经完成，下面就用预处理完的数据进行建模

In [2]:
title_count, title_set, genres2int, genres_set, features, targets_values, ratings, users, movies, data, movies_orig, users_orig = pickle.load(open('/Users/zhenwuzhou/.keras/datasets/preprocess.p', mode='rb'))

In [219]:
targets_values = targets_values.reshape(targets_values.shape[0])

In [220]:
targets_values.shape

(1000209,)

In [223]:
targets_values

array([5, 5, 4, ..., 1, 5, 4])

In [8]:
data.head(100)#UserID	MovieID	ratings	Gender	Age	JobID	Title	Genres

Unnamed: 0,UserID,MovieID,ratings,Gender,Age,JobID,Title,Genres
0,1,1193,5,0,0,10,"[930, 4566, 5136, 3249, 4918, 3847, 1538, 1538...","[12, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1..."
1,2,1193,5,1,5,16,"[930, 4566, 5136, 3249, 4918, 3847, 1538, 1538...","[12, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1..."
2,12,1193,4,1,6,12,"[930, 4566, 5136, 3249, 4918, 3847, 1538, 1538...","[12, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1..."
3,15,1193,4,1,6,7,"[930, 4566, 5136, 3249, 4918, 3847, 1538, 1538...","[12, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1..."
4,17,1193,5,1,3,1,"[930, 4566, 5136, 3249, 4918, 3847, 1538, 1538...","[12, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1..."
...,...,...,...,...,...,...,...,...
95,329,1193,4,1,1,7,"[930, 4566, 5136, 3249, 4918, 3847, 1538, 1538...","[12, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1..."
96,331,1193,4,1,6,7,"[930, 4566, 5136, 3249, 4918, 3847, 1538, 1538...","[12, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1..."
97,332,1193,5,1,3,1,"[930, 4566, 5136, 3249, 4918, 3847, 1538, 1538...","[12, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1..."
98,333,1193,3,1,1,2,"[930, 4566, 5136, 3249, 4918, 3847, 1538, 1538...","[12, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1..."


In [96]:
print(features.shape)
features[0]#UserID,MovieID,Gender,Age,JobID,Title,Genres

(1000209, 7)


array([1, 1193, 0, 0, 10,
       list([930, 4566, 5136, 3249, 4918, 3847, 1538, 1538, 1538, 1538, 1538, 1538, 1538, 1538, 1538]),
       list([12, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18])],
      dtype=object)

In [4]:
#处理User的信息来构建生成用户特征向量的模型
user_embed_dim = 32

#==================用户Uid信息处理
uid_array = features.take(0,1) # array([1, 2, 12, ..., 5780, 5851, 5938], dtype=object)
uid_max = max(uid_array)+1 # 6040+1 用户Id的个数
uid_inputs = layers.Input(shape=())
embedded_uid = layers.Embedding(uid_max,user_embed_dim,input_length=1)(uid_inputs) # 嵌入后的uid
flattened_embedded_uid = layers.Flatten()(embedded_uid) # 打平

#==================用户性别信息处理
gender_array = features.take(2,1)# array([0, 1, 1, ..., 1, 0, 1], dtype=object)
gender_max = max(gender_array)+1 # 1+1=2 用户性别个数
gender_inputs = layers.Input(shape=())
embedded_gender = layers.Embedding(gender_max,user_embed_dim,input_length=1)(gender_inputs) # 嵌入后的性别
flattened_embedded_gender = layers.Flatten()(embedded_gender)

#==================用户年龄信息处理
age_class_array = features.take(3,1) # array([0, 5, 6, ..., 4, 4, 6], dtype=object)
age_class_max = max(age_class_array)+1 # 6+1 年龄类别个数
age_class_inputs = layers.Input(shape=())
embedded_age_class = layers.Embedding(age_class_max,user_embed_dim,input_length=1)(age_class_inputs) # 嵌入后的年龄类别
flattened_embedded_age_class = layers.Flatten()(embedded_age_class)

#==================用户职业信息处理
job_array = features.take(4,1) # array([10, 16, 12, ..., 17, 20, 1], dtype=object)
job_max = max(job_array)+1 # 20+1 职业个数
job_inputs = layers.Input(shape=())
embedded_job = layers.Embedding(job_max,user_embed_dim,input_length=1)(job_inputs)#嵌入后的职业
flattened_embedded_job =  layers.Flatten()(embedded_job)

#==================将打平后的嵌入式信息进行连接:(1000209, 128)
user_concatenate = layers.concatenate(
    [flattened_embedded_uid,flattened_embedded_gender,flattened_embedded_age_class,flattened_embedded_job])

#==================进行全连接的神经网络搭建
user_feature_layer1 = layers.Dense(200)(user_concatenate)
user_final_layer = layers.Dense(32)(user_feature_layer1)

#==================最后构建出用户特征提取的模型：模型最终将输出（1000209, 32）的矩阵数据
model_user_feature = models.Model(inputs=[uid_inputs,gender_inputs,age_class_inputs,job_inputs],outputs=user_final_layer)

In [5]:
out = model_user_feature.predict([uid_array,gender_array,age_class_array,job_array])
print(out.shape)

(1000209, 32)


In [201]:
uid_array =  uid_array.reshape(uid_array.shape[0],1)
uid_array.shape

(1000209, 1)

In [25]:
def list_to_array(list):
    array =[]
    for val in list:
        array.append(np.array(val,dtype=int))
    return np.array(array,dtype=int)

# 先把这个放在这里，因为比较耗时，最终要写到模型方法里面去
movie_title_array = list_to_array(features.take(5,1)) # 注意这里比较坑，需要把list转换成array


In [186]:
movie_title_array.shape

(1000209, 15)

In [7]:
movie_genre_array = list_to_array(features.take(6,1)) # 注意这里同样要把list转换成array

In [103]:
movie_genre_array.shape

(1000209, 18)

In [8]:
# 处理电影数据来构建生成电影特征向量的模型
movie_embed_dim = 32

#===========电影ID信息处理
movie_id_array = features.take(1,1) # array([1193, 1193, 1193, ..., 2845, 3607, 2909], dtype=object)
movie_id_max = max(movie_id_array)+1 #3952+1
movie_id_inputs = layers.Input(shape=())
embedded_movie_id =  layers.Embedding(movie_id_max,movie_embed_dim,input_length=1)(movie_id_inputs)
flattened_embedded_movie_id = layers.Flatten()(embedded_movie_id)



#===========电影名称信息处理：如果是标题需要卷积加池化，如果是电影标题特征分词那么可以不用卷积池化
#movie_title_array = list_to_array(features.take(5,1)) # 注意这里比较坑，需要把list转换成array
max_movie_title = len(title_set) # 5215
movie_title_input_length = len(movie_title_array[0]) # 15
movie_title_inputs = layers.Input(shape=(movie_title_input_length,))
embedded_movie_title = layers.Embedding(max_movie_title,movie_embed_dim,input_length=movie_title_input_length)(movie_title_inputs)
# 要进行卷积操作需要将shape(None,15,32)变为shape(None,15,32，1)
conv_inputs = layers.Reshape((movie_title_input_length,movie_embed_dim,1))(embedded_movie_title)
movie_title_window_sizes = {2, 3, 4, 5} # 文本卷积滑动窗口，分别滑动2, 3, 4, 5个单词
movie_title_conv_filter_num = 8 # 卷积操作时需要的过滤器个数
conv_pool_movie_title_layer_list = [] # 卷积池化后的每一层
for window_size in movie_title_window_sizes:
    conv_layer = layers.Conv2D(filters=movie_title_conv_filter_num,
                  kernel_size=(window_size,movie_embed_dim),activation='relu')(conv_inputs)
    
    # 在执行完文本卷积操作后，length变为了（movie_title_input_length-window_size+1）
    pool_layer = layers.MaxPool2D(pool_size=(movie_title_input_length-window_size+1,1))(conv_layer)
    
    conv_pool_movie_title_layer_list.append(pool_layer)
    
#将池化后的结果连接在一起：由4个(None, 1, 1, 8)变为(None, 1, 1, 32)
movie_title_concatenate_layer = layers.concatenate(conv_pool_movie_title_layer_list)
# 最终打平后的电影标题(None, 32)
flattened_final_movie_title = layers.Flatten()(movie_title_concatenate_layer)


#===========电影类型标签信息处理：先embeding然后把特征向量做加和,最后生成一个(None, 32)的嵌入表征来代表18个标签的组合值
# movie_genre_array = list_to_array(features.take(6,1)) # 注意这里同样要把list转换成array
max_movie_genre = len(genres_set) # 19
movie_genre_input_length = len(movie_genre_array[0]) #18
# 先embeding，然后把特征向量做加和
movie_genre_inputs = layers.Input(shape=(movie_genre_input_length,))
embedded_movie_genre = layers.Embedding(max_movie_genre,movie_embed_dim,input_length=movie_genre_input_length)(movie_genre_inputs)
# 将所有嵌入后的特征：注意axis=1，最后shape=(None, 32)，因为每32个代表一个类型标签，我们要把18个类型标签整合成一个新的32维向量，
#这个新的32维向量包含了18个特征的值
sumed_embedded_movie_genre = tf.reduce_sum(embedded_movie_genre,axis=1,keepdims=False)


#===========将电影ID、电影名称、电影类型标签进行整合:shape=(None, 96)
movie_concatenate = layers.concatenate([flattened_embedded_movie_id,flattened_final_movie_title,sumed_embedded_movie_genre])

#===========进行全连接的神经网络搭建
movie_feature_layer1= layers.Dense(200)(movie_concatenate)
movie_final_layer = layers.Dense(32)(movie_feature_layer1)


model_movie_feature = models.Model(inputs=[movie_id_inputs,movie_title_inputs,movie_genre_inputs],outputs=movie_final_layer)

In [9]:
out = model_movie_feature.predict([movie_id_array, movie_title_array,movie_genre_array])
print(out.shape)

(1000209, 32)


In [10]:
movie_id_array.shape

(1000209,)

In [32]:
#最后整合模型：用用户特征与电影特征结合起来构成完整的评分模型
#用户特征与电影特征相乘
layerMultiply = layers.multiply([model_user_feature.output,model_movie_feature.output])
#用相乘之后的值再相加:注意这里一定要keepdims=True，不然输出shape为（None，），这样在梯度下降的时候会有错误
final_rate_layer = tf.reduce_sum(layerMultiply,axis=1,keepdims=True)
# final_rate_layer = layers.Dense(1)(layerMultiply)


#构建最终的评分预测模型
rate_predict_model = models.Model(inputs=[model_user_feature.inputs,model_movie_feature.inputs],outputs=final_rate_layer)

rate_predict_model.compile(optimizer=tf.keras.optimizers.Adam(),loss='mse',metrics=['acc'])

In [23]:
rate_predict = rate_predict_model.predict([uid_array,gender_array,age_class_array,job_array,movie_id_array, movie_title_array,movie_genre_array])
rate_predict.shape

(1000209, 1)

In [24]:
rate_predict

array([[-2.978444 ],
       [-3.4054985],
       [-3.355185 ],
       ...,
       [-1.6966317],
       [-2.0689936],
       [-1.7256342]], dtype=float32)

In [20]:
targets_values.shape

(1000209, 1)

In [61]:
# 取50000条数据作为测试数据
test_size = 50000
random_state = 666
train_uid_array,test_uid_array = train_test_split(uid_array,test_size=test_size,random_state=random_state)
train_gender_array,test_gender_array = train_test_split(gender_array,test_size=test_size,random_state=random_state)
train_age_class_array,test_age_class_array = train_test_split(age_class_array,test_size=test_size,random_state=random_state)
train_job_array,test_job_array = train_test_split(job_array,test_size=test_size,random_state=random_state)
train_movie_id_array,test_movie_id_array = train_test_split(movie_id_array,test_size=test_size,random_state=random_state)
train_movie_title_array,test_movie_title_array = train_test_split(movie_title_array,test_size=test_size,random_state=random_state)
train_movie_genre_array,test_movie_genre_array = train_test_split(movie_genre_array,test_size=test_size,random_state=random_state)

train_targets_values,test_targets_values = train_test_split(targets_values,test_size=test_size,random_state=random_state)

In [47]:
test_uid_array

array([4682, 5326, 1004, ..., 905, 5283, 346], dtype=object)

In [35]:
test_uid_array.shape

(50000,)

In [59]:
inputs = [train_uid_array, train_gender_array, train_age_class_array, train_job_array,
              train_movie_id_array, train_movie_title_array, train_movie_genre_array]
inputs

[array([4294, 5488, 3336, ..., 4448, 3792, 4201], dtype=object),
 array([1, 1, 1, ..., 1, 1, 0], dtype=object),
 array([0, 6, 1, ..., 6, 6, 6], dtype=object),
 array([10, 15, 17, ..., 14, 6, 0], dtype=object),
 array([2354, 685, 1346, ..., 2542, 2941, 838], dtype=object),
 array([[3084,  451, 1897, ..., 5020, 5020, 5020],
        [3218, 2319, 3836, ..., 5020, 5020, 5020],
        [ 605, 1312, 5020, ..., 5020, 5020, 5020],
        ...,
        [2926, 3393, 1162, ..., 5020, 5020, 5020],
        [4166, 3621, 5020, ..., 5020, 5020, 5020],
        [1122, 5020, 5020, ..., 5020, 5020, 5020]]),
 array([[ 8, 16,  9, ...,  3,  3,  3],
        [14,  3,  3, ...,  3,  3,  3],
        [ 0,  3,  3, ...,  3,  3,  3],
        ...,
        [ 9,  4, 18, ...,  3,  3,  3],
        [13, 15,  7, ...,  3,  3,  3],
        [ 9, 14, 15, ...,  3,  3,  3]])]

In [54]:
print(train_age_class_array)
train_age_class_array = np.array(train_age_class_array,dtype=int)
print(train_age_class_array)

[0 6 1 ... 6 6 6]
[0 6 1 ... 6 6 6]


In [None]:
train_job_array

In [56]:
inputs = [train_uid_array, train_gender_array, train_age_class_array, train_job_array,
              train_movie_id_array, train_movie_title_array, train_movie_genre_array]
inputs

[array([4294, 5488, 3336, ..., 4448, 3792, 4201]),
 array([1, 1, 1, ..., 1, 1, 0]),
 array([0, 6, 1, ..., 6, 6, 6]),
 array([10, 15, 17, ..., 14, 6, 0], dtype=object),
 array([2354, 685, 1346, ..., 2542, 2941, 838], dtype=object),
 array([[3084,  451, 1897, ..., 5020, 5020, 5020],
        [3218, 2319, 3836, ..., 5020, 5020, 5020],
        [ 605, 1312, 5020, ..., 5020, 5020, 5020],
        ...,
        [2926, 3393, 1162, ..., 5020, 5020, 5020],
        [4166, 3621, 5020, ..., 5020, 5020, 5020],
        [1122, 5020, 5020, ..., 5020, 5020, 5020]]),
 array([[ 8, 16,  9, ...,  3,  3,  3],
        [14,  3,  3, ...,  3,  3,  3],
        [ 0,  3,  3, ...,  3,  3,  3],
        ...,
        [ 9,  4, 18, ...,  3,  3,  3],
        [13, 15,  7, ...,  3,  3,  3],
        [ 9, 14, 15, ...,  3,  3,  3]])]

In [62]:
# 把训练数据都转为float类型，不然在tf2.0以下会报错
train_uid_array = np.array(train_uid_array, dtype=float)
train_gender_array = np.array(train_gender_array, dtype=float)
train_age_class_array = np.array(train_age_class_array, dtype=float)
train_job_array = np.array(train_job_array, dtype=float)
train_movie_id_array = np.array(train_movie_id_array, dtype=float)
train_movie_title_array = np.array(train_movie_title_array, dtype=float)
train_movie_genre_array = np.array(train_movie_genre_array, dtype=float)
train_targets_values = np.array(train_targets_values, dtype=float)

# 把测试数据也都转为float类型
test_uid_array = np.array(test_uid_array, dtype=float)
test_gender_array = np.array(test_gender_array, dtype=float)
test_age_class_array = np.array(test_age_class_array, dtype=float)
test_job_array = np.array(test_job_array, dtype=float)
test_movie_id_array = np.array(test_movie_id_array, dtype=float)
test_movie_title_array = np.array(test_movie_title_array, dtype=float)
test_movie_genre_array = np.array(test_movie_genre_array, dtype=float)
test_targets_values = np.array(test_targets_values, dtype=float)
history = rate_predict_model.fit(
        [train_uid_array, train_gender_array, train_age_class_array, train_job_array,
         train_movie_id_array, train_movie_title_array, train_movie_genre_array],
        train_targets_values, batch_size=32, epochs=100,
        validation_data=(
            [test_uid_array, test_gender_array, test_age_class_array, test_job_array,
             test_movie_id_array, test_movie_title_array, test_movie_genre_array],
            test_targets_values))

Train on 950209 samples, validate on 50000 samples
Epoch 1/100
 15040/950209 [..............................] - ETA: 2:27 - loss: 0.9834 - acc: 0.0570

KeyboardInterrupt: 

In [None]:
# TensorBoard的使用
#设定格式化模型名称，以时间戳作为标记
model_name = model.name.format(int(time.time()))
#设定存储位置，每个模型不一样的路径
tensorboard = TensorBoardcallback = tf.keras.callbacks.TensorBoard(
    log_dir='/Users/zhenwuzhou/TensorBoardLogs/tensorboardLogs'.format(model_name),
    histogram_freq=1, batch_size=32,
    write_graph=True, write_grads=False, write_images=True,
    embeddings_freq=0, embeddings_layer_names=None,
    embeddings_metadata=None, embeddings_data=None, update_freq=500
)
#使用它
model.fit(train_image, train_lable, batch_size =32, epochs=10, validation_split=0.1, callbacks=[tensorboard])