In [4]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from collections import Counter
import tensorflow as tf

import os
import pickle
import re
from tensorflow.python.ops import math_ops

In [5]:
# 读取用户数据：UserID，性别，年龄，职业ID，邮编’
#年龄数字代表的是年龄段：1: "18岁以下"；18: "18-24"；25: "25-34"；35: "35-44"；45: "45-49"；50: "50-55"；56: "56+"
# 每个职业ID对应一种职业
# 这里我们用不到邮编数据
users_title = ['UserID','Gender','Age','JobID','Zip-code']
users = pd.read_table('./datasets/ml-1m/users.dat',sep='::',header=None,names=users_title,engine='python')
users = users.filter(regex='UserID|Gender|Age|JobID') 
users_orig = users.values # 这个是将users的Value值转换成numberArray的数据
users.head(3)

Unnamed: 0,UserID,Gender,Age,JobID
0,1,F,1,10
1,2,M,56,16
2,3,M,25,15


In [6]:
# 将Gender中的'F'变为0，'M'变为1
gender_map = {'F':0,'M':1}
users['Gender'] = users['Gender'].map(gender_map)
users['Gender']

0       0
1       1
2       1
3       1
4       1
       ..
6035    0
6036    0
6037    0
6038    0
6039    1
Name: Gender, Length: 6040, dtype: int64

In [12]:
set([18,23,56,47,56,27,18,23]) # 速出为{18, 23, 27, 47, 56}一个有序且不重复的数据集合
for ii ,val in enumerate({18, 23, 27, 47, 56}): # 会遍历每个元素，并且ii为此元素的位置，val为此元素的值
    print("位置为:%d的元素是%d"%(ii,val))

位置为:0的元素是47
位置为:1的元素是18
位置为:2的元素是23
位置为:3的元素是56
位置为:4的元素是27


In [13]:
for ii,val in enumerate(set(users['Age'])):
    if ii == 0:
        print(ii,"的值有：",val)
# 

0 的值有： 1


In [14]:
# 读取电影数据:电影ID，电影名称，电影风格
# MovieID是类别字段，Title是文本，Genres也是类别字段
movies_title = ['MovieID','Title','Genres']
movies = pd.read_table('./datasets/ml-1m/movies.dat',sep='::',header=None,names=movies_title,engine='python')
movies_orig = movies.values
movies.head(3)

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [15]:
#compile 函数用于编译正则表达式，生成一个正则表达式（ Pattern ）对象，供 match() 和 search() 这两个函数使用。
pattern = re.compile(r'^(.*)\((\d+)\)$') # 相当于把最后一个括号前的字符串当做group(1),最后一个括号里面的数字当做group(2)
m = pattern.match("(1995)Toy Story (1995)") # m的值为<re.Match object; span=(0, 22), match='(1995)Toy Story (1995)'>
m.group(0) # '(1995)Toy Story (1995)'
m.group(1) # '(1995)Toy Story '
m.group(2) # '1995'

'1995'

In [16]:
#将Movies中Title的年份去掉
pattern = re.compile(r'(.*)\((\d+)\)$') # '(第一个字符串)((第二个字符串左右两边要有括号且在最后一个货号内))'
title_map1 = {val:pattern.match(val).group(1) for ii,val in enumerate(set(movies['Title']))}
movies['Title'] = movies['Title'].map(title_map1)
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story,Animation|Children's|Comedy
1,2,Jumanji,Adventure|Children's|Fantasy
2,3,Grumpier Old Men,Comedy|Romance
3,4,Waiting to Exhale,Comedy|Drama
4,5,Father of the Bride Part II,Comedy


In [17]:
# 将title转换成数字字典，既用表征向量表示
title_set = set()
for val in movies['Title'].str.split(): # 将名字拆分成字符串数组：如[Toy, Story]；
#     print(val) # 每一个val对应一个['Grumpier', 'Old', 'Men']
    title_set.update(val) # 将val中没有在title_set中的单词添加到title_set中
title_set.add('<PAD>')
# 将电影标题分割的词，转换成了字典的形式
title2int = {val:ii for ii,val in enumerate(title_set)}#{'Takes': 0, 'Woke': 1,'Midnight': 2,...}
title2int

{'Seasons,': 0,
 'Doom': 1,
 'II,': 2,
 'Pajama': 3,
 'Immortal': 4,
 'I': 5,
 'Noughts,': 6,
 'Heavy': 7,
 'Taming': 8,
 'Fletch': 9,
 'Aim�e': 10,
 'Risk': 11,
 'Newsies': 12,
 'Broomsticks': 13,
 'Incredibly': 14,
 'Bananas': 15,
 'Fe': 16,
 'Hate': 17,
 'Pather': 18,
 'Peeping': 19,
 'D�ner': 20,
 'Seven': 21,
 'Bridges': 22,
 'Factor': 23,
 'Andre': 24,
 'Bomb': 25,
 'Shots': 26,
 'Splendor': 27,
 'Tron': 28,
 '13': 29,
 'Ado': 30,
 'Surf': 31,
 'Carolina': 32,
 'Firewalker': 33,
 'Winslow': 34,
 '(Callej�n': 35,
 'There,': 36,
 'Reality': 37,
 'Seventh': 38,
 'Contempt': 39,
 'Pope': 40,
 'Rescuers': 41,
 'Defying': 42,
 'r�v�e': 43,
 'Plot': 44,
 '(Bacheha-Ye': 45,
 "(C'est": 46,
 'Zwerge': 47,
 'Oasis,': 48,
 'Calendar': 49,
 'Indochine': 50,
 'Addiction,': 51,
 'Fly,': 52,
 'vida': 53,
 'Be': 54,
 'World:': 55,
 'August,': 56,
 'Insult': 57,
 'Screwed': 58,
 'Tinseltown': 59,
 'Barb': 60,
 '(Boca': 61,
 'Living': 62,
 'Haunting,': 63,
 'Police': 64,
 'Nina': 65,
 'Aiqing': 66,

In [18]:
for ii ,val in enumerate(set(movies['Title'])):
    print(val)
# 输出结果如下
#Close Shave, A 
# Return to Oz 
# Funhouse, The 
# Children of Paradise (Les enfants du paradis) 
# What Lies Beneath 

Supernova 
Freejack 
Ashes of Time 
Tom Jones 
Trick 
Moonstruck 
Addams Family, The 
American Werewolf in Paris, An 
Man in the Iron Mask, The 
Condition Red 
Gossip 
Flatliners 
Pagemaster, The 
Jumpin' Jack Flash 
Stardust Memories 
On the Town 
Poison Ivy 
French Twist (Gazon maudit) 
Train of Life (Train De Vie) 
Breaking Away 
Sophie's Choice 
Days of Thunder 
Now and Then 
Bronx Tale, A 
I Shot Andy Warhol 
Blood Beach 
D2: The Mighty Ducks 
Thing From Another World, The 
Sesame Street Presents Follow That Bird 
High Fidelity 
Holy Man 
New York Cop 
I'll Be Home For Christmas 
Associate, The 
Clean Slate 
Blood & Wine 
Major Payne 
Donnie Brasco 
Red Rock West 
Selena 
Elstree Calling 
West Beirut (West Beyrouth) 
Things Change 
Niagara, Niagara 
Cleo From 5 to 7 (Cl�o de 5 � 7) 
I'll Do Anything 
Reluctant Debutante, The 
Rushmore 
Purple Rose of Cairo, The 
Robin Hood 
Speechless 
Boricua's Bond 
Mulan 
Last Action Hero 
Under Capricorn 
Lawnmower Man, The 
Airport 
Random He

Bad Company 
From Here to Eternity 
Soft Fruit 
Cookie's Fortune 
Great Expectations 
Instinct 
Until the End of the World (Bis ans Ende der Welt) 
Cecil B. Demented 
Vie est belle, La (Life is Rosey) 
Desert Bloom 
Night of the Living Dead 
8 Heads in a Duffel Bag 
Angel Heart 
Lord of Illusions 
Next Karate Kid, The 
Celebrity 
Rocky II 
Bait 
Lost & Found 
House on Haunted Hill, The 
Better Living 
Margaret's Museum 
Great Day in Harlem, A 
Ilsa, She Wolf of the SS 
Powder 
Chamber, The 
Railroaded! 
Criminals 
Hard Day's Night, A 
Jade 
North Dallas Forty 
Meatballs 4 
McCabe & Mrs. Miller 
Wanted: Dead or Alive 
Crocodile Dundee II 
Last of the High Kings, The (a.k.a. Summer Fling) 
I Don't Want to Talk About It (De eso no se habla) 
Heaven's Prisoners 
Ice Storm, The 
Battle of the Sexes, The 
Who Framed Roger Rabbit? 
Beyond the Mat 
Braindead 
Hell Night 
Shattered Image 
Licence to Kill 
Stars Fell on Henrietta, The 
Night to Remember, A 
Saludos Amigos 
Bed of Roses 
Gay Divo

In [19]:
for row in "Children of Paradise (Les enfants du paradis)".split():
    print(row,":",title2int[row])

Children : 2713
of : 2085
Paradise : 2717
(Les : 2894
enfants : 3360
du : 1021
paradis) : 469


In [20]:
# 将电影Title转成等长的数字列表，长度15
title_count = 15
# title循环中的val为：“Close Shave, A”  ,tile2int[row]为每一个key为‘Close’，‘Shave,’，‘A’时在title2int中的value(一个数字)
# 最后的val是一组数字组成的数组
title_map2 = {val:[title2int[row] for row in val.split()] for ii ,val in enumerate(set(movies['Title']))}

for key in title_map2: # key就是这样的每一个"Close Shave, A","Return to Oz "电影的名称
    # print(title) # "Close Shave, A"
    # print(title_map2[key]) # [4066, 3272, 3331]
    for cnt in range(title_count-len(title_map2[key])):
        # print(cnt) # 0,1,2这样的数字
        #print(title2int['<PAD>']) #2412
        title_map2[key].insert(len(title_map2[key])+cnt,title2int['<PAD>'])
    # print(title_map2[key]) # [4066, 3272, 3331, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412]
    # 每一次循环就是将不到长度不到15的整型数组补全至15，不到的地方用title2int['<PAD>']=2412补上

movies['Title'] = movies['Title'].map(title_map2)
movies.head(1)

Unnamed: 0,MovieID,Title,Genres
0,1,"[3170, 826, 1538, 1538, 1538, 1538, 1538, 1538...",Animation|Children's|Comedy


In [21]:
# 将电影类型转换成数字字典
genres_set = set()
for val in movies['Genres'].str.split('|'):
    genres_set.update(val)
genres_set.add('<PAD>')
genres2int = {val:ii for ii,val in enumerate(genres_set)}

max_genre_length = max(genres2int.values()) # 最多有多少个类型，'<PAD>'也算一个
# 将电影类型转换成等长的数字列表，长度为18
genres_map = {val:[genres2int[row] for row in val.split('|')] for ii,val in enumerate(set(movies['Genres']))}
for key in genres_map:
#     print(genres_map[key])#[8, 9, 12]
    for cnt in range(max_genre_length - len(genres_map[key])):
        genres_map[key].insert(len(genres_map[key])+cnt,genres2int['<PAD>'])
#     print(genres_map[key]) #[8, 9, 12, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
movies['Genres'] = movies['Genres'].map(genres_map)
movies.head(1)

Unnamed: 0,MovieID,Title,Genres
0,1,"[3170, 826, 1538, 1538, 1538, 1538, 1538, 1538...","[3, 17, 16, 18, 18, 18, 18, 18, 18, 18, 18, 18..."


In [22]:
#读取评分数据：用户ID，电影ID，评分，时间戳
# 用户ID取值在1-6040
# 电影ID取值在1-3952
# 评分采用5星制：只有整数星级
# 时间采用以秒为单位，从1970-01-01 00:00:00（UTC 为标准的时间）开始算：当然这里我们用不到时间戳字段
# 每一个用户至少需要用20条评分
ratings_title = ['UserID','MovieID','Rating','timestamps']
ratings = pd.read_table('./datasets/ml-1m/ratings.dat',sep='::',header=None,names=ratings_title,engine='python')
ratings = ratings.filter(regex='UserID|MovieID|Rating')
ratings.head(100)

Unnamed: 0,UserID,MovieID,Rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
95,2,2490,3
96,2,1834,4
97,2,3471,5
98,2,589,4


In [23]:
# 下面要对数据进行一些预处理操作
# UserID,职业ID和MovieID不用变
# Age字段，转换成7个连续的数字表示0-6：每个代表一个年龄段
# 电影风格Genres字段要转换成数字：首先将电影风格中的每种风格对应的描述词语在One-Hot表示的字典中去查询出来，
# 每个风格单词就对应一个One-Hot表征向量，把所有风格的One-Hot表征向量相加就是这个电影的风格组合描述
# 电影名称的title字段处理方式和电影风格类似，（另外title中的年份需要去掉，年份不应该加到标题特征中）
# 电影风格和电影名称要使用同一个one-hot词典中表示成相同长度的表征向量，方便在神经网络中处理；空白部分用'<PAD>'对应的数字填充
def load_data():
    #================================================User数据处理相关================================================
    #读取User数据
    users_title = ['UserID','Gender','Age','JobID','Zip-code']
    users = pd.read_table('./datasets/ml-1m/users.dat',sep='::',header=None,names=users_title,engine='python')
    users = users.filter(regex = 'UserID|Gender|Age|JobID')
    users_orig = users.values
    
    # 改变User数据中的性别和年龄
    gender_map = {'F':0,'M':1}
    users['Gender'] = users['Gender'].map(gender_map)
    
    age_map = {val:ii for ii,val in enumerate(set(users['Age']))}
    users['Age'] = users['Age'].map(age_map)
    #     print(users.head(3))
    
    #================================================Movie数据处理相关================================================
    #读取Movie数据集
    movies_title = ['MovieID', 'Title', 'Genres']
    movies = pd.read_table('./datasets/ml-1m/movies.dat',sep='::',header=None,names=movies_title,engine='python')
    movies_orig = movies.values
    
    #将title中的年份去掉
    pattern = re.compile(r'(.*)\((\d+)\)$') # '(第一个字符串)((第二个字符串左右两边要有括号且在最后一个货号内))'
    
    title_map = {val:pattern.match(val).group(1) for ii,val in enumerate(set(movies['Title']))}
    movies['Title'] = movies['Title'].map(title_map)
    
    #将电影名称转成数字字典中的向量
    # 将title转换成数字字典，既用表征向量表示
    title_set = set()
    for val in movies['Title'].str.split(): # 将名字拆分成字符串数组：如[Toy, Story]；
    #     print(val) # 每一个val对应一个['Grumpier', 'Old', 'Men']
        title_set.update(val) # 将val中没有在title_set中的单词添加到title_set中
    title_set.add('<PAD>')
    # 将电影标题分割的词，转换成了字典的形式
    title2int = {val:ii for ii,val in enumerate(title_set)}#{'Takes': 0, 'Woke': 1,'Midnight': 2,...}
    
    # 将电影Title转成等长的数字列表，长度15
    title_count = 15
    # title循环中的val为：“Close Shave, A”  ,tile2int[row]为每一个key为‘Close’，‘Shave,’，‘A’时在title2int中的value(一个数字)
    # 最后的val是一组数字组成的数组
    title_map2 = {val:[title2int[row] for row in val.split()] for ii ,val in enumerate(set(movies['Title']))}

    for key in title_map2: # key就是这样的每一个"Close Shave, A","Return to Oz "电影的名称
        # print(title) # "Close Shave, A"
        # print(title_map2[key]) # [4066, 3272, 3331]
        for cnt in range(title_count-len(title_map2[key])):
            # print(cnt) # 0,1,2这样的数字
            #print(title2int['<PAD>']) #2412
            title_map2[key].insert(len(title_map2[key])+cnt,title2int['<PAD>'])
        # print(title_map2[key]) # [4066, 3272, 3331, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412]
        # 每一次循环就是将不到长度不到15的整型数组补全至15，不到的地方用title2int['<PAD>']=2412补上
    movies['Title'] = movies['Title'].map(title_map2)
    
    
    # 将电影类型转换成数字字典
    genres_set = set()
    for val in movies['Genres'].str.split('|'):
        genres_set.update(val)
    genres_set.add('<PAD>')
    genres2int = {val:ii for ii,val in enumerate(genres_set)}

    max_genre_length = max(genres2int.values()) # 最多有多少个类型，'<PAD>'也算一个
    # 将电影类型转换成等长的数字列表，长度为18
    genres_map = {val:[genres2int[row] for row in val.split('|')] for ii,val in enumerate(set(movies['Genres']))}
    for key in genres_map:
    #     print(genres_map[key])#[8, 9, 12]
        for cnt in range(max_genre_length - len(genres_map[key])):
            genres_map[key].insert(len(genres_map[key])+cnt,genres2int['<PAD>'])
    #     print(genres_map[key]) #[8, 9, 12, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
    movies['Genres'] = movies['Genres'].map(genres_map)
    
#     print(movies.head(1))
    
    #================================================评分数据处理相关================================================
    ratings_title = ['UserID','MovieID','ratings','timestamps'] # rating是作为训练输出的值
    ratings = pd.read_table('./datasets/ml-1m/ratings.dat',sep='::',header=None,names=ratings_title,engine='python')
    ratings = ratings.filter(regex='UserID|MovieID|ratings')
    
#     print(ratings.head(100))
    
    
    #================================================数据合并处理相关================================================
    #合并三个表
    data = pd.merge(pd.merge(ratings,users),movies)
    print(data.head(1))

    
    #将数据分成X和Y两张表
    target_fields = ['ratings']
    features_pd, targets_pd = data.drop(target_fields,axis=1),data[target_fields]
    
    features = features_pd.values
    targets_values = targets_pd.values
    
#     print(features.shape)
    
    #title_count：Title字段的长度（15）
    #title_set：Title文本的集合
    #genres2int：电影类型转数字的字典
    #features：是输入X
    #targets_values：是学习目标y
    #ratings：评分数据集的Pandas对象
    #users：用户数据集的Pandas对象
    #movies：电影数据的Pandas对象
    #data：三个数据集组合在一起的Pandas对象
    #movies_orig：没有做数据处理的原始电影数据
    #users_orig：没有做数据处理的原始用户数据
    return title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig


# 加载数据
title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig = load_data()

# 存入文件中
pickle.dump((title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig), open('./datasets/preprocess.p', 'wb'))

   UserID  MovieID  ratings  Gender  Age  JobID  \
0       1     1193        5       0    0     10   

                                               Title  \
0  [930, 4566, 5136, 3249, 4918, 3847, 1538, 1538...   

                                              Genres  
0  [12, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1...  


In [56]:
users.head()

Unnamed: 0,UserID,Gender,Age,JobID
0,1,0,0,10
1,2,1,5,16
2,3,1,6,15
3,4,1,2,7
4,5,1,6,20


In [57]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,"[2661, 2008, 2518, 2518, 2518, 2518, 2518, 251...","[9, 2, 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."
1,2,"[5148, 2518, 2518, 2518, 2518, 2518, 2518, 251...","[8, 2, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
2,3,"[5196, 4005, 1372, 2518, 2518, 2518, 2518, 251...","[10, 13, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3..."
3,4,"[3725, 2039, 921, 2518, 2518, 2518, 2518, 2518...","[10, 6, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."
4,5,"[1592, 4628, 2018, 4647, 2270, 3370, 2518, 251...","[10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."


In [51]:
movies.values[0]

array([1,
       list([2661, 2008, 2518, 2518, 2518, 2518, 2518, 2518, 2518, 2518, 2518, 2518, 2518, 2518, 2518]),
       list([9, 2, 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])],
      dtype=object)

In [None]:
#数据预处理工作已经完成，下面就用预处理完的数据进行建模

In [24]:
title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig = pickle.load(open('./datasets/preprocess.p', mode='rb'))

In [18]:
targets_values.shape

(1000209, 1)

In [15]:
data.head(100)#UserID	MovieID	ratings	Gender	Age	JobID	Title	Genres

Unnamed: 0,UserID,MovieID,ratings,Gender,Age,JobID,Title,Genres
0,1,1193,5,0,0,10,"[4876, 4503, 2025, 2343, 2092, 4977, 1614, 161...","[15, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,..."
1,2,1193,5,1,5,16,"[4876, 4503, 2025, 2343, 2092, 4977, 1614, 161...","[15, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,..."
2,12,1193,4,1,6,12,"[4876, 4503, 2025, 2343, 2092, 4977, 1614, 161...","[15, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,..."
3,15,1193,4,1,6,7,"[4876, 4503, 2025, 2343, 2092, 4977, 1614, 161...","[15, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,..."
4,17,1193,5,1,3,1,"[4876, 4503, 2025, 2343, 2092, 4977, 1614, 161...","[15, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,..."
...,...,...,...,...,...,...,...,...
95,329,1193,4,1,1,7,"[4876, 4503, 2025, 2343, 2092, 4977, 1614, 161...","[15, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,..."
96,331,1193,4,1,6,7,"[4876, 4503, 2025, 2343, 2092, 4977, 1614, 161...","[15, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,..."
97,332,1193,5,1,3,1,"[4876, 4503, 2025, 2343, 2092, 4977, 1614, 161...","[15, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,..."
98,333,1193,3,1,1,2,"[4876, 4503, 2025, 2343, 2092, 4977, 1614, 161...","[15, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,..."
