In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from collections import Counter
import tensorflow as tf

import os
import pickle
import re
from tensorflow.python.ops import math_ops

In [4]:
# 读取用户数据：UserID，性别，年龄，职业ID，邮编’
#年龄数字代表的是年龄段：1: "18岁以下"；18: "18-24"；25: "25-34"；35: "35-44"；45: "45-49"；50: "50-55"；56: "56+"
# 每个职业ID对应一种职业
# 这里我们用不到邮编数据
users_title = ['UserID','Gender','Age','JobID','Zip-code']
users = pd.read_table('./ml-1m/users.dat',sep='::',header=None,names=users_title,engine='python')
users = users.filter(regex='UserID|Gender|Age|JobID') 
users_orig = users.values # 这个是将users的Value值转换成numberArray的数据
users.head(3)

Unnamed: 0,UserID,Gender,Age,JobID
0,1,F,1,10
1,2,M,56,16
2,3,M,25,15


In [5]:
# 将Gender中的'F'变为0，'M'变为1
gender_map = {'F':0,'M':1}
users['Gender'] = users['Gender'].map(gender_map)
users['Gender']

0       0
1       1
2       1
3       1
4       1
       ..
6035    0
6036    0
6037    0
6038    0
6039    1
Name: Gender, Length: 6040, dtype: int64

In [89]:
set([18,23,56,47,56,27,18,23]) # 速出为{18, 23, 27, 47, 56}一个有序且不重复的数据集合
for ii ,val in enumerate({18, 23, 27, 47, 56}): # 会遍历每个元素，并且ii为此元素的位置，val为此元素的值
    print("位置为:%d的元素是%d"%(ii,val))

位置为:0的元素是47
位置为:1的元素是18
位置为:2的元素是23
位置为:3的元素是56
位置为:4的元素是27


In [68]:
for ii,val in enumerate(set(users['Age'])):
    if ii == 0:
        print(ii,"的值有：",val)
# 

0 的值有： 1


In [16]:
# 读取电影数据:电影ID，电影名称，电影风格
# MovieID是类别字段，Title是文本，Genres也是类别字段
movies_title = ['MovieID','Title','Genres']
movies = pd.read_table('./ml-1m/movies.dat',sep='::',header=None,names=movies_title,engine='python')
movies_orig = movies.values
movies.head(3)

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [17]:
#compile 函数用于编译正则表达式，生成一个正则表达式（ Pattern ）对象，供 match() 和 search() 这两个函数使用。
pattern = re.compile(r'^(.*)\((\d+)\)$') # 相当于把最后一个括号前的字符串当做group(1),最后一个括号里面的数字当做group(2)
m = pattern.match("(1995)Toy Story (1995)") # m的值为<re.Match object; span=(0, 22), match='(1995)Toy Story (1995)'>
m.group(0) # '(1995)Toy Story (1995)'
m.group(1) # '(1995)Toy Story '
m.group(2) # '1995'

'1995'

In [18]:
#将Movies中Title的年份去掉
pattern = re.compile(r'(.*)\((\d+)\)$') # '(第一个字符串)((第二个字符串左右两边要有括号且在最后一个货号内))'
title_map1 = {val:pattern.match(val).group(1) for ii,val in enumerate(set(movies['Title']))}
movies['Title'] = movies['Title'].map(title_map1)
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story,Animation|Children's|Comedy
1,2,Jumanji,Adventure|Children's|Fantasy
2,3,Grumpier Old Men,Comedy|Romance
3,4,Waiting to Exhale,Comedy|Drama
4,5,Father of the Bride Part II,Comedy


In [5]:
# 将title转换成数字字典，既用表征向量表示
title_set = set()
for val in movies['Title'].str.split(): # 将名字拆分成字符串数组：如[Toy, Story]；
#     print(val) # 每一个val对应一个['Grumpier', 'Old', 'Men']
    title_set.update(val) # 将val中没有在title_set中的单词添加到title_set中
title_set.add('<PAD>')
# 将电影标题分割的词，转换成了字典的形式
title2int = {val:ii for ii,val in enumerate(title_set)}#{'Takes': 0, 'Woke': 1,'Midnight': 2,...}
title2int

{'Mulan': 0,
 'Endgame': 1,
 'Terrorist,': 2,
 'Cube': 3,
 "Lilian's": 4,
 '(Longxiong': 5,
 'Aliens': 6,
 '34th': 7,
 'Being,': 8,
 'Wars:': 9,
 'Left': 10,
 'Jr.': 11,
 'Thirty-Two': 12,
 'Joyriders,': 13,
 'Cousin': 14,
 'Hard': 15,
 'ce': 16,
 'Dwarf,': 17,
 'Balto': 18,
 '(Un': 19,
 'Richer': 20,
 'Bambi': 21,
 'Fletch': 22,
 'Falcon,': 23,
 'al': 24,
 'Super': 25,
 'Stones': 26,
 'Jones': 27,
 'Romancing': 28,
 'Games': 29,
 'marchait': 30,
 'Stein)': 31,
 'Harvest': 32,
 'Presents:': 33,
 'Slingshot,': 34,
 'Gentleman,': 35,
 'Breathless': 36,
 'Mifune': 37,
 'Desire': 38,
 'A.E.': 39,
 'Annie': 40,
 'Wore': 41,
 'Ennui,': 42,
 'Hitch-Hiker,': 43,
 'Macao': 44,
 'Only': 45,
 'Conflict,': 46,
 'Defying': 47,
 'Limelight': 48,
 'Singles': 49,
 'Fearless': 50,
 'Illusion': 51,
 'JLG/JLG': 52,
 '(Pret-A-Porter)': 53,
 'Simon': 54,
 '2000': 55,
 'Bob?': 56,
 'Somewhere': 57,
 'League:': 58,
 'mir': 59,
 "Smilla's": 60,
 'Catfish': 61,
 '(Se7en)': 62,
 'chat)': 63,
 'Festival': 64,
 '

In [6]:
for ii ,val in enumerate(set(movies['Title'])):
    print(val)
# 输出结果如下
#Close Shave, A 
# Return to Oz 
# Funhouse, The 
# Children of Paradise (Les enfants du paradis) 
# What Lies Beneath 

Muse, The 
Batman & Robin 
Betrayed 
Repossessed 
Mediterraneo 
Down in the Delta 
Chicken Run 
Fletch Lives 
Tickle in the Heart, A 
Bloody Child, The 
Newton Boys, The 
Across the Sea of Time 
Plunkett & MaCleane 
Pump Up the Volume 
Welcome to the Dollhouse 
General's Daughter, The 
Nadja 
Bliss 
Of Human Bondage 
Above the Rim 
Cowboy Way, The 
Strange Days 
Outbreak 
Amadeus 
Kronos 
Odd Couple, The 
Big Blue, The (Le Grand Bleu) 
Telling You 
Pelican Brief, The 
Cold Fever (� k�ldum klaka) 
House 
Hostile Intentions 
Three Wishes 
Trading Places 
Halloween 
Boiling Point 
Rosie 
Nutty Professor, The 
Eyes of Laura Mars 
Three Colors: White 
Star Trek: Insurrection 
Day the Earth Stood Still, The 
Searchers, The 
Scout, The 
Little City 
Hi-Lo Country, The 
Tom Jones 
Night Tide 
Duets 
Big Bang Theory, The 
Gridlock'd 
Star Trek: The Motion Picture 
Vermin 
Love and Basketball 
Ride with the Devil 
Village of the Damned 
Only You 
Wirey Spindell 
Burnt Offerings 
My Name Is Joe 


Beautiful 
Ready to Wear (Pret-A-Porter) 
Man of Her Dreams 
Them! 
Conquest of the Planet of the Apes 
Better Living 
Cruel Intentions 
Messenger: The Story of Joan of Arc, The 
Chinatown 
War at Home, The 
Drunks 
Spiders, The (Die Spinnen, 1. Teil: Der Goldene See) 
Cat People 
Dear God 
Baby-Sitters Club, The 
Gabbeh 
Clueless 
Great Race, The 
Indiana Jones and the Temple of Doom 
Human Traffic 
Blackbeard's Ghost 
Stonewall 
Great Day in Harlem, A 
Alligator 
Invasion of the Body Snatchers 
Meet the Parents 
Agnes of God 
Live Flesh 
Hillbillys in a Haunted House 
Stepford Wives, The 
Friday 
Champagne 
Color of Night 
Striptease 
Wallace & Gromit: The Best of Aardman Animation 
Crows and Sparrows 
For Whom the Bell Tolls 
Band Wagon, The 
Alice in Wonderland 
Heaven 
Man of the Century 
Clay Pigeons 
Kissing a Fool 
In the Heat of the Night 
Santitos 
Color of Paradise, The (Rang-e Khoda) 
Blade 
Bear, The 
Fifth Element, The 
Hellhounds on My Trail 
Dersu Uzala 
Jennifer 8 
Esc

In [19]:
for row in "Children of Paradise (Les enfants du paradis)".split():
    print(row,":",title2int[row])

Children : 3788
of : 4180
Paradise : 4440
(Les : 3448
enfants : 1501
du : 4838
paradis) : 3185


In [7]:
# 将电影Title转成等长的数字列表，长度15
title_count = 15
# title循环中的val为：“Close Shave, A”  ,tile2int[row]为每一个key为‘Close’，‘Shave,’，‘A’时在title2int中的value(一个数字)
# 最后的val是一组数字组成的数组
title_map2 = {val:[title2int[row] for row in val.split()] for ii ,val in enumerate(set(movies['Title']))}

for key in title_map2: # key就是这样的每一个"Close Shave, A","Return to Oz "电影的名称
    # print(title) # "Close Shave, A"
    # print(title_map2[key]) # [4066, 3272, 3331]
    for cnt in range(title_count-len(title_map2[key])):
        # print(cnt) # 0,1,2这样的数字
        #print(title2int['<PAD>']) #2412
        title_map2[key].insert(len(title_map2[key])+cnt,title2int['<PAD>'])
    # print(title_map2[key]) # [4066, 3272, 3331, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412]
    # 每一次循环就是将不到长度不到15的整型数组补全至15，不到的地方用title2int['<PAD>']=2412补上

movies['Title'] = movies['Title'].map(title_map2)
movies.head(1)

Unnamed: 0,MovieID,Title,Genres
0,1,"[2661, 2008, 2518, 2518, 2518, 2518, 2518, 251...",Animation|Children's|Comedy


In [19]:
# 将电影类型转换成数字字典
genres_set = set()
for val in movies['Genres'].str.split('|'):
    genres_set.update(val)
genres_set.add('<PAD>')
genres2int = {val:ii for ii,val in enumerate(genres_set)}

max_genre_length = max(genres2int.values()) # 最多有多少个类型，'<PAD>'也算一个
# 将电影类型转换成等长的数字列表，长度为18
genres_map = {val:[genres2int[row] for row in val.split('|')] for ii,val in enumerate(set(movies['Genres']))}
for key in genres_map:
#     print(genres_map[key])#[8, 9, 12]
    for cnt in range(max_genre_length - len(genres_map[key])):
        genres_map[key].insert(len(genres_map[key])+cnt,genres2int['<PAD>'])
#     print(genres_map[key]) #[8, 9, 12, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
movies['Genres'] = movies['Genres'].map(genres_map)
movies.head(1)

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story,"[9, 2, 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."


In [23]:
#读取评分数据：用户ID，电影ID，评分，时间戳
# 用户ID取值在1-6040
# 电影ID取值在1-3952
# 评分采用5星制：只有整数星级
# 时间采用以秒为单位，从1970-01-01 00:00:00（UTC 为标准的时间）开始算：当然这里我们用不到时间戳字段
# 每一个用户至少需要用20条评分
ratings_title = ['UserID','MovieID','Rating','timestamps']
ratings = pd.read_table('./ml-1m/ratings.dat',sep='::',header=None,names=ratings_title,engine='python')
ratings = ratings.filter(regex='UserID|MovieID|Rating')
ratings.head(100)

Unnamed: 0,UserID,MovieID,Rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
95,2,2490,3
96,2,1834,4
97,2,3471,5
98,2,589,4


In [36]:
# 下面要对数据进行一些预处理操作
# UserID,职业ID和MovieID不用变
# Age字段，转换成7个连续的数字表示0-6：每个代表一个年龄段
# 电影风格Genres字段要转换成数字：首先将电影风格中的每种风格对应的描述词语在One-Hot表示的字典中去查询出来，
# 每个风格单词就对应一个One-Hot表征向量，把所有风格的One-Hot表征向量相加就是这个电影的风格组合描述
# 电影名称的title字段处理方式和电影风格类似，（另外title中的年份需要去掉，年份不应该加到标题特征中）
# 电影风格和电影名称要使用同一个one-hot词典中表示成相同长度的表征向量，方便在神经网络中处理；空白部分用'<PAD>'对应的数字填充
def load_data():
    #================================================User数据处理相关================================================
    #读取User数据
    users_title = ['UserID','Gender','Age','JobID','Zip-code']
    users = pd.read_table('./ml-1m/users.dat',sep='::',header=None,names=users_title,engine='python')
    users = users.filter(regex = 'UserID|Gender|Age|JobID')
    users_orig = users.values
    
    # 改变User数据中的性别和年龄
    gender_map = {'F':0,'M':1}
    users['Gender'] = users['Gender'].map(gender_map)
    
    age_map = {val:ii for ii,val in enumerate(set(users['Age']))}
    users['Age'] = users['Age'].map(age_map)
    #     print(users.head(3))
    
    #================================================Movie数据处理相关================================================
    #读取Movie数据集
    movies_title = ['MovieID', 'Title', 'Genres']
    movies = pd.read_table('./ml-1m/movies.dat',sep='::',header=None,names=movies_title,engine='python')
    movies_orig = movies.values
    
    #将title中的年份去掉
    pattern = re.compile(r'(.*)\((\d+)\)$') # '(第一个字符串)((第二个字符串左右两边要有括号且在最后一个货号内))'
    
    title_map = {val:pattern.match(val).group(1) for ii,val in enumerate(set(movies['Title']))}
    movies['Title'] = movies['Title'].map(title_map)
    
    #将电影名称转成数字字典中的向量
    # 将title转换成数字字典，既用表征向量表示
    title_set = set()
    for val in movies['Title'].str.split(): # 将名字拆分成字符串数组：如[Toy, Story]；
    #     print(val) # 每一个val对应一个['Grumpier', 'Old', 'Men']
        title_set.update(val) # 将val中没有在title_set中的单词添加到title_set中
    title_set.add('<PAD>')
    # 将电影标题分割的词，转换成了字典的形式
    title2int = {val:ii for ii,val in enumerate(title_set)}#{'Takes': 0, 'Woke': 1,'Midnight': 2,...}
    
    # 将电影Title转成等长的数字列表，长度15
    title_count = 15
    # title循环中的val为：“Close Shave, A”  ,tile2int[row]为每一个key为‘Close’，‘Shave,’，‘A’时在title2int中的value(一个数字)
    # 最后的val是一组数字组成的数组
    title_map2 = {val:[title2int[row] for row in val.split()] for ii ,val in enumerate(set(movies['Title']))}

    for key in title_map2: # key就是这样的每一个"Close Shave, A","Return to Oz "电影的名称
        # print(title) # "Close Shave, A"
        # print(title_map2[key]) # [4066, 3272, 3331]
        for cnt in range(title_count-len(title_map2[key])):
            # print(cnt) # 0,1,2这样的数字
            #print(title2int['<PAD>']) #2412
            title_map2[key].insert(len(title_map2[key])+cnt,title2int['<PAD>'])
        # print(title_map2[key]) # [4066, 3272, 3331, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412, 2412]
        # 每一次循环就是将不到长度不到15的整型数组补全至15，不到的地方用title2int['<PAD>']=2412补上
    movies['Title'] = movies['Title'].map(title_map2)
    
    
    # 将电影类型转换成数字字典
    genres_set = set()
    for val in movies['Genres'].str.split('|'):
        genres_set.update(val)
    genres_set.add('<PAD>')
    genres2int = {val:ii for ii,val in enumerate(genres_set)}

    max_genre_length = max(genres2int.values()) # 最多有多少个类型，'<PAD>'也算一个
    # 将电影类型转换成等长的数字列表，长度为18
    genres_map = {val:[genres2int[row] for row in val.split('|')] for ii,val in enumerate(set(movies['Genres']))}
    for key in genres_map:
    #     print(genres_map[key])#[8, 9, 12]
        for cnt in range(max_genre_length - len(genres_map[key])):
            genres_map[key].insert(len(genres_map[key])+cnt,genres2int['<PAD>'])
    #     print(genres_map[key]) #[8, 9, 12, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
    movies['Genres'] = movies['Genres'].map(genres_map)
    
#     print(movies.head(1))
    
    #================================================评分数据处理相关================================================
    ratings_title = ['UserID','MovieID','ratings','timestamps'] # rating是作为训练输出的值
    ratings = pd.read_table('./ml-1m/ratings.dat',sep='::',header=None,names=ratings_title,engine='python')
    ratings = ratings.filter(regex='UserID|MovieID|ratings')
    
#     print(ratings.head(100))
    
    
    #================================================数据合并处理相关================================================
    #合并三个表
    data = pd.merge(pd.merge(ratings,users),movies)
    print(data.head(1))

    
    #将数据分成X和Y两张表
    target_fields = ['ratings']
    features_pd, targets_pd = data.drop(target_fields,axis=1),data[target_fields]
    
    features = features_pd.values
    targets_values = targets_pd.values
    
    print(features.shape)
    
    #title_count：Title字段的长度（15）
    #title_set：Title文本的集合
    #genres2int：电影类型转数字的字典
    #features：是输入X
    #targets_values：是学习目标y
    #ratings：评分数据集的Pandas对象
    #users：用户数据集的Pandas对象
    #movies：电影数据的Pandas对象
    #data：三个数据集组合在一起的Pandas对象
    #movies_orig：没有做数据处理的原始电影数据
    #users_orig：没有做数据处理的原始用户数据
    return title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig


load_data()

   UserID  MovieID  ratings  Gender  Age  JobID  \
0       1     1193        5       0    0     10   

                                               Title  \
0  [4236, 185, 4754, 2018, 4983, 3023, 2518, 2518...   

                                              Genres  
0  [6, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...  
(1000209, 7)
