In [1]:

import os
import pandas as pd

# dataset path
dir_path = r'E:\DataSet\DataSet\RecommendationSystem\MovieLens\ml-1m'
user_path = os.path.join(dir_path, 'users.dat')
movies_path = os.path.join(dir_path, 'movies.dat')
ratings_path = os.path.join(dir_path, 'ratings.dat')


Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [None]:
# load user dataset
user_columns = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']  # 用户ID，性别、年龄、职业ID、邮编
user_pd = pd.read_csv(user_path, sep='::', header=None, names=user_columns, engine='python')
user_pd.head()

In [3]:
# load movie dataset

movie_columns = ['MovieID', 'Title', 'Genres']  # 电影ID， 电影名， 电影分类
movie_pd = pd.read_csv(movies_path, sep='::', header=None, names=movie_columns, engine='python')
movie_pd.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# load rating dataset
ratings_columns = ['UserID', 'MovieID', 'rating', 'timestamps']
rating_pd = pd.read_csv(ratings_path, sep='::', header=None, names=ratings_columns, engine='python')
rating_pd.head()

Unnamed: 0,UserID,MovieID,rating,timestamps
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
# explore user data
user_gender = list(user_pd['Gender'].unique())
user_gender

['F', 'M']

In [6]:
user_age = list(user_pd['Age'].unique())
user_age

[1, 56, 25, 45, 50, 35, 18]

In [7]:
# user data preprocess
gender_map  = {'F': 0, 'M': 1}
age_map = {value: index for index, value in enumerate(list(user_pd['Age'].unique()))}
# age_map
# {1: 0, 56: 1, 25: 2, 45: 3, 50: 4, 35: 5, 18: 6

{1: 0, 56: 1, 25: 2, 45: 3, 50: 4, 35: 5, 18: 6}

In [8]:
user_pd['Gender'] = user_pd['Gender'].map(gender_map)
user_pd['Age'] = user_pd['Age'].map(age_map)

user_pd.head()


Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,1,0,10,48067
1,2,0,1,16,70072
2,3,0,2,15,55117
3,4,0,3,7,2460
4,5,0,2,20,55455


In [17]:
import re

pattern = re.compile(r'^(.*)\((\d+)\)$')
# group(0): 原始数据 group(1) :第一个匹配的数据
title_map = {val: pattern.match(val).group(1) for index, val in enumerate(set(movie_pd['Title']))}
movie_pd['Title'] = movie_pd['Title'].map(title_map)
# print(pattern.match('Grumpier Old Men (1995)').group(1))
# 电影类型处理成字典








AttributeError: 'NoneType' object has no attribute 'group'

In [24]:
genres_set = set()
for val in movie_pd['Genres'].str.split('|'):
    genres_set.update(val)

genres_set.add('<PAD>')  # 所有类型字段的集合
genres2int = {val: index for index, val in enumerate(genres_set)}

print('单一电影类型总数', len(genres2int.keys()))
# 19

#将电影类型转成等长数字列表，长度是18
genres_map = {value: [genres2int[row] for row in value.split('|')] for index,  value in enumerate(set(movie_pd['Genres']))}

print('电影类型总数',len(list(genres_map.keys())))
# 301

for key in genres_map:
    for cnt in range(max(genres2int.values()) - len(genres_map[key])):
        # genres_map[key]是一个数组，用insert在指定位置插入值（用<PAD>对应的数字填充）类似于独热编码
        genres_map[key].insert(len(genres_map[key]) + cnt, genres2int['<PAD>'])


单一电影类型总数 19
电影类型总数 301
Action|Drama|Sci-Fi|Thriller [17, 5, 1, 9, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
Comedy|Film-Noir|Thriller [15, 0, 9, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
Adventure|Animation|Film-Noir [12, 2, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
Action|Romance|Sci-Fi [17, 18, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
Musical|Romance|War [10, 18, 14, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]


In [30]:

title_set = set()
for val in movie_pd['Title'].str.split():  # 空格分隔符
    title_set.update(val)

title_set.add('<PAD>')

#将电影名称转成等长数字列表，长度是18
title2int = {val: index for index, val in enumerate(title_set)}

# Buddy  :  0
# Walk  :  1
# Theater  :  2
# Instinct  :  3

title_map = {val: [title2int[row] for row in val.split()] for index, val in enumerate(set(movie_pd['Title']))}

title_count = 15
for key in title_map:
    for cnt in range(title_count - len(title_map[key])):
        title_map[key].insert(len(title_map[key]) + cnt, title2int['<PAD>'])

# Big Chill, The   :  [721, 1806, 1305, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972]
# Blame It on Rio   :  [2355, 123, 784, 4818, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972]
# Waterboy, The   :  [4670, 1305, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972]
# Miss Julie   :  [3912, 126, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972]
# Shakes the Clown   :  [2700, 2641, 421, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972, 4972]

movie_pd['Title'] = movie_pd['Title'].map(title_map)

In [34]:

# 合并三张表
data = pd.merge(pd.merge(rating_pd, user_pd), movie_pd)
data.columns
target_fields = ['rating']
features_pd, targets_pd = data.drop(target_fields, axis=1), data[target_fields]

print(features_pd.head())
print(targets_pd.head())

   UserID  MovieID  timestamps  Gender  Age  Occupation Zip-code  \
0       1     1193   978300760       1    0          10    48067   
1       2     1193   978298413       0    1          16    70072   
2      12     1193   978220179       0    2          12    32793   
3      15     1193   978199279       0    2           7    22903   
4      17     1193   978158471       0    4           1    95350   

                                               Title Genres  
0  [3881, 5057, 1696, 2641, 1292, 2400, 4972, 497...  Drama  
1  [3881, 5057, 1696, 2641, 1292, 2400, 4972, 497...  Drama  
2  [3881, 5057, 1696, 2641, 1292, 2400, 4972, 497...  Drama  
3  [3881, 5057, 1696, 2641, 1292, 2400, 4972, 497...  Drama  
4  [3881, 5057, 1696, 2641, 1292, 2400, 4972, 497...  Drama  
   rating
0       5
1       5
2       4
3       4
4       5


In [36]:
print(user_pd.head())
movie_pd['Genres'] = movie_pd['Genres'].map(genres_map)
print(movie_pd.head())



   UserID  Gender  Age  Occupation Zip-code
0       1       1    0          10    48067
1       2       0    1          16    70072
2       3       0    2          15    55117
3       4       0    3           7    02460
4       5       0    2          20    55455
   MovieID                                              Title  \
0        1  [1963, 865, 4972, 4972, 4972, 4972, 4972, 4972...   
1        2  [3701, 4972, 4972, 4972, 4972, 4972, 4972, 497...   
2        3  [624, 3415, 1092, 4972, 4972, 4972, 4972, 4972...   
3        4  [441, 640, 3621, 4972, 4972, 4972, 4972, 4972,...   
4        5  [3928, 2025, 2641, 3730, 4210, 3401, 4972, 497...   

                                              Genres  
0  [2, 16, 15, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6...  
1  [12, 16, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6...  
2  [15, 18, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6...  
3  [15, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,...  
4  [15, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,...  
