In [1]:
import pandas as pd

In [2]:
# Basic user information, read from ./ml-1m/users.dat.
# The file has no header row, so column names are supplied explicitly.
# '::' is a multi-character separator, which requires the Python parser engine.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
user_read_opts = dict(sep='::', header=None, names=unames, engine='python')
user_df = pd.read_csv('./ml-1m/users.dat', **user_read_opts)

In [2]:
# Movie information, read from ./ml-1m/movies.dat.
# The file has no header row and is decoded as ISO-8859-1; fields are
# separated by '::', which requires the Python parser engine.
mnames = ['movie_id', 'title', 'genres']
movie_read_opts = dict(
    sep='::',
    header=None,
    names=mnames,
    engine='python',
    encoding='ISO-8859-1',
)
movies_df = pd.read_csv('./ml-1m/movies.dat', **movie_read_opts)

In [38]:
# Rating information, read from ./ml-1m/ratings.dat (no header row,
# '::'-separated, parsed with the Python engine).
# NOTE(review): the third column is labelled 'imdbId', but the values shown
# below (3, 4, 5, ...) look like the rating score rather than an IMDb id.
# Confirm before renaming, because this name is carried into the exported CSV.
rnames = ['user_id', 'movie_id', 'imdbId', 'timestamp']
rating_read_opts = dict(sep='::', header=None, engine='python', names=rnames)
ratings_df = pd.read_csv('./ml-1m/ratings.dat', **rating_read_opts)

In [6]:
import re

# Matches "<name>(<digits>)" anchored at both ends; group 1 is everything
# before the final parenthesised number, group 2 is the number itself.
patter = re.compile(r'^(.*)\((\d+)\)$')


def _strip_year(raw_title):
    """Return ``raw_title`` without its trailing "(year)" suffix.

    Values that are not strings, or that do not end in a parenthesised
    number, are returned unchanged. This keeps the cell safe to re-run
    after the titles have already been cleaned (the original raised
    AttributeError on the second run because ``match`` returned None).
    """
    if not isinstance(raw_title, str):
        return raw_title
    matched = patter.match(raw_title)
    if matched is None:
        return raw_title
    # Drop the whitespace that separated the name from "(year)".
    return matched.group(1).rstrip()


# Build the lookup once per distinct title, then apply it to the column.
title = {val: _strip_year(val) for val in set(movies_df['title'])}
movies_df['title'] = movies_df['title'].map(title)
movies_df.head()


Unnamed: 0,movie_id,title,genres
0,1,Toy Story,Animation|Children's|Comedy
1,2,Jumanji,Adventure|Children's|Fantasy
2,3,Grumpier Old Men,Comedy|Romance
3,4,Waiting to Exhale,Comedy|Drama
4,5,Father of the Bride Part II,Comedy


In [39]:
# Interpret the raw 'timestamp' values as seconds (unit='s') and store the
# resulting datetimes back into the same column.
rated_at = pd.to_datetime(ratings_df['timestamp'], unit='s')
ratings_df['timestamp'] = rated_at
ratings_df.head()


Unnamed: 0,user_id,movie_id,imdbId,timestamp
0,1,1193,5,2000-12-31 22:12:40
1,1,661,3,2000-12-31 22:35:09
2,1,914,3,2000-12-31 22:32:48
3,1,3408,4,2000-12-31 22:04:35
4,1,2355,5,2001-01-06 23:38:11


In [41]:
# Rename the converted column in place: 'timestamp' becomes 'time'.
column_renames = {'timestamp': 'time'}
ratings_df.rename(column_renames, axis='columns', inplace=True)
ratings_df.tail()


Unnamed: 0,user_id,movie_id,imdbId,time
1000204,6040,1091,1,2000-04-26 02:35:41
1000205,6040,1094,5,2000-04-25 23:21:27
1000206,6040,562,5,2000-04-25 23:19:06
1000207,6040,1096,4,2000-04-26 02:20:48
1000208,6040,1097,4,2000-04-26 02:19:29


In [43]:
import datetime

# Parse 'time' as datetimes first (this also lets the cell be re-run once the
# column already holds strings), then keep only the date part as a
# 'YYYY-MM-DD' string. After this step the column holds strings, not datetimes.
parsed_time = pd.to_datetime(ratings_df['time'])
ratings_df['time'] = parsed_time.dt.strftime('%Y-%m-%d')

print(ratings_df.head())

   user_id  movie_id  imdbId        time
0        1      1193       5  2000-12-31
1        1       661       3  2000-12-31
2        1       914       3  2000-12-31
3        1      3408       4  2000-12-31
4        1      2355       5  2001-01-06


In [45]:
# 子数据合并
movies_ratings_df = pd.merge(ratings_df,movies_df,on='movie_id')
movies_ratings_df.head()


Unnamed: 0,user_id,movie_id,imdbId,time,title,genres
0,1,1193,5,2000-12-31,One Flew Over the Cuckoo's Nest,Drama
1,2,1193,5,2000-12-31,One Flew Over the Cuckoo's Nest,Drama
2,12,1193,4,2000-12-30,One Flew Over the Cuckoo's Nest,Drama
3,15,1193,4,2000-12-30,One Flew Over the Cuckoo's Nest,Drama
4,17,1193,5,2000-12-30,One Flew Over the Cuckoo's Nest,Drama


In [46]:
# Build the fully denormalised table: ratings + movie details + user details.
# The join keys are named explicitly (matching the ratings/movies merge above)
# instead of relying on whichever column names the frames happen to share.
# validate='many_to_one' makes pandas raise if a lookup table contains a
# duplicated id, which would otherwise silently multiply rating rows.
movies_ratings_user_df = (
    ratings_df
    .merge(movies_df, on='movie_id', validate='many_to_one')
    .merge(user_df, on='user_id', validate='many_to_one')
)
movies_ratings_user_df.head()


Unnamed: 0,user_id,movie_id,imdbId,time,title,genres,gender,age,occupation,zip
0,1,1193,5,2000-12-31,One Flew Over the Cuckoo's Nest,Drama,F,1,10,48067
1,1,661,3,2000-12-31,James and the Giant Peach,Animation|Children's|Musical,F,1,10,48067
2,1,914,3,2000-12-31,My Fair Lady,Musical|Romance,F,1,10,48067
3,1,3408,4,2000-12-31,Erin Brockovich,Drama,F,1,10,48067
4,1,2355,5,2001-01-06,"Bug's Life, A",Animation|Children's|Comedy,F,1,10,48067


In [47]:
# Save the merged DataFrame to a CSV file, without writing the row index.
csv_path = 'movies_ratings_user.csv'
movies_ratings_user_df.to_csv(csv_path, index=False)

In [49]:
# Print every column name alongside its dtype, one line per column.
for column_name, column_dtype in movies_ratings_user_df.dtypes.items():
    print(f"列名: {column_name}, 数据类型: {column_dtype}")

列名: user_id, 数据类型: int64
列名: movie_id, 数据类型: int64
列名: imdbId, 数据类型: int64
列名: time, 数据类型: object
列名: title, 数据类型: object
列名: genres, 数据类型: object
列名: gender, 数据类型: object
列名: age, 数据类型: int64
列名: occupation, 数据类型: int64
列名: zip, 数据类型: object
