In [4]:
import pandas as pd
from IPython.display import display

def cast_id_columns(frame, columns, dtype='int64'):
    """Return a copy of ``frame`` with the ID ``columns`` cast to ``dtype``.

    Every table that takes part in a join is passed through this function so
    that the join keys have one explicit integer dtype on both sides.

    Args:
        frame: DataFrame holding the ID columns.
        columns: Names of the columns to cast.
        dtype: Target dtype; defaults to an explicit 64-bit integer.

    Returns:
        A new DataFrame; ``frame`` itself is not modified.

    Raises:
        ValueError: If a column holds values that cannot be represented as
            integers (for example missing values).
    """
    return frame.astype({column: dtype for column in columns})


def aggregate_tags(tags, sep=', '):
    """Collapse tag rows into one row per (userId, movieId) pair.

    A user may tag the same movie several times. The tags of each pair are
    joined into one string, and so are their timestamps, in the row order of
    ``tags``.

    Rows missing either ``tag`` or ``tag_timestamp`` are dropped as a unit
    *before* grouping. Dropping missing values per column instead would
    shorten one joined list but not the other, so the n-th tag would no
    longer correspond to the n-th timestamp.

    Args:
        tags: DataFrame with columns ``userId``, ``movieId``, ``tag`` and
            ``tag_timestamp``.
        sep: Separator placed between joined values.

    Returns:
        DataFrame with one row per (userId, movieId) and string columns
        ``tag`` and ``tag_timestamp``.
    """
    def join_values(values):
        return sep.join(values.astype(str))

    complete_rows = tags.dropna(subset=['tag', 'tag_timestamp'])
    return complete_rows.groupby(['userId', 'movieId'], as_index=False).agg({
        'tag': join_values,
        'tag_timestamp': join_values,
    })


def left_join_unique(left, right, on):
    """Left-join ``right`` onto ``left``, requiring unique keys in ``right``.

    Every row of ``left`` is kept. ``validate='many_to_one'`` makes pandas
    raise instead of silently duplicating ``left`` rows when a key occurs
    more than once in ``right``.

    Args:
        left: DataFrame whose rows must all survive the join.
        right: Lookup DataFrame; its ``on`` key(s) must be unique.
        on: Column name or list of column names to join on.

    Returns:
        The merged DataFrame, with as many rows as ``left``.

    Raises:
        pandas.errors.MergeError: If the key is duplicated in ``right``.
    """
    return pd.merge(left, right, on=on, how='left', validate='many_to_one')


# The guard is true inside a notebook kernel and when run as a script, so all
# the names below are still created as globals; it only keeps the file I/O from
# running when the helpers above are imported from elsewhere.
if __name__ == '__main__':
    # 1. Load the four CSV datasets.
    movies_df = pd.read_csv('movies.csv')
    links_df = pd.read_csv('links.csv')
    ratings_df = pd.read_csv('ratings.csv')
    tags_df = pd.read_csv('tags.csv')

    # Preview each dataset (optional).
    print("Movies 数据集样例：")
    display(movies_df.head())
    print("\nLinks 数据集样例：")
    display(links_df.head())
    print("\nRatings 数据集样例：")
    display(ratings_df.head())
    print("\nTags 数据集样例：")
    display(tags_df.head())

    # 2. Give the join keys the same integer dtype in ALL four datasets,
    #    including movies and links, which are joined on movieId as well.
    movies_df = cast_id_columns(movies_df, ['movieId'])
    links_df = cast_id_columns(links_df, ['movieId'])
    ratings_df = cast_id_columns(ratings_df, ['userId', 'movieId'])
    tags_df = cast_id_columns(tags_df, ['userId', 'movieId'])

    # 3. Rename the timestamp columns so they do not collide when merged.
    ratings_df = ratings_df.rename(columns={'timestamp': 'rating_timestamp'})
    tags_df = tags_df.rename(columns={'timestamp': 'tag_timestamp'})

    # 4. Merge movies with links on movieId; the left join keeps every movie.
    movies_links_df = left_join_unique(movies_df, links_df, 'movieId')
    print("\n合并 movies 与 links 后的数据样例：")
    display(movies_links_df.head())

    # 5. Attach the movie information to every rating.
    ratings_with_movies = left_join_unique(ratings_df, movies_links_df, 'movieId')
    print("\n合并 ratings 与 movies_links 后的数据样例：")
    display(ratings_with_movies.head())

    # 6. Reduce the tags to one row per (userId, movieId) pair.
    agg_tags_df = aggregate_tags(tags_df)
    print("\n聚合后的 tags 数据样例：")
    display(agg_tags_df.head())

    # 7. Attach the aggregated tags to the ratings on (userId, movieId).
    #    Ratings without any tag keep missing values in the tag columns.
    merged_dataset = left_join_unique(
        ratings_with_movies, agg_tags_df, ['userId', 'movieId']
    )
    print("\n最终合并后的数据集样例：")
    display(merged_dataset.head())

    # 8. Put the columns into the target order: rating fields, movie
    #    metadata, then the aggregated tag fields.
    desired_order = [
        'userId', 'movieId', 'rating', 'rating_timestamp', 'title', 'genres',
        'imdbId', 'tmdbId', 'tag', 'tag_timestamp',
    ]
    merged_dataset = merged_dataset[desired_order]
    print("\n调整列顺序后的合并数据集样例：")
    display(merged_dataset.head())

    # 9. Save the final merged dataset to a CSV file.
    merged_dataset.to_csv('merged_dataset.csv', index=False)
    print("\n合并后的数据集已保存到文件 merged_dataset.csv")


Movies 数据集样例：


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy



Links 数据集样例：


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0



Ratings 数据集样例：


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931



Tags 数据集样例：


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200



合并 movies 与 links 后的数据样例：


Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0



合并 ratings 与 movies_links 后的数据样例：


Unnamed: 0,userId,movieId,rating,rating_timestamp,title,genres,imdbId,tmdbId
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,113277,949.0
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,114369,807.0
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,114814,629.0



聚合后的 tags 数据样例：


Unnamed: 0,userId,movieId,tag,tag_timestamp
0,2,60756,"funny, Highly quotable, will ferrell","1445714994, 1445714996, 1445714992"
1,2,89774,"Boxing story, MMA, Tom Hardy","1445715207, 1445715200, 1445715205"
2,2,106782,"drugs, Leonardo DiCaprio, Martin Scorsese","1445715054, 1445715051, 1445715056"
3,7,48516,way too long,1169687325
4,18,431,"Al Pacino, gangster, mafia","1462138765, 1462138749, 1462138755"



最终合并后的数据集样例：


Unnamed: 0,userId,movieId,rating,rating_timestamp,title,genres,imdbId,tmdbId,tag,tag_timestamp
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,,
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0,,
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,113277,949.0,,
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,114369,807.0,,
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,114814,629.0,,



调整列顺序后的合并数据集样例：


Unnamed: 0,userId,movieId,rating,rating_timestamp,title,genres,imdbId,tmdbId,tag,tag_timestamp
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,,
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0,,
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,113277,949.0,,
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,114369,807.0,,
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,114814,629.0,,



合并后的数据集已保存到文件 merged_dataset.csv
