In [2]:
import numpy as np
import pandas as pd

In [16]:
# Load the user table from ml-1m/users.dat.
# Column names are passed explicitly via `names`, so the first line of the
# file is read as data rather than as a header.
users = pd.read_csv(
    "ml-1m/users.dat",
    names=["user_id", "sex", "age_group", "occupation", "zip_code"],
    sep="::",  # fields are separated by a double colon
    engine="python",  # parse with the Python engine
    encoding="ISO-8859-1",  # decode the file as ISO-8859-1
)
users.head()

Unnamed: 0,user_id,sex,age_group,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [17]:
# Load the rating events from ml-1m/ratings.dat.
# Column names are passed explicitly via `names`, so the first line of the
# file is read as data rather than as a header.
ratings = pd.read_csv(
    "ml-1m/ratings.dat",
    names=["user_id", "movie_id", "rating", "unix_timestamp"],
    sep="::",  # fields are separated by a double colon
    engine="python",  # parse with the Python engine
    encoding="ISO-8859-1",  # decode the file as ISO-8859-1
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [18]:
# Load the movie catalogue from ml-1m/movies.dat.
# Column names are passed explicitly via `names`, so the first line of the
# file is read as data rather than as a header.
movies = pd.read_csv(
    "ml-1m/movies.dat",
    names=["movie_id", "title", "genres"],
    sep="::",  # fields are separated by a double colon
    engine="python",  # parse with the Python engine
    encoding="ISO-8859-1",  # decode the file as ISO-8859-1
)
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [19]:
# Turn the raw keys into prefixed string tokens so that values coming from
# different columns can be told apart (e.g. user 1 becomes "user_1").

# User table: id, age group and occupation.
users["user_id"] = users["user_id"].map(lambda uid: f"user_{uid}")
users["age_group"] = users["age_group"].map(lambda age: f"group_{age}")
users["occupation"] = users["occupation"].map(lambda job: f"occupation_{job}")

# Movie table: id.
movies["movie_id"] = movies["movie_id"].map(lambda mid: f"movie_{mid}")

# Rating table: both foreign keys get the same prefixes as above so they
# still match the user and movie tables.
ratings["movie_id"] = ratings["movie_id"].map(lambda mid: f"movie_{mid}")
ratings["user_id"] = ratings["user_id"].map(lambda uid: f"user_{uid}")

# Store the rating values as floats.
ratings["rating"] = ratings["rating"].astype(float)

# Genre labels that each get their own indicator column on `movies`.
genres = [
    "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
    "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]

# Add one 0/1 column per genre: 1 when the label appears among the
# "|"-separated entries of the movie's "genres" string, otherwise 0.
for genre in genres:
    movies[genre] = movies["genres"].apply(
        # `wanted` pins the current genre for this pass of the loop.
        lambda genre_string, wanted=genre: int(wanted in genre_string.split("|"))
    )

# Order every rating by its timestamp, then bucket the rows per user; rows
# inside a group keep the sorted (chronological) order.
ratings_group = ratings.sort_values(by="unix_timestamp").groupby("user_id")

# Build one record per user holding three parallel lists:
# the movie ids, the ratings and the timestamps of that user's rows.
ratings_data = pd.DataFrame(
    {
        "user_id": list(ratings_group.groups),
        "movie_ids": ratings_group["movie_id"].apply(list).tolist(),
        "ratings": ratings_group["rating"].apply(list).tolist(),
        "timestamps": ratings_group["unix_timestamp"].apply(list).tolist(),
    }
)
ratings_data.head()

Unnamed: 0,user_id,movie_ids,ratings,timestamps
0,user_1,"[movie_3186, movie_1721, movie_1270, movie_102...","[4.0, 4.0, 5.0, 5.0, 3.0, 5.0, 4.0, 4.0, 5.0, ...","[978300019, 978300055, 978300055, 978300055, 9..."
1,user_10,"[movie_597, movie_858, movie_743, movie_1210, ...","[4.0, 3.0, 3.0, 4.0, 4.0, 5.0, 5.0, 5.0, 3.0, ...","[978224375, 978224375, 978224375, 978224400, 9..."
2,user_100,"[movie_260, movie_1676, movie_1198, movie_541,...","[4.0, 3.0, 4.0, 3.0, 4.0, 3.0, 1.0, 1.0, 5.0, ...","[977593595, 977593595, 977593607, 977593624, 9..."
3,user_1000,"[movie_971, movie_260, movie_2990, movie_2973,...","[4.0, 5.0, 4.0, 3.0, 5.0, 5.0, 2.0, 5.0, 5.0, ...","[975040566, 975040566, 975040566, 975040629, 9..."
4,user_1001,"[movie_1198, movie_1617, movie_2885, movie_390...","[4.0, 4.0, 4.0, 2.0, 2.0, 1.0, 4.0, 5.0, 5.0, ...","[975039591, 975039702, 975039702, 975039898, 9..."


In [20]:
# Window size and step passed to create_sequences when slicing each user's lists
sequence_length = 4
step_size = 2

# Sliding-window helper: cut a list of values into fixed-length sequences.
def create_sequences(values, window_size, step_size):
    """Split ``values`` into windows of ``window_size`` items taken every ``step_size`` items.

    Windows start at indices 0, step_size, 2 * step_size, ... for as long as a
    full window fits. If those regular windows do not reach the end of
    ``values``, one extra window covering the last ``window_size`` items is
    appended so the tail is not lost. When the last regular window already
    ends exactly at the end of ``values``, no extra window is added, so the
    final window is never emitted twice.

    Args:
        values: Sliceable sequence (e.g. a list) to cut into windows.
        window_size: Number of items in every returned window; must be > 0.
        step_size: Distance between the starts of consecutive windows; must be > 0.

    Returns:
        A list of slices of ``values``, each of length ``window_size``. The
        list is empty when ``values`` holds fewer than ``window_size`` items.

    Raises:
        ValueError: If ``window_size`` or ``step_size`` is not positive
            (a non-positive value would otherwise never terminate).
    """
    if window_size <= 0 or step_size <= 0:
        raise ValueError("window_size and step_size must be positive")

    total = len(values)
    if total < window_size:
        # Not even one full window fits.
        return []

    # Start index of the window that ends exactly at the end of `values`.
    last_start = total - window_size

    sequences = [
        values[start:start + window_size]
        for start in range(0, last_start + 1, step_size)
    ]

    # The regular stride reaches the end only when last_start is a multiple of
    # step_size; otherwise add the trailing window to cover the leftover items.
    if last_start % step_size != 0:
        sequences.append(values[-window_size:])

    return sequences

# Replace each user's flat movie-id list with the list of windows produced
# by create_sequences.
ratings_data["movie_ids"] = ratings_data["movie_ids"].apply(
    create_sequences, args=(sequence_length, step_size)
)

# Window the ratings with the same parameters so both columns line up
# window for window.
ratings_data["ratings"] = ratings_data["ratings"].apply(
    create_sequences, args=(sequence_length, step_size)
)

# Remove the timestamps column.
ratings_data = ratings_data.drop(columns="timestamps")

ratings_data

Unnamed: 0,user_id,movie_ids,ratings
0,user_1,"[[movie_3186, movie_1721, movie_1270, movie_10...","[[4.0, 4.0, 5.0, 5.0], [5.0, 5.0, 3.0, 5.0], [..."
1,user_10,"[[movie_597, movie_858, movie_743, movie_1210]...","[[4.0, 3.0, 3.0, 4.0], [3.0, 4.0, 4.0, 5.0], [..."
2,user_100,"[[movie_260, movie_1676, movie_1198, movie_541...","[[4.0, 3.0, 4.0, 3.0], [4.0, 3.0, 4.0, 3.0], [..."
3,user_1000,"[[movie_971, movie_260, movie_2990, movie_2973...","[[4.0, 5.0, 4.0, 3.0], [4.0, 3.0, 5.0, 5.0], [..."
4,user_1001,"[[movie_1198, movie_1617, movie_2885, movie_39...","[[4.0, 4.0, 4.0, 2.0], [4.0, 2.0, 2.0, 1.0], [..."
...,...,...,...
6035,user_995,"[[movie_1894, movie_260, movie_247, movie_433]...","[[2.0, 4.0, 5.0, 3.0], [5.0, 3.0, 3.0, 4.0], [..."
6036,user_996,"[[movie_1347, movie_2146, movie_1961, movie_27...","[[4.0, 3.0, 5.0, 3.0], [5.0, 3.0, 5.0, 5.0], [..."
6037,user_997,"[[movie_1196, movie_2082, movie_3247, movie_24...","[[4.0, 3.0, 3.0, 3.0], [3.0, 3.0, 2.0, 5.0], [..."
6038,user_998,"[[movie_2266, movie_1264, movie_1097, movie_16...","[[3.0, 4.0, 5.0, 5.0], [5.0, 5.0, 4.0, 3.0], [..."


In [21]:
# Reshape the data so that every sequence becomes its own DataFrame row.

# One row per movie-id sequence; the user_id is repeated for each of the
# user's sequences and the index is renumbered from 0.
ratings_data_movies = ratings_data[["user_id", "movie_ids"]].explode(
    "movie_ids", ignore_index=True
)

# One row per rating sequence, renumbered the same way.
ratings_data_rating = ratings_data[["ratings"]].explode("ratings", ignore_index=True)

# Place the two exploded frames side by side, attach the user attributes by
# user_id, flatten every sequence into a comma-separated string, drop the
# zip_code column and give the sequence columns their final names.
ratings_data_transformed = (
    pd.concat([ratings_data_movies, ratings_data_rating], axis=1)
    .join(users.set_index("user_id"), on="user_id")
    .assign(
        movie_ids=lambda frame: frame["movie_ids"].apply(",".join),
        ratings=lambda frame: frame["ratings"].apply(
            lambda seq: ",".join(str(value) for value in seq)
        ),
    )
    .drop(columns="zip_code")
    .rename(columns={"movie_ids": "sequence_movie_ids", "ratings": "sequence_ratings"})
)

ratings_data_transformed.head()

Unnamed: 0,user_id,sequence_movie_ids,sequence_ratings,sex,age_group,occupation
0,user_1,"movie_3186,movie_1721,movie_1270,movie_1022","4.0,4.0,5.0,5.0",F,group_1,occupation_10
1,user_1,"movie_1270,movie_1022,movie_2340,movie_1836","5.0,5.0,3.0,5.0",F,group_1,occupation_10
2,user_1,"movie_2340,movie_1836,movie_3408,movie_1207","3.0,5.0,4.0,4.0",F,group_1,occupation_10
3,user_1,"movie_3408,movie_1207,movie_2804,movie_260","4.0,4.0,5.0,4.0",F,group_1,occupation_10
4,user_1,"movie_2804,movie_260,movie_720,movie_1193","5.0,4.0,3.0,5.0",F,group_1,occupation_10


In [22]:

# Random split of the sequence rows into roughly 85% training / 15% test.
# The generator is seeded so that Restart & Run All reproduces the same
# partition; an unseeded draw gives a different train/test set on every run.
split_rng = np.random.default_rng(42)

# One uniform draw in [0, 1) per row; True marks a training row.
random_selection = split_rng.random(len(ratings_data_transformed.index)) <= 0.85

# Rows whose draw is at most 0.85 form the training set.
train_data = ratings_data_transformed[random_selection]

# The remaining rows form the test set.
test_data = ratings_data_transformed[~random_selection]

# # Save the training data as CSV: no index column, "|" as separator, no header row
# train_data.to_csv("train_data.csv", index=False, sep="|", header=False)

# # Save the test data as CSV: no index column, "|" as separator, no header row
# test_data.to_csv("test_data.csv", index=False, sep="|", header=False)
