In [23]:
import pandas as pd
import numpy as np

In [24]:
def data_preprocess(data_path:str)->pd.DataFrame:
    data_raw = pd.read_csv(data_path)   # 读取数据
    data_raw = data_raw.iloc[:, :3] # 去掉时间戳
    return data_raw

In [25]:
data_raw = data_preprocess("ratings.csv")
data_raw

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [26]:
# 将movieId出现最多的十部电影的movieId取出
movieId_top10 = data_raw["movieId"].value_counts().head(10).index.tolist()
movieId_top10


[356, 318, 296, 593, 2571, 260, 480, 110, 589, 527]

In [27]:
# 将看过这十部电影的用户取出
users = []
for movieId in movieId_top10:
    user = data_raw[data_raw['movieId']==movieId]['userId'].to_list()
    users.append(user)


In [28]:
# 将十组用户取交集
intersection_user = set(users[0])
for i in range(len(users)-1):
    intersection_user = intersection_user.intersection(set(users[i+1]))


In [29]:
# 得到10个用户和10部电影，并且都有评分
intersection_user = sorted(list(intersection_user)[:10])
intersection_user, movieId_top10

([17, 18, 28, 137, 274, 288, 304, 391, 414, 425],
 [356, 318, 296, 593, 2571, 260, 480, 110, 589, 527])

In [30]:
matrix = []
for user in intersection_user:
    array = []
    for movieId in movieId_top10:
        rating = data_raw[(data_raw['userId']==user) & (data_raw['movieId']==movieId)]['rating'].values[0]
        array.append(rating)
    matrix.append(array)

In [31]:
matrix

[[5.0, 5.0, 5.0, 4.5, 5.0, 5.0, 4.5, 4.5, 4.5, 4.5],
 [4.5, 5.0, 4.0, 4.5, 4.5, 4.0, 3.5, 4.5, 4.5, 4.5],
 [4.0, 3.5, 4.5, 2.5, 4.0, 4.0, 2.5, 3.5, 4.5, 3.0],
 [3.5, 3.5, 3.0, 4.0, 4.0, 4.0, 3.5, 4.0, 3.5, 3.5],
 [4.5, 4.5, 5.0, 4.0, 4.0, 3.0, 3.5, 4.5, 4.5, 4.0],
 [5.0, 5.0, 5.0, 5.0, 3.0, 5.0, 2.0, 5.0, 4.0, 5.0],
 [5.0, 4.0, 5.0, 4.0, 4.0, 5.0, 4.0, 5.0, 5.0, 5.0],
 [5.0, 5.0, 5.0, 3.0, 5.0, 5.0, 2.0, 5.0, 4.0, 5.0],
 [5.0, 5.0, 5.0, 4.0, 5.0, 5.0, 4.0, 5.0, 5.0, 4.0],
 [5.0, 5.0, 4.0, 4.0, 3.5, 3.0, 3.5, 4.0, 3.5, 4.0]]

In [32]:
# 构造一个dataframe
df = pd.DataFrame(matrix, index=intersection_user, columns=movieId_top10)
df

Unnamed: 0,356,318,296,593,2571,260,480,110,589,527
17,5.0,5.0,5.0,4.5,5.0,5.0,4.5,4.5,4.5,4.5
18,4.5,5.0,4.0,4.5,4.5,4.0,3.5,4.5,4.5,4.5
28,4.0,3.5,4.5,2.5,4.0,4.0,2.5,3.5,4.5,3.0
137,3.5,3.5,3.0,4.0,4.0,4.0,3.5,4.0,3.5,3.5
274,4.5,4.5,5.0,4.0,4.0,3.0,3.5,4.5,4.5,4.0
288,5.0,5.0,5.0,5.0,3.0,5.0,2.0,5.0,4.0,5.0
304,5.0,4.0,5.0,4.0,4.0,5.0,4.0,5.0,5.0,5.0
391,5.0,5.0,5.0,3.0,5.0,5.0,2.0,5.0,4.0,5.0
414,5.0,5.0,5.0,4.0,5.0,5.0,4.0,5.0,5.0,4.0
425,5.0,5.0,4.0,4.0,3.5,3.0,3.5,4.0,3.5,4.0


In [33]:
# 写入文件
df.to_csv("svd-raw-data.csv")